In [1]:
import numpy as np

# Handling Missing Data

In [2]:
# Toy 4x3 table of floats; only the last row contains a missing entry (NaN).
data_with_missing_values = np.array(
    [
        [1.0, 2.0, 3.0],
        [4.0, 2.0, 6.0],
        [22.0, 8.0, 9.0],
        [3.3, 8.0, np.nan],
    ]
)
data_with_missing_values

array([[ 1. ,  2. ,  3. ],
       [ 4. ,  2. ,  6. ],
       [22. ,  8. ,  9. ],
       [ 3.3,  8. ,  nan]])

# Ignoring Tuple
In data science, a *tuple* is a row (record) of the table.  
Ignoring a tuple means removing every row that contains a missing value.

In [3]:
# Boolean flag per row (axis=1): True when the row holds at least one NaN.
rows_with_missing = np.isnan(data_with_missing_values).any(axis=1)
rows_with_missing

array([False, False, False,  True])

In [11]:
# Turn the boolean row mask into the integer positions of the flagged rows.
missing_row_indices = np.flatnonzero(rows_with_missing)
missing_row_indices

array([3])

In [15]:
# Drop the flagged rows (axis=0); np.delete returns a new array, the input is unchanged.
np.delete(arr=data_with_missing_values, obj=missing_row_indices, axis=0)

array([[ 1.,  2.,  3.],
       [ 4.,  2.,  6.],
       [22.,  8.,  9.]])

- The 4th row is removed because it contains a missing (NaN) value.

## Using Global Constant

In [13]:
# Dataset for the fill strategies: every row has at least one NaN.
data_with_missing_values = np.array(
    [
        [1.0, 2.0, np.nan],
        [4.0, np.nan, 6.0],
        [np.nan, 8.0, 9.0],
        [np.nan, 8.0, np.nan],
    ],
    dtype=float,
)
data_with_missing_values

array([[ 1.,  2., nan],
       [ 4., nan,  6.],
       [nan,  8.,  9.],
       [nan,  8., nan]])

In [14]:
# Global-constant strategy: every NaN is replaced by the sentinel value -1.
# The result is a new array; the original data keeps its NaNs.
np.nan_to_num(data_with_missing_values, nan=-1.0)

array([[ 1.,  2., -1.],
       [ 4., -1.,  6.],
       [-1.,  8.,  9.],
       [-1.,  8., -1.]])

## Imputation
Imputation is the process of filling in missing values with statistical estimates (such as the column mean or median) computed from the observed data.

In [15]:
# Re-create the NaN-laden dataset so the imputation section starts from known input.
data_with_missing_values = np.array(
    [
        [1.0, 2.0, np.nan],
        [4.0, np.nan, 6.0],
        [np.nan, 8.0, 9.0],
        [np.nan, 8.0, np.nan],
    ],
    dtype=float,
)

### Imputation By Mean

In [18]:
# Mean of each column (axis=0), computed with np.nanmean so NaN entries are skipped.
column_means = np.nanmean(data_with_missing_values, axis=0)
column_means

array([2.5, 6. , 7.5])

In [19]:
# Boolean mask with the same shape as the data: True where a value is NaN.
nan_indices = np.isnan(data_with_missing_values)
nan_indices

array([[False, False,  True],
       [False,  True, False],
       [ True, False, False],
       [ True, False,  True]])

In [28]:
# Column index of every NaN cell, listed in row-major order.
np.nonzero(nan_indices)[1]


array([2, 1, 0, 0, 2])

In [29]:
# Look up, for every NaN cell, the mean of the column that cell belongs to.
column_means[np.nonzero(nan_indices)[1]]

array([7.5, 6. , 2.5, 2.5, 7.5])

In [30]:
# Mean imputation: where the mask is True take the column mean (broadcast across
# rows), otherwise keep the original value. The source array is not modified.
data_withou_missing_value = np.where(nan_indices, column_means, data_with_missing_values)

In [31]:
# Display the mean-imputed array.
data_withou_missing_value

array([[1. , 2. , 7.5],
       [4. , 6. , 6. ],
       [2.5, 8. , 9. ],
       [2.5, 8. , 7.5]])

### Imputation By Median

In [32]:
# Median of each column (axis=0), computed with np.nanmedian so NaN entries are skipped.
column_median = np.nanmedian(data_with_missing_values, axis=0)
column_median

array([2.5, 8. , 7.5])

In [33]:
# Boolean mask with the same shape as the data: True where a value is NaN.
nan_indices = np.isnan(data_with_missing_values)
nan_indices

array([[False, False,  True],
       [False,  True, False],
       [ True, False, False],
       [ True, False,  True]])

In [34]:
# Median imputation: where the mask is True take the column median (broadcast
# across rows), otherwise keep the original value. The source array is not modified.
data_withou_missing_value = np.where(nan_indices, column_median, data_with_missing_values)
data_withou_missing_value

array([[1. , 2. , 7.5],
       [4. , 8. , 6. ],
       [2.5, 8. , 9. ],
       [2.5, 8. , 7.5]])

## Handling Noisy Data

- Binning
- Regression
- Clustering

In [35]:
# Nine integer observations used to demonstrate smoothing by binning.
raw_data = np.array(
    [
        4, 8, 15,
        21, 21, 24,
        25, 28, 34,
    ]
)
raw_data

array([ 4,  8, 15, 21, 21, 24, 25, 28, 34])

In [36]:
# Binning needs ordered values: sort a copy so raw_data itself is left untouched.
sorted_data = raw_data.copy()
sorted_data.sort()
sorted_data

array([ 4,  8, 15, 21, 21, 24, 25, 28, 34])

In [37]:
# Parameters for equal-frequency binning of the sorted values.
len_arr = len(sorted_data)       # total number of values to distribute
bin_size = 3                     # number of values placed in each bin
bin_start = 0                    # index of the first value of the current bin
# Exclusive end index of the first bin. The previous `len_arr // bin_size` is the
# number of bins, which equals the first bin's end index only when
# len_arr == bin_size ** 2.
bin_end = bin_start + bin_size

In [38]:
# Split the sorted values into consecutive bins of `bin_size` elements each.
# The slice bounds come only from the loop variable, so no state is mutated and
# the cell can be re-run safely. The number of bins follows from the data length,
# so no trailing values are dropped; if the length is not a multiple of
# `bin_size`, the last bin simply holds the remaining values.
bin_list = [
    sorted_data[start:start + bin_size]
    for start in range(0, len(sorted_data), bin_size)
]

In [39]:
# Display the list of bins (one 1-D array per bin).
bin_list

[array([ 4,  8, 15]), array([21, 21, 24]), array([25, 28, 34])]

In [40]:
# Stack the bins into a 2-D array: one row per bin.
bin_arr = np.asarray(bin_list)
bin_arr

array([[ 4,  8, 15],
       [21, 21, 24],
       [25, 28, 34]])

**reshape(-1, 1)**
- -1: This is a special value in NumPy's reshape method. It tells NumPy to infer the size of this dimension based on the size of the other dimensions and the total number of elements in the array. Essentially, NumPy will figure out how many rows are needed automatically to accommodate all the elements while ensuring the number of columns is 1.

- 1: This indicates that we want to reshape the array into 1 column.



In [45]:
# Mean of each bin (row). keepdims=True keeps the reduced axis, so the result is
# a column vector with one row per bin.
row_means = bin_arr.mean(axis=1, keepdims=True)
row_means

array([[ 9.],
       [22.],
       [29.]])

In [46]:
# Smoothing by bin means: repeat each bin's mean across the columns so the
# result has the same shape as bin_arr.
array_with_row_means = np.repeat(row_means, bin_arr.shape[1], axis=1)

row_means: After `reshape(-1, 1)` this is a 2-D column vector of shape (3, 1) holding the mean of each row (bin). For example, if row_means is [[9.0], [22.0], [29.0]], every value in a row of the original array should be replaced by that row's mean.

bin_arr.shape[1]: This returns the number of columns in the binned array. It is how many times each element of row_means is repeated across the columns of the new array. Here bin_arr has 3 columns, so bin_arr.shape[1] is 3.

np.tile(row_means, bin_arr.shape[1]): This repeats the row_means column horizontally (along the columns) so that the result has the same shape as bin_arr.

In [47]:
# Display the smoothed array (every value replaced by its bin mean).
array_with_row_means

array([[ 9.,  9.,  9.],
       [22., 22., 22.],
       [29., 29., 29.]])

## Normalization

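Min-Max Normalization is a technique used to scale the values of a feature to a fixed range, usually [0, 1]. Any other target range $[a, b]$ is obtained as $a + (b - a)\,x_{\text{norm}}$; the code below uses $[-1, 1]$, i.e. $2\,x_{\text{norm}} - 1$. The formula for Min-Max normalization is: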

$$
x_{\text{norm}} = \frac{x - x_{\text{min}}}{x_{\text{max}} - x_{\text{min}}}
$$

Where:
$$
\begin{array}{l}
x \text{ is the original value,} \\
x_{\text{min}} \text{ is the minimum value in the feature,} \\
x_{\text{max}} \text{ is the maximum value in the feature,} \\
x_{\text{norm}} \text{ is the normalized value.}
\end{array}
$$ 


In [48]:
# Un-normalized sample feature: the multiples of 3 from 0 up to (but excluding) 1000.
un_norm_arr = np.arange(start=0, stop=1000, step=3)
un_norm_arr

array([  0,   3,   6,   9,  12,  15,  18,  21,  24,  27,  30,  33,  36,
        39,  42,  45,  48,  51,  54,  57,  60,  63,  66,  69,  72,  75,
        78,  81,  84,  87,  90,  93,  96,  99, 102, 105, 108, 111, 114,
       117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153,
       156, 159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192,
       195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225, 228, 231,
       234, 237, 240, 243, 246, 249, 252, 255, 258, 261, 264, 267, 270,
       273, 276, 279, 282, 285, 288, 291, 294, 297, 300, 303, 306, 309,
       312, 315, 318, 321, 324, 327, 330, 333, 336, 339, 342, 345, 348,
       351, 354, 357, 360, 363, 366, 369, 372, 375, 378, 381, 384, 387,
       390, 393, 396, 399, 402, 405, 408, 411, 414, 417, 420, 423, 426,
       429, 432, 435, 438, 441, 444, 447, 450, 453, 456, 459, 462, 465,
       468, 471, 474, 477, 480, 483, 486, 489, 492, 495, 498, 501, 504,
       507, 510, 513, 516, 519, 522, 525, 528, 531, 534, 537, 54

In [49]:
# Feature extremes needed to convert the data to the range [-1, 1].
x_min, x_max = np.min(un_norm_arr), np.max(un_norm_arr)

In [50]:
# Min-max scaling to [-1, 1]: x_min maps to -1 and x_max maps to +1.
value_range = x_max - x_min
normalized_arr = 2 * (un_norm_arr - x_min) / value_range - 1

In [51]:
# Display the values rescaled to [-1, 1].
normalized_arr

array([-1.        , -0.99399399, -0.98798799, -0.98198198, -0.97597598,
       -0.96996997, -0.96396396, -0.95795796, -0.95195195, -0.94594595,
       -0.93993994, -0.93393393, -0.92792793, -0.92192192, -0.91591592,
       -0.90990991, -0.9039039 , -0.8978979 , -0.89189189, -0.88588589,
       -0.87987988, -0.87387387, -0.86786787, -0.86186186, -0.85585586,
       -0.84984985, -0.84384384, -0.83783784, -0.83183183, -0.82582583,
       -0.81981982, -0.81381381, -0.80780781, -0.8018018 , -0.7957958 ,
       -0.78978979, -0.78378378, -0.77777778, -0.77177177, -0.76576577,
       -0.75975976, -0.75375375, -0.74774775, -0.74174174, -0.73573574,
       -0.72972973, -0.72372372, -0.71771772, -0.71171171, -0.70570571,
       -0.6996997 , -0.69369369, -0.68768769, -0.68168168, -0.67567568,
       -0.66966967, -0.66366366, -0.65765766, -0.65165165, -0.64564565,
       -0.63963964, -0.63363363, -0.62762763, -0.62162162, -0.61561562,
       -0.60960961, -0.6036036 , -0.5975976 , -0.59159159, -0.58