# Masking and Data Cleaning

In [8]:
import numpy as np
arr=(np.array([-1,5,np.nan,9,-2]))

1. Masking with boolean array

In [9]:
print(np.isnan(arr))

print(arr[np.isnan(arr)])
print(arr[~np.isnan(arr)])
arr=arr[~np.isnan(arr)]

[False False  True False False]
[nan]
[-1.  5.  9. -2.]


2. Replacing outliers with specific value


In [10]:
arr[arr>5]=5
print(arr)
arr[arr<5]=3
print(arr)

[-1.  5.  5. -2.]
[3. 5. 5. 3.]


3. Removing the duplicates

In [11]:
print(np.unique(arr))
print(arr)

[3. 5.]
[3. 5. 5. 3.]


4. Categorical data to one_hot_encoded array

In [12]:
import numpy as np

# Categorical data
categories = np.array(['Red', 'Blue', 'Green', 'Blue', 'Red'])

# Get the unique categories
unique_categories = np.unique(categories)

# Create a dictionary that maps each category to a unique index
category_to_index = {category: idx for idx, category in enumerate(unique_categories)}

# Initialize a one-hot encoded matrix with zeros
one_hot_encoded = np.zeros((categories.size, unique_categories.size))

# Set the appropriate positions to 1
for i, category in enumerate(categories):
    one_hot_encoded[i, category_to_index[category]] = 1

print(one_hot_encoded)


#OR we can do this by eye also
print("or".center(30,'-'))
one_hot_encoded_eye=np.eye(unique_categories.size)[range(unique_categories.size)]

print(one_hot_encoded_eye)


[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]
--------------or--------------
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


5. Data Normalization

In [13]:
data=np.array([2,3,6,7,3,4,6,7,9])
print(data)

Mean=np.mean(data)
std_dev=np.std(data)

z_score=(arr-Mean)/std_dev
print(z_score)

min_max_scaling=(data-min(data))/(max(data)-min(data))
print(min_max_scaling)

max_absolute=data/max(np.abs(data))
print(max_absolute)

iqr=np.percentile(data,25)-np.percentile(data,75)
robust_scaling=(data-np.median(data))/iqr
print(robust_scaling)

[2 3 6 7 3 4 6 7 9]
[-1.01015254 -0.10101525 -0.10101525 -1.01015254]
[0.         0.14285714 0.57142857 0.71428571 0.14285714 0.28571429
 0.57142857 0.71428571 1.        ]
[0.22222222 0.33333333 0.66666667 0.77777778 0.33333333 0.44444444
 0.66666667 0.77777778 1.        ]
[ 1.    0.75 -0.   -0.25  0.75  0.5  -0.   -0.25 -0.75]
