<a href="https://colab.research.google.com/github/Sanket80/GFG-DS-ML/blob/main/Data_Cleaning_And_Analysis_With_Numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

## Dealing with Null Values

### 1. None to np.nan

In [3]:
# Mixing ints with None rules out a numeric dtype, so NumPy stores this as dtype=object.
arr = np.array([1, 2, None, 4, 5, None])

# `==` (not `is`) is deliberate: it compares element by element and yields a boolean mask.
none_mask = arr == None  # noqa: E711
arr[none_mask] = np.nan
arr

array([1, 2, nan, 4, 5, nan], dtype=object)

### 2. Checking the Null Values

In [6]:
# Float array with two missing entries encoded as NaN.
arr = np.array([1, 2, np.nan, 4, 5, np.nan])

# np.isnan flags the NaN positions; indexing with that mask returns only those entries.
nan_mask = np.isnan(arr)
arr[nan_mask]

array([nan, nan])

### 3. Removing the Null Values

In [7]:
# Float array with two missing entries encoded as NaN.
arr = np.array([1, 2, np.nan, 4, 5, np.nan])

# Invert the NaN mask to select only the valid entries.
# Boolean indexing returns a new array; `arr` itself still contains the NaNs.
is_valid = ~np.isnan(arr)
arr[is_valid]

array([1., 2., 4., 5.])

### 4. Filling Null with Fixed Values

In [10]:
# Float array with two missing entries encoded as NaN.
arr = np.array([1, 2, np.nan, 4, 5, np.nan])

# Overwrite every NaN slot in place with the constant 0.
nan_mask = np.isnan(arr)
arr[nan_mask] = 0
print(arr)

[1. 2. 0. 4. 5. 0.]


### 5. Filling the null with Average

In [12]:
# Float array with two missing entries encoded as NaN.
arr = np.array([1, 2, np.nan, 4, 5, np.nan])

# np.nanmean averages only the non-NaN entries; that value replaces each NaN.
fill_value = np.nanmean(arr)
arr[np.isnan(arr)] = fill_value
print(arr)

[1. 2. 3. 4. 5. 3.]


## Filtering and Aggregating data With Numpy

### 1. Filtering

#### 1.1) Filter Elements Greater than 5

In [22]:
# 5x5 grid of random integers from 1 to 10 (randint's upper bound, 11, is exclusive).
arr = np.random.randint(low=1, high=11, size=(5, 5))
arr

array([[ 8,  7,  2,  8,  9],
       [ 7,  4,  4,  7,  4],
       [ 7,  6,  2, 10,  7],
       [ 5,  9, 10,  7,  4],
       [ 9, 10,  1,  5,  6]])

In [24]:
# Boolean mask of the entries above 5; indexing with it returns them as a flat 1-D array.
above_five = arr > 5
arr[above_five]

array([ 8,  7,  8,  9,  7,  7,  7,  6, 10,  7,  9, 10,  7,  9, 10,  6])

#### 1.2) Filter Elements that are Greater than 5 and Even

In [29]:
# Combine two element-wise conditions with `&` (element-wise AND on boolean masks).
above_five = arr > 5
is_even = arr % 2 == 0
arr[above_five & is_even]

array([ 8,  8,  6, 10, 10, 10,  6])

### 2. Aggregate Functions

In [32]:
arr = np.array([1, 2, 3, 4, 5])

# Each aggregate reduces the whole array to one scalar.
# std/var are called with their defaults here (no ddof argument).
aggregates = (
    ('Sum: ', arr.sum()),
    ('Mean: ', arr.mean()),
    ('Max: ', arr.max()),
    ('Min: ', arr.min()),
    ('Std: ', arr.std()),
    ('Var: ', arr.var()),
)
for label, value in aggregates:
    print(label, value)

Sum:  15
Mean:  3.0
Max:  5
Min:  1
Std:  1.4142135623730951
Var:  2.0


## Descriptive Statistics with NumPy

In [38]:
# 100 random integers from 1 to 100 (randint's upper bound, 101, is exclusive).
arr = np.random.randint(1, 101, size=100)

# Central tendency and spread.
mean = np.mean(arr)
median = np.median(arr)
# bincount tallies how often each non-negative integer occurs; argmax picks the
# most frequent one (the smallest such value when several are tied).
mode = np.bincount(arr).argmax()
# Named max_value/min_value so the built-in max()/min() are not overwritten
# for every cell that runs afterwards in the same kernel.
max_value = np.max(arr)
min_value = np.min(arr)
std = np.std(arr)
var = np.var(arr)

# Quartiles and interquartile range.
q1 = np.percentile(arr, 25)
q2 = np.percentile(arr, 50)
q3 = np.percentile(arr, 75)

iqr = q3 - q1

# Standardized third and fourth central moments.
# This kurtosis is the plain (non-excess) form: no 3 is subtracted.
skewness = np.mean((arr - mean)**3) / std**3
kurtosis = np.mean((arr - mean)**4) / std**4

print('Mean: ' , mean)
print('Median: ' , median)
print('Mode: ' , mode)
print('Max: ' , max_value)
print('Min: ' , min_value)
print('Std: ' , std)
print('Var: ' , var)
print('-'*20)
print('Q1: ' , q1)
print('Q2: ' , q2)
print('Q3: ' , q3)
print('IQR: ' , iqr)
print('-'*20)
print('Skewness: ' , skewness)
print('Kurtosis: ' , kurtosis)

Mean:  52.03
Median:  53.5
Mode:  18
Max:  100
Min:  1
Std:  29.006363095017612
Var:  841.3690999999998
--------------------
Q1:  23.0
Q2:  53.5
Q3:  75.25
IQR:  52.25
--------------------
Skewness:  -0.0335796480813424
Kurtosis:  1.7332244199119022


## Sorting and Searching

### 1. Searching

In [46]:
arr = np.array([1, 2, 3, 4, 5, 7, 12, 13, 8, 4, 15, 4])
target = 4

# Linear search: compare every element and collect all matching positions.
index = np.where(arr == target)[0]
print(index)

# Binary search: np.searchsorted requires ascending input, and `arr` is not
# sorted, so search a sorted copy. The result is the leftmost position of
# `target` in `sorted_arr` (not in the original `arr`).
sorted_arr = np.sort(arr)
index = np.searchsorted(sorted_arr, target)
print(index)

[ 3  9 11]
3


### 2. Sorting

In [49]:
arr = np.array([1, 2, 3, 4, 5, 7, 12, 13, 8, 4, 15, 4])

# np.sort returns a sorted copy and leaves `arr` untouched.
# The same data is sorted with each algorithm in turn: quick sort
# (np.sort's default kind), merge sort, then heap sort.
for kind in ('quicksort', 'mergesort', 'heapsort'):
    sorted_arr = np.sort(arr, kind=kind)
    print(sorted_arr)

[ 1  2  3  4  4  4  5  7  8 12 13 15]
[ 1  2  3  4  4  4  5  7  8 12 13 15]
[ 1  2  3  4  4  4  5  7  8 12 13 15]


## Data Cleaning With NumPy

### 1. Masking with Boolean Arrays

In [50]:
arr = np.array([1, 2, np.nan, 4, 5, np.nan])

# One boolean mask, used three ways: shown as-is, to select the NaNs,
# and inverted to select everything else.
nan_mask = np.isnan(arr)
print(nan_mask)
print(arr[nan_mask])
print(arr[~nan_mask])

[False False  True False False  True]
[nan nan]
[1. 2. 4. 5.]


### 2. Replacing Outliers with specific value

In [51]:
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Cap the values to the range [3, 7]: anything above 7 becomes 7,
# anything below 3 becomes 3, everything in between is kept.
arr = np.clip(arr, 3, 7)

print(arr)

[3 3 3 4 5 6 7 7 7 7]


### 3. Removing the Duplicates

In [53]:
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3])

# np.unique returns the distinct values as a sorted array.
print(np.unique(arr))

# A Python set also drops duplicates, but makes no promise about ordering.
distinct = set(arr)
print(distinct)
print(list(distinct))

[ 1  2  3  4  5  6  7  8  9 10]
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


### 4. Categorical Data to One Hot Encoding

In [60]:
# One-hot encoding maps each category to a row containing a single 1:
# 'A', 'B', 'C', 'D', 'E'
# 'A' = [1, 0, 0, 0, 0]
# 'B' = [0, 1, 0, 0, 0]
# 'C' = [0, 0, 1, 0, 0]
# 'D' = [0, 0, 0, 1, 0]
# 'E' = [0, 0, 0, 0, 1]

categorical_arr = np.array(['A', 'B', 'C', 'D', 'E'])

# Derive the integer code of every label from the data instead of typing it by
# hand: np.unique returns the sorted distinct labels, and return_inverse gives,
# for each element, the index of its label within that sorted list.
categories, numerical_arr = np.unique(categorical_arr, return_inverse=True)

# Row i of the identity matrix is the one-hot vector for code i, so indexing
# the identity matrix with the codes picks one row per input element.
one_hot_arr = np.eye(len(categories))[numerical_arr]
print(one_hot_arr)


[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


### 5. Data Normalization

In [62]:
# Example features and their ranges:
#   salary   (10000-1000000)
#   age      (18-60)
#   work_exp (0-30)
# Normalization is used to scale the data between 0 and 1 for faster computation.

arr = np.array([1, 2, 3, 4, 5])

# Min-max scaling: (x - min) / (max - min)
lowest = arr.min()
highest = arr.max()
arr = (arr - lowest) / (highest - lowest)
print(arr)

# Worked values for min = 1, max = 5:
# 0/4 = 0
# 1/4 = 0.25
# 2/4 = 0.5
# 3/4 = 0.75
# 4/4 = 1

[0.   0.25 0.5  0.75 1.  ]
