In [1]:
import pandas as pd
import numpy as np

# DATA CLEANING


## Identifying and handling Missing Data

#### - isna(), isnull() & notnull() Functions

In [5]:
# Toy frame for the missing-data demos: columns A and B each contain two
# missing entries (written as np.nan or None); column C is complete.
data_1 = dict(
    A=[1, 2, np.nan, 4, None],
    B=[None, 6, 7, 8, np.nan],
    C=[10, 20, 30, 40, 50],
)
df_1 = pd.DataFrame(data_1)
df_1

Unnamed: 0,A,B,C
0,1.0,,10
1,2.0,6.0,20
2,,7.0,30
3,4.0,8.0,40
4,,,50


In [89]:
# Count the missing entries per column: summing the boolean mask counts the Trues.
df_1.isnull().sum()

A    2
B    2
C    0
dtype: int64

In [7]:
# Boolean mask with the same shape as df_1: True marks a missing entry.
df_1.isna()

Unnamed: 0,A,B,C
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,False
4,True,True,False


In [8]:
# isnull() produces the same True-where-missing mask as isna().
df_1.isnull()

Unnamed: 0,A,B,C
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,False
4,True,True,False


In [9]:
# Inverse mask: True marks an entry that is present.
df_1.notnull()

Unnamed: 0,A,B,C
0,True,False,True
1,True,True,True
2,False,True,True
3,True,True,True
4,False,False,True


### Removing Rows with missing values

#### - dropna() Function

In [12]:
# Work on an independent copy of df_1 for the dropna() demos.
df_2 = df_1.copy(deep=True)
df_2

Unnamed: 0,A,B,C
0,1.0,,10
1,2.0,6.0,20
2,,7.0,30
3,4.0,8.0,40
4,,,50


In [13]:
# Drop every row that has a missing value in any column. The result is
# bound to a new name, so df_2 itself is left as it was.
df_2_dropped = df_2.dropna(axis=0, how='any')
df_2_dropped

Unnamed: 0,A,B,C
1,2.0,6.0,20
3,4.0,8.0,40


#### - dropna() with a subset of columns

In [15]:
# Show df_2 again as the starting point for the subset-based dropna().
df_2

Unnamed: 0,A,B,C
0,1.0,,10
1,2.0,6.0,20
2,,7.0,30
3,4.0,8.0,40
4,,,50


In [16]:
# Only columns A and B are checked for missing values when deciding which
# rows to drop. Note: this rebinds df_2_dropped from the previous demo.
df_2_dropped = df_2.dropna(how='any', subset=['A', 'B'])
df_2_dropped

Unnamed: 0,A,B,C
1,2.0,6.0,20
3,4.0,8.0,40


### Filling Missing values [fillna() function]

#### - Numeric-Data, Filling with constant value

In [19]:
 data_2 = {
     'A': [1, 2, None, 4, 5],
     'B': [None, 10, 20, None, 50]}
df_3=pd.DataFrame(data_2)
df_3

Unnamed: 0,A,B
0,1.0,
1,2.0,10.0
2,,20.0
3,4.0,
4,5.0,50.0


In [20]:
# Replace every missing entry, in every column, with the constant 100.
constant_filled_df = df_3.fillna(value=100)
constant_filled_df

Unnamed: 0,A,B
0,1.0,100.0
1,2.0,10.0
2,100.0,20.0
3,4.0,100.0
4,5.0,50.0


#### - Numeric-Data, Filling with Mean or Median

In [22]:
# Show df_3 again as the starting point for the mean/median fills.
df_3

Unnamed: 0,A,B
0,1.0,
1,2.0,10.0
2,,20.0
3,4.0,
4,5.0,50.0


In [91]:
# Fill the gap in column A with the mean of A's present values.
# assign() returns a new frame, so df_3 is untouched and column B keeps its gaps.
column_A_mean_filled = df_3.assign(A=df_3['A'].fillna(df_3['A'].mean()))
column_A_mean_filled

Unnamed: 0,A,B
0,1.0,
1,2.0,10.0
2,3.0,20.0
3,4.0,
4,5.0,50.0


In [95]:
# Fill the gaps in column B with the median of B's present values.
# Both the source column and the median are read from the copy created on the
# previous line; column A is left with its gap.
column_B_median_filled = df_3.copy()
column_B_median_filled['B'] = column_B_median_filled['B'].fillna(
    column_B_median_filled['B'].median()
)
column_B_median_filled

Unnamed: 0,A,B
0,1.0,20.0
1,2.0,10.0
2,,20.0
3,4.0,20.0
4,5.0,50.0


#### - Categorical-Data, Filling with a specific category or Mode

In [26]:
# Single categorical column with two missing entries, used by the
# category/mode fill demos.
data_4 = dict(Country=['USA', 'Canada', None, 'Germany', None, 'India'])
df_4 = pd.DataFrame(data_4)
df_4

Unnamed: 0,Country
0,USA
1,Canada
2,
3,Germany
4,
5,India


In [27]:
# Fill missing countries with the placeholder category 'Unknown'.
# fillna() returns a new frame, so df_4 is left as-is and no prior copy is needed.
filled_spec_cat = df_4.fillna('Unknown')
filled_spec_cat

Unnamed: 0,Country
0,USA
1,Canada
2,Unkonwn
3,Germany
4,Unkonwn
5,India


In [28]:
# Fill missing countries with the mode of the column.
# mode() returns a Series, so its first entry (label 0) is used as the fill value.
filled_mode = df_4.fillna(value=df_4['Country'].mode()[0])
filled_mode

Unnamed: 0,Country
0,USA
1,Canada
2,Canada
3,Germany
4,Canada
5,India


#### - Backfill or Forwardfill

In [30]:
 data_5 = {
 'A': [1, 2, None, 4, None, 6],
 'B': [3, None, 7, None, 11, 13]
 }
 df_5 = pd.DataFrame(data_5)
 df_5

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,
2,,7.0
3,4.0,
4,,11.0
5,6.0,13.0


In [31]:
# Forward fill: each gap takes the last present value above it in the same column.
# ffill() returns a new frame, so df_5 is not modified.
forward_filled = df_5.ffill(axis=0)
forward_filled

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,3.0
2,2.0,7.0
3,4.0,7.0
4,4.0,11.0
5,6.0,13.0


In [32]:
# Backward fill: each gap takes the next present value below it in the same column.
# bfill() returns a new frame, so df_5 is not modified.
backward_filled = df_5.bfill(axis=0)
backward_filled

Unnamed: 0,A,B
0,1.0,3.0
1,2.0,7.0
2,4.0,7.0
3,4.0,11.0
4,6.0,11.0
5,6.0,13.0


### Interpolation for Handling missing Data

#### - Linear interpolation (assumes the change between consecutive data points is constant)

In [35]:
# Input for the interpolation demo: A has an interior gap and a trailing
# gap, B has a single interior gap.
data_6 = pd.DataFrame(
    dict(
        A=[1, 2, np.nan, 4, np.nan],
        B=[5, np.nan, 7, 8, 9],
    )
)
data_6

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,
2,,7.0
3,4.0,8.0
4,,9.0


In [36]:
# Linear interpolation down each column: interior gaps are placed on the
# straight line between their neighbours. The trailing gap in A has no later
# neighbour and comes out as the last present value (4.0).
df_linear_interpolated = data_6.interpolate(method='linear', axis=0)
df_linear_interpolated

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,6.0
2,3.0,7.0
3,4.0,8.0
4,4.0,9.0


#### - Time-Based Interpolation

In [38]:
 time_index = pd.date_range(start='2023-01-01', periods=5, freq='D')
 time_series_data = pd.Series([10, np.nan, 30, np.nan, 50],
 index=time_index)
 print(time_series_data)

2023-01-01    10.0
2023-01-02     NaN
2023-01-03    30.0
2023-01-04     NaN
2023-01-05    50.0
Freq: D, dtype: float64


In [39]:
# Time-based interpolation: method='time' weights each gap by the actual time
# distance to its neighbours in the DatetimeIndex, whereas method='linear'
# treats rows as equally spaced and ignores the index. On this evenly spaced
# daily index both give the same values.
time_series_interpolated = time_series_data.interpolate(method='time')
time_series_interpolated

2023-01-01    10.0
2023-01-02    20.0
2023-01-03    30.0
2023-01-04    40.0
2023-01-05    50.0
Freq: D, dtype: float64

## Handling Duplicates

### Detecting Duplicated Rows

In [42]:
# Sample records in which the rows at positions 4 and 6 repeat the rows at
# positions 0 and 1 exactly.
data_7 = dict(
    ID=[1, 2, 3, 4, 1, 5, 2],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Eve', 'Bob'],
    Age=[25, 30, 22, 28, 25, 29, 30],
)
df_7 = pd.DataFrame(data_7)
df_7

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,22
3,4,David,28
4,1,Alice,25
5,5,Eve,29
6,2,Bob,30


In [43]:
# Flag repeated rows: the first occurrence of a row is False, every later
# identical row is True.
duplicates = df_7.duplicated(keep='first')
duplicates

0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool

### Removing Duplicated Rows

In [45]:
# Drop repeated rows using the default settings. One copy of each duplicated
# row is retained (rows 0 and 1 survive; rows 4 and 6 are removed), and the
# result is a new frame bound to dropped_df.
dropped_df=df_7.drop_duplicates()
dropped_df

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,22
3,4,David,28
5,5,Eve,29


In [46]:
# Drop repeated rows, keeping the first occurrence of each; comparison is
# across all columns (subset=None).
dropped_df_keep_f = df_7.drop_duplicates(subset=None, keep='first')
dropped_df_keep_f

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,22
3,4,David,28
5,5,Eve,29


In [47]:
# Drop repeated rows, keeping the last occurrence of each; comparison is
# across all columns (subset=None).
dropped_df_keep_l = df_7.drop_duplicates(subset=None, keep='last')
dropped_df_keep_l

Unnamed: 0,ID,Name,Age
2,3,Charlie,22
3,4,David,28
4,1,Alice,25
5,5,Eve,29
6,2,Bob,30
