In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Toy dataset: every column carries exactly one missing entry and the last
# two rows are identical, so later cells have something to impute and dedupe.
data = pd.DataFrame(dict(
    age=[25, 30, np.nan, 46, 28, 40, 40],
    income=[50000, np.nan, 60000, 65000, 62000, 55000, 55000],
    city=['Delhi', 'Mumbai', 'Delhi', np.nan, 'Delhi', 'Mumbai', 'Mumbai'],
))
print(data)

    age   income    city
0  25.0  50000.0   Delhi
1  30.0      NaN  Mumbai
2   NaN  60000.0   Delhi
3  46.0  65000.0     NaN
4  28.0  62000.0   Delhi
5  40.0  55000.0  Mumbai
6  40.0  55000.0  Mumbai


In [3]:
# Count the NaN entries in each column before any imputation is applied.
missing_per_column = data.isna().sum()
print(f"Missing Values : \n{missing_per_column}")

Missing Values : 
age       1
income    1
city      1
dtype: int64


In [4]:
# filling values
# One fill value per column: mean for 'age', median for 'income', and the
# most frequent label (first mode) for the categorical 'city' column.
data = data.fillna({
    'age': data['age'].mean(),
    'income': data['income'].median(),
    'city': data['city'].mode()[0],
})

In [5]:
# Display the frame after the missing values have been filled in.
data

Unnamed: 0,age,income,city
0,25.0,50000.0,Delhi
1,30.0,57500.0,Mumbai
2,34.833333,60000.0,Delhi
3,46.0,65000.0,Delhi
4,28.0,62000.0,Delhi
5,40.0,55000.0,Mumbai
6,40.0,55000.0,Mumbai


In [6]:
# Two ways of discarding missing data: drop every row that contains a NaN,
# or drop every column that contains a NaN.
# Note: the imputation cell earlier already filled each NaN, so at this point
# both results hold the same rows and columns as `data`.
cleaned = data.dropna(axis=0)
drop_col = data.dropna(axis='columns')

In [7]:
# Derive the annual figure from 'income' and store it as a new column.
# `assign` is used instead of `DataFrame.insert` because `insert` raises
# ValueError when the column already exists, which made this cell fail on
# any re-run. `data` has exactly three columns (age, income, city) here, so
# the new column still lands at position 3, as with the former insert(3, ...).
# NOTE(review): multiplying by 12 presumes 'income' is a monthly amount - confirm.
# NOTE(review): 'Yaerly_income' is misspelled (intended 'Yearly_income'); the
# name is left unchanged so saved outputs and any other references still match.
data = data.assign(Yaerly_income=data['income'] * 12)

In [8]:
# Display the frame with the derived 'Yaerly_income' column added.
data

Unnamed: 0,age,income,city,Yaerly_income
0,25.0,50000.0,Delhi,600000.0
1,30.0,57500.0,Mumbai,690000.0
2,34.833333,60000.0,Delhi,720000.0
3,46.0,65000.0,Delhi,780000.0
4,28.0,62000.0,Delhi,744000.0
5,40.0,55000.0,Mumbai,660000.0
6,40.0,55000.0,Mumbai,660000.0


In [9]:
# removing duplicates
# Show, row by row, whether the row repeats an earlier one, then keep only
# the first occurrence of each distinct row.
duplicate_flags = data.duplicated(keep='first')
print(duplicate_flags)
data = data.drop_duplicates(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool


In [10]:
# handling outliers
# Four salaries clustered around 50,000 plus one extreme value (1,000,000).
df = pd.Series([50000, 52000, 51000, 49000, 1000000], name='salary').to_frame()

In [11]:
# First and third quartiles of 'salary' and the spread between them.
quartiles = df['salary'].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1  # Inter Quartile Range
print(Q1, Q3, IQR, sep='\n')

50000.0
52000.0
2000.0


In [12]:
# Outlier fences: 1.5 * IQR below the first quartile and above the third.
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width
print(lower_bound, upper_bound, sep='\n')

47000.0
55000.0


In [13]:
# Keep the rows whose salary lies within the IQR fences.
# A value sitting exactly on a fence is not beyond it, so the comparison is
# inclusive on both ends (Series.between includes both endpoints by default).
# The previous strict > / < comparison silently discarded such boundary values.
filtered_df = df[df['salary'].between(lower_bound, upper_bound)]
print(f"Filtered data : \n {filtered_df}")

Filtered data : 
    salary
0   50000
1   52000
2   51000
3   49000


In [14]:
# handling outliers using z score
# Rebuild the single-column salary frame so this section starts from the raw values.
df = pd.DataFrame([50000, 52000, 51000, 49000, 1000000], columns=['salary'])
df

Unnamed: 0,salary
0,50000
1,52000
2,51000
3,49000
4,1000000


In [15]:
# Standardise 'salary' into z-scores and store them next to the raw values.
# `stats` is scipy.stats; it is imported with the other dependencies in the
# first cell so that every dependency of the notebook is declared in one place.
df['z_score'] = stats.zscore(df['salary'])
df

Unnamed: 0,salary,z_score
0,50000,-0.501315
1,52000,-0.496049
2,51000,-0.498682
3,49000,-0.503948
4,1000000,1.999993


In [16]:
# Flag rows whose absolute z-score exceeds 1.5.
# The conventional cut-off is 3, but with scipy's default ddof=0 the absolute
# z-score is bounded by sqrt(n - 1), i.e. 2 for these five samples, so a
# threshold of 3 could never flag anything in this tiny example.
exceeds_threshold = df['z_score'].abs().gt(1.5)
outliers = df.loc[exceeds_threshold]
outliers

Unnamed: 0,salary,z_score
4,1000000,1.999993


In [19]:
# Keep the rows with |z| <= 1.5 (the complement of the outlier selection)
# and drop the helper 'z_score' column so only the original data remains.
within_threshold = df['z_score'].abs().le(1.5)
no_outliers_df = df[within_threshold].drop('z_score', axis=1)
no_outliers_df

Unnamed: 0,salary
0,50000
1,52000
2,51000
3,49000
