## Load your Dataset here

In [4]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic dataset: four columns (A-D), each holding
# 100 draws from the standard normal distribution.
np.random.seed(42)
data = {name: np.random.randn(100) for name in ('A', 'B', 'C', 'D')}

# Introduce missing values: 10 distinct random rows in column A,
# then 5 distinct random rows in column B.
for column, n_missing in (('A', 10), ('B', 5)):
    data[column][np.random.choice(100, n_missing, replace=False)] = np.nan

# Introduce 3 outliers (+15 or -15) at distinct random rows of column C.
# The values are drawn before the row positions so that the random stream
# is consumed in the same order as a single `arr[rows] = values` statement,
# where Python evaluates the right-hand side first.
outlier_values = np.random.choice([15, -15], 3)
outlier_rows = np.random.choice(100, 3, replace=False)
data['C'][outlier_rows] = outlier_values

df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,A,B,C,D
0,0.496714,-1.415371,0.357787,-0.828995
1,-0.138264,-0.420645,0.560785,-0.560181
2,0.647689,-0.342715,1.083051,0.747294
3,1.52303,-0.802277,1.053802,0.61037
4,-0.234153,-0.161286,-1.377669,-0.020902
5,-0.234137,0.404051,-0.937825,0.117327
6,1.579213,1.886186,15.0,1.277665
7,,0.174578,0.513786,-0.591571
8,-0.469474,0.25755,0.515048,0.547097
9,0.54256,-0.074446,3.852731,-0.202193


## Imputation or Deletion

In [8]:
# Impute missing values in columns A and B with each column's own mean.
#
# The filled frame is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`: that form fills a temporary Series
# obtained by chained indexing, which emits a FutureWarning on pandas 2.x
# and leaves `df` untouched once Copy-on-Write is the default (pandas 3.0).
column_means = df[['A', 'B']].mean()
# Passing a Series fills per column by label; columns that are not in
# `column_means` (C and D) are left as they are.
df = df.fillna(column_means)

print("Data after imputation:")
df.head(10)

Data after imputation:


Unnamed: 0,A,B,C,D
0,0.496714,-1.415371,0.357787,-0.828995
1,-0.138264,-0.420645,0.560785,-0.560181
2,0.647689,-0.342715,1.083051,0.747294
3,1.52303,-0.802277,1.053802,0.61037
4,-0.234153,-0.161286,-1.377669,-0.020902
5,-0.234137,0.404051,-0.937825,0.117327
6,1.579213,1.886186,15.0,1.277665
7,-0.081366,0.174578,0.513786,-0.591571
8,-0.469474,0.25755,0.515048,0.547097
9,0.54256,-0.074446,3.852731,-0.202193


In [9]:
# Deletion: drop every row that still contains at least one missing value.
# The result is assigned back to `df` instead of using `inplace=True`, so the
# step reads as an explicit "new frame from old frame" transformation.
# NOTE: if the imputation cell above has already filled every NaN, no rows
# are removed here.
df = df.dropna()
print("Data after deletion:")
df.head(10)

Data after deletion:


Unnamed: 0,A,B,C,D
0,0.496714,-1.415371,0.357787,-0.828995
1,-0.138264,-0.420645,0.560785,-0.560181
2,0.647689,-0.342715,1.083051,0.747294
3,1.52303,-0.802277,1.053802,0.61037
4,-0.234153,-0.161286,-1.377669,-0.020902
5,-0.234137,0.404051,-0.937825,0.117327
6,1.579213,1.886186,15.0,1.277665
7,-0.081366,0.174578,0.513786,-0.591571
8,-0.469474,0.25755,0.515048,0.547097
9,0.54256,-0.074446,3.852731,-0.202193


## Handle Outliers (Z-Score & Interquartile Range)

In [6]:
from scipy.stats import zscore

# Absolute z-score of every value; axis=0 (the default) standardises each
# column against its own mean and standard deviation
z_scores = np.absolute(zscore(df, axis=0))

# Flag a row as soon as any one of its values lies more than 3 standard
# deviations away from the column mean
outliers = (z_scores > 3).any(axis=1)

# Keep only the rows that were not flagged
df_cleaned = df.loc[~outliers]

print("Data after removing outliers:")
print(df_cleaned.head(10))

Data after removing outliers:
           A         B         C         D
0   0.496714 -1.415371  0.357787 -0.828995
1  -0.138264 -0.420645  0.560785 -0.560181
2   0.647689 -0.342715  1.083051  0.747294
3   1.523030 -0.802277  1.053802  0.610370
4  -0.234153 -0.161286 -1.377669 -0.020902
5  -0.234137  0.404051 -0.937825  0.117327
7  -0.081366  0.174578  0.513786 -0.591571
8  -0.469474  0.257550  0.515048  0.547097
9   0.542560 -0.074446  3.852731 -0.202193
10 -0.463418  0.053383  0.570891 -0.217681


In [7]:
# Per-column quartiles (25th and 75th percentile) and the interquartile range
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

# Acceptable range per column: 1.5 * IQR below Q1 up to 1.5 * IQR above Q3
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when any of its values falls outside its column's range
outliers_iqr = (df.lt(lower_bound) | df.gt(upper_bound)).any(axis=1)

# Keep only the rows that were not flagged
df_cleaned_iqr = df.loc[~outliers_iqr]

print("Data after removing outliers using IQR:")
print(df_cleaned_iqr.head(10))

Data after removing outliers using IQR:
           A         B         C         D
0   0.496714 -1.415371  0.357787 -0.828995
1  -0.138264 -0.420645  0.560785 -0.560181
2   0.647689 -0.342715  1.083051  0.747294
3   1.523030 -0.802277  1.053802  0.610370
4  -0.234153 -0.161286 -1.377669 -0.020902
5  -0.234137  0.404051 -0.937825  0.117327
7  -0.081366  0.174578  0.513786 -0.591571
8  -0.469474  0.257550  0.515048  0.547097
10 -0.463418  0.053383  0.570891 -0.217681
11 -0.465730 -0.026514  1.135566  1.098777
