In [None]:
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# Load dataset
# The CSV location is machine-specific, so it can be overridden through the
# DATA_CLEANING_CSV environment variable. The original path remains the
# default, so the notebook behaves exactly as before when the variable is unset.
DATA_PATH = os.environ.get(
    "DATA_CLEANING_CSV",
    r"C:\Users\swapn\Downloads\data_cleaning.csv",
)
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,ID,Age,Income,Score,Name
0,1,39.97,28769.44,685,Person_1
1,2,33.62,43690.32,403,Person_2
2,3,41.48,200000.00,692,Person_3
3,4,50.23,37965.84,545,Person_4
4,5,32.66,47580.71,475,Person_5
...,...,...,...,...,...
95,5,20.36,55779.76,584,Person_5
96,10,37.96,36742.14,696,Person_10
97,98,37.61,52305.88,311,Person_98
98,99,35.05,50873.13,601,Person_99


In [None]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      100 non-null    int64  
 1   Age     100 non-null    float64
 2   Income  100 non-null    float64
 3   Score   100 non-null    int64  
 4   Name    100 non-null    object 
dtypes: float64(2), int64(2), object(1)
memory usage: 4.0+ KB


In [None]:
# Z-score outlier detection on 'Income'
z_scores = np.abs(stats.zscore(df['Income']))
df['Outlier_Z_Score'] = z_scores > 3  # True where |Z-score| exceeds 3

# Collect the flagged rows, then report them (or report that none were found)
outliers_z_score = df.loc[df['Outlier_Z_Score']]

if not outliers_z_score.empty:
    print("Z-Score Outliers:")
    print(outliers_z_score)
else:
    print("No Z-Score outliers detected.")

Z-Score Outliers:
   ID    Age    Income  Score      Name  Outlier_Z_Score
2   3  41.48  200000.0    692  Person_3             True
5   6  32.66  250000.0    338  Person_6             True
8   9  30.31  180000.0    707  Person_9             True


In [None]:
# Display the frame, which now carries the boolean 'Outlier_Z_Score' column.
df

Unnamed: 0,ID,Age,Income,Score,Name,Outlier_Z_Score
0,1,39.97,28769.44,685,Person_1,False
1,2,33.62,43690.32,403,Person_2,False
2,3,41.48,200000.00,692,Person_3,True
3,4,50.23,37965.84,545,Person_4,False
4,5,32.66,47580.71,475,Person_5,False
...,...,...,...,...,...,...
95,5,20.36,55779.76,584,Person_5,False
96,10,37.96,36742.14,696,Person_10,False
97,98,37.61,52305.88,311,Person_98,False
98,99,35.05,50873.13,601,Person_99,False


In [None]:
# IQR-based outlier detection for 'Income'
# First and third quartiles of the income values, and the spread between them
Q1, Q3 = (df['Income'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# A row is flagged when its income lies more than 1.5 * IQR outside the quartiles
df['Outlier_IQR'] = df['Income'].lt(Q1 - 1.5 * IQR) | df['Income'].gt(Q3 + 1.5 * IQR)

# Report the flagged rows, or a message when nothing was flagged
outliers = df.loc[df['Outlier_IQR']]
print("IQR-based Outliers:")
if outliers.empty:
    print("No IQR-based outliers found.")
else:
    print(outliers[['ID', 'Income']])

IQR-based Outliers:
   ID    Income
2   3  200000.0
5   6  250000.0
8   9  180000.0


In [None]:
# Drop the rows flagged as Z-score outliers.
# Boolean indexing yields a filtered slice of the original frame; the explicit
# .copy() makes the result an independent frame, so columns added to it in
# later cells do not raise SettingWithCopyWarning.
df = df[~df['Outlier_Z_Score']].copy()
df

Unnamed: 0,ID,Age,Income,Score,Name,Outlier_Z_Score,Outlier_IQR
0,1,39.97,28769.44,685,Person_1,False,False
1,2,33.62,43690.32,403,Person_2,False,False
3,4,50.23,37965.84,545,Person_4,False,False
4,5,32.66,47580.71,475,Person_5,False,False
6,7,50.79,78292.79,776,Person_7,False,False
...,...,...,...,...,...,...,...
95,5,20.36,55779.76,584,Person_5,False,False
96,10,37.96,36742.14,696,Person_10,False,False
97,98,37.61,52305.88,311,Person_98,False,False
98,99,35.05,50873.13,601,Person_99,False,False


In [None]:
# Collect every row whose (ID, Name) pair occurs more than once;
# keep=False marks all members of a duplicate group, not only the repeats
duplicates = df.loc[df.duplicated(subset=['ID', 'Name'], keep=False)]

# Report the duplicated rows, or a message when there are none
print("\nDuplicates:")
if duplicates.empty:
    print("No duplicates found.")
else:
    print(duplicates)


Duplicates:
    ID    Age    Income  Score       Name  Outlier_Z_Score  Outlier_IQR
4    5  32.66  47580.71    475   Person_5            False        False
9   10  40.43  48883.31    824  Person_10            False        False
95   5  20.36  55779.76    584   Person_5            False        False
96  10  37.96  36742.14    696  Person_10            False        False


In [None]:
# Display the frame again; the duplicate check above only read from df and did not modify it.
df

Unnamed: 0,ID,Age,Income,Score,Name,Outlier_Z_Score,Outlier_IQR
0,1,39.97,28769.44,685,Person_1,False,False
1,2,33.62,43690.32,403,Person_2,False,False
3,4,50.23,37965.84,545,Person_4,False,False
4,5,32.66,47580.71,475,Person_5,False,False
6,7,50.79,78292.79,776,Person_7,False,False
...,...,...,...,...,...,...,...
95,5,20.36,55779.76,584,Person_5,False,False
96,10,37.96,36742.14,696,Person_10,False,False
97,98,37.61,52305.88,311,Person_98,False,False
98,99,35.05,50873.13,601,Person_99,False,False


In [None]:
# Keep a single row per (ID, Name) pair; drop_duplicates retains the first
# occurrence of each pair by default (keep='first')
df_cleaned = df.drop_duplicates(subset=['ID', 'Name'])

# Optional: show the de-duplicated frame
print("\nAfter removing duplicates:")
df_cleaned


After removing duplicates:


Unnamed: 0,ID,Age,Income,Score,Name,Outlier_Z_Score,Outlier_IQR
0,1,39.97,28769.44,685,Person_1,False,False
1,2,33.62,43690.32,403,Person_2,False,False
3,4,50.23,37965.84,545,Person_4,False,False
4,5,32.66,47580.71,475,Person_5,False,False
6,7,50.79,78292.79,776,Person_7,False,False
...,...,...,...,...,...,...,...
93,94,31.72,31313.92,467,Person_94,False,False
94,95,31.08,52597.71,342,Person_95,False,False
97,98,37.61,52305.88,311,Person_98,False,False
98,99,35.05,50873.13,601,Person_99,False,False


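Standardization rescales a feature so that it has a mean of 0 and a standard deviation of 1 (also known as Z-score normalization). Note: the code cell immediately below applies Min-Max scaling; standardization itself is performed in the later `StandardScaler` cell.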

In [None]:
# 3. Data Standardization and Normalization

# Min-Max Scaling
# Fit on the de-duplicated frame (df_cleaned) rather than df, which still
# contains the duplicate (ID, Name) rows, so repeated people do not influence
# the scaling. .assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised when a column is written into a filtered slice,
# and .ravel() flattens the scaler's (n_samples, 1) output into a 1-D column.
scaler_minmax = MinMaxScaler()
df_cleaned = df_cleaned.assign(
    Income_MinMax=scaler_minmax.fit_transform(df_cleaned[['Income']]).ravel()
)
df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Income_MinMax'] = scaler_minmax.fit_transform(df[['Income']])


Unnamed: 0,ID,Age,Income,Score,Name,Outlier_Z_Score,Outlier_IQR,Income_MinMax
0,1,39.97,28769.44,685,Person_1,False,False,0.108516
1,2,33.62,43690.32,403,Person_2,False,False,0.322946
3,4,50.23,37965.84,545,Person_4,False,False,0.240679
4,5,32.66,47580.71,475,Person_5,False,False,0.378855
6,7,50.79,78292.79,776,Person_7,False,False,0.820221
...,...,...,...,...,...,...,...,...
95,5,20.36,55779.76,584,Person_5,False,False,0.496684
96,10,37.96,36742.14,696,Person_10,False,False,0.223093
97,98,37.61,52305.88,311,Person_98,False,False,0.446761
98,99,35.05,50873.13,601,Person_99,False,False,0.426171


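Normalization rescales a feature so that it falls within a specified range, typically [0, 1] (also known as Min-Max scaling). Note: this is what the `MinMaxScaler` cell above does; the code cell immediately below applies Z-score standardization instead.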

In [None]:
# Z-score Standardization
# Fit on the de-duplicated frame (df_cleaned) rather than df, which still
# contains the duplicate (ID, Name) rows, so repeated people do not skew the
# fitted mean and standard deviation. .assign() returns a new frame, which
# avoids the SettingWithCopyWarning raised when a column is written into a
# filtered slice, and .ravel() flattens the scaler's (n_samples, 1) output
# into a 1-D column.
scaler_zscore = StandardScaler()
df_cleaned = df_cleaned.assign(
    Income_ZScore=scaler_zscore.fit_transform(df_cleaned[['Income']]).ravel()
)
df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Income_ZScore'] = scaler_zscore.fit_transform(df[['Income']])


Unnamed: 0,ID,Age,Income,Score,Name,Outlier_Z_Score,Outlier_IQR,Income_MinMax,Income_ZScore
0,1,39.97,28769.44,685,Person_1,False,False,0.108516,-1.492297
1,2,33.62,43690.32,403,Person_2,False,False,0.322946,-0.457910
3,4,50.23,37965.84,545,Person_4,False,False,0.240679,-0.854759
4,5,32.66,47580.71,475,Person_5,False,False,0.378855,-0.188210
6,7,50.79,78292.79,776,Person_7,False,False,0.820221,1.940900
...,...,...,...,...,...,...,...,...,...
95,5,20.36,55779.76,584,Person_5,False,False,0.496684,0.380188
96,10,37.96,36742.14,696,Person_10,False,False,0.223093,-0.939591
97,98,37.61,52305.88,311,Person_98,False,False,0.446761,0.139362
98,99,35.05,50873.13,601,Person_99,False,False,0.426171,0.040037
