In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Seed NumPy's global RNG so the random draws made inside createdata() are
# repeatable when the notebook is run top to bottom on a fresh kernel.
np.random.seed(42)
def createdata(n_rows=20):
  """Build a small synthetic customer DataFrame for the preprocessing demos.

  All values are drawn from NumPy's global ``np.random`` state, so the result
  depends on the current seed/state and every call advances it: consecutive
  calls return different rows unless the seed is reset in between.

  Parameters
  ----------
  n_rows : int, default 20
      Number of rows to generate.

  Returns
  -------
  pandas.DataFrame
      ``n_rows`` rows with columns ``Age`` (int in [18, 70)), ``Salary``
      (int in [30000, 120000)), ``Purchased`` (0 or 1), ``Gender``
      ('Male'/'Female') and ``City``
      ('New York'/'San Francisco'/'Los Angeles').
  """
  # The draw order (Age, Salary, Purchased, Gender, City) decides which random
  # numbers each column receives; keep it stable so seeded output is stable.
  data = {
      'Age': np.random.randint(18, 70, size=n_rows),
      'Salary': np.random.randint(30000, 120000, size=n_rows),
      'Purchased': np.random.choice([0, 1], size=n_rows),
      'Gender': np.random.choice(['Male', 'Female'], size=n_rows),
      'City': np.random.choice(['New York', 'San Francisco', 'Los Angeles'], size=n_rows)
  }

  return pd.DataFrame(data)

# Build the demo frame and preview its first ten rows.
df = createdata()
df.head(10)


Unnamed: 0,Age,Salary,Purchased,Gender,City
0,56,92955,1,Female,San Francisco
1,69,94925,0,Female,San Francisco
2,46,97969,0,Male,New York
3,32,35311,0,Female,Los Angeles
4,60,113104,0,Male,San Francisco
5,25,83707,0,Female,San Francisco
6,38,115305,0,Male,San Francisco
7,56,58693,0,Female,San Francisco
8,36,101932,0,Male,San Francisco
9,40,55658,1,Male,San Francisco


In [None]:
# Dimensions of the demo frame as (rows, columns).
df.shape

(20, 5)

# When you have missing values in data

In [None]:
# Introduce some missing values for demonstration.
# Age and Salary are generated as integer columns, which cannot hold NaN.
# Cast them to float explicitly instead of relying on pandas' implicit upcast
# during the assignment: that upcast is deprecated (recent pandas warns, and
# the warning is hidden by the filterwarnings("ignore") call at the top).
df = df.astype({'Age': 'float64', 'Salary': 'float64'})
df.loc[5, 'Age'] = np.nan
df.loc[10, 'Salary'] = np.nan
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,56.0,92955.0,1,Female,San Francisco
1,69.0,94925.0,0,Female,San Francisco
2,46.0,97969.0,0,Male,New York
3,32.0,35311.0,0,Female,Los Angeles
4,60.0,113104.0,0,Male,San Francisco
5,,83707.0,0,Female,San Francisco
6,38.0,115305.0,0,Male,San Francisco
7,56.0,58693.0,0,Female,San Francisco
8,36.0,101932.0,0,Male,San Francisco
9,40.0,55658.0,1,Male,San Francisco


In [None]:
# Simple Imputation (mean, median, mode) -
#Pros: Fast and straightforward.
#Cons: May introduce bias if missing data isn’t random. --Unfold data science missing data treatment
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['Age']: the in-place form mutates a temporary Series selected from the
# frame, which pandas deprecates and which leaves df unchanged under
# Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,56.0,92955.0,1,Female,San Francisco
1,69.0,94925.0,0,Female,San Francisco
2,46.0,97969.0,0,Male,New York
3,32.0,35311.0,0,Female,Los Angeles
4,60.0,113104.0,0,Male,San Francisco
5,42.105263,83707.0,0,Female,San Francisco
6,38.0,115305.0,0,Male,San Francisco
7,56.0,58693.0,0,Female,San Francisco
8,36.0,101932.0,0,Male,San Francisco
9,40.0,55658.0,1,Male,San Francisco


In [None]:
# Listwise deletion
#Pros: Simple.
#Cons: Losing data (every row with any missing value is discarded).
# The simple-imputation cell earlier in the notebook already fills the demo
# gaps in df, so dropna() on df itself drops nothing when the notebook runs
# top to bottom. Re-create the same two gaps on a float copy so the deletion
# is visible without disturbing df.
df_with_gaps = df.astype({'Age': 'float64', 'Salary': 'float64'})
df_with_gaps.loc[5, 'Age'] = np.nan
df_with_gaps.loc[10, 'Salary'] = np.nan
df_dropped = df_with_gaps.dropna()
df_dropped.head(10)


Unnamed: 0,Age,Salary,Purchased,Gender,City
0,56.0,92955.0,1,Female,San Francisco
1,69.0,94925.0,0,Female,San Francisco
2,46.0,97969.0,0,Male,New York
3,32.0,35311.0,0,Female,Los Angeles
4,60.0,113104.0,0,Male,San Francisco
6,38.0,115305.0,0,Male,San Francisco
7,56.0,58693.0,0,Female,San Francisco
8,36.0,101932.0,0,Male,San Francisco
9,40.0,55658.0,1,Male,San Francisco
11,28.0,48431.0,0,Male,Los Angeles


In [None]:
# Row/column count remaining after listwise deletion.
df_dropped.shape

(18, 5)

In [None]:
# Predictive Imputation - search for MICE imputation as well -
#Pros: More accurate for complex patterns.
#Cons: Computationally intensive, may add noise with high variance.
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=3)
# df was already mean/median-imputed earlier in the notebook, so on a
# top-to-bottom run there is nothing left for KNN to fill. Re-mask the same
# two demo cells on a float copy of the inputs before imputing.
knn_input = df[['Age', 'Salary']].astype('float64')
knn_input.loc[5, 'Age'] = np.nan
knn_input.loc[10, 'Salary'] = np.nan
# NOTE(review): neighbours are searched on unscaled values, so Salary's much
# larger magnitude likely dominates the distance - consider scaling first.
df[['Age', 'Salary']] = knn_imputer.fit_transform(knn_input)

In [None]:
# Preview the frame after the KNN imputation step.
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,56.0,92955.0,1,Female,San Francisco
1,69.0,94925.0,0,Female,San Francisco
2,46.0,97969.0,0,Male,New York
3,32.0,35311.0,0,Female,Los Angeles
4,60.0,113104.0,0,Male,San Francisco
5,42.666667,83707.0,0,Female,San Francisco
6,38.0,115305.0,0,Male,San Francisco
7,56.0,58693.0,0,Female,San Francisco
8,36.0,101932.0,0,Male,San Francisco
9,40.0,55658.0,1,Male,San Francisco


In [None]:
# Add indicator for missingness
#Pros: Allows the model to learn patterns of missingness.
#Cons: Can increase dimensionality and complexity.
# The flag has to be computed while the value is still missing. Earlier cells
# have already imputed Age, so re-mask the demo row first; otherwise isnull()
# is False everywhere and the indicator column is all zeros.
df['Age'] = df['Age'].astype('float64')
df.loc[5, 'Age'] = np.nan
df['Age_missing'] = df['Age'].isnull().astype(int)
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City,Age_missing
0,56.0,92955.0,1,Female,San Francisco,0
1,69.0,94925.0,0,Female,San Francisco,0
2,46.0,97969.0,0,Male,New York,0
3,32.0,35311.0,0,Female,Los Angeles,0
4,60.0,113104.0,0,Male,San Francisco,0
5,,83707.0,0,Female,San Francisco,1
6,38.0,115305.0,0,Male,San Francisco,0
7,56.0,58693.0,0,Female,San Francisco,0
8,36.0,101932.0,0,Male,San Francisco,0
9,40.0,55658.0,1,Male,San Francisco,0


# When you have Categorical Variables in data

In [None]:
# Label Encoding: replace each category with an integer code.
# Pros: compact and easy to apply.
# Cons: the integer codes look ordered, which can mislead models that treat
#       them as magnitudes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Learn the category -> code mapping first, then apply it to the same column.
label_encoder.fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,56,92955,1,0,San Francisco
1,69,94925,0,0,San Francisco
2,46,97969,0,1,New York
3,32,35311,0,0,Los Angeles
4,60,113104,0,1,San Francisco


In [None]:
# One-Hot Encoding: one indicator column per category.
# Pros: well suited to non-ordinal categories; keeps all category information.
# Cons: adds a column per category, so dimensionality grows with cardinality.
# drop_first=True omits the first category's column; a row with every
# remaining City_* flag False stands for that omitted category.
df = pd.get_dummies(
    data=df,
    columns=['City'],
    drop_first=True,
)
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City_New York,City_San Francisco
0,56,92955,1,0,False,True
1,69,94925,0,0,False,True
2,46,97969,0,1,True,False
3,32,35311,0,0,False,False
4,60,113104,0,1,False,True


In [None]:
# Regenerate the frame so the next encoding demos start from the original
# string-valued City column (the one-hot step replaced it with dummy columns).
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()

In [None]:
# Ordinal Encoding: map each category to a hand-picked integer rank.
# Pros: captures order when the categories really are ordered.
# Cons: imposes a ranking, so it is a poor fit for non-ordinal data.
# Any City value missing from the mapping becomes NaN.
city_to_rank = {'New York': 1, 'San Francisco': 2, 'Los Angeles': 3}
df['City'] = df['City'].map(city_to_rank)
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,41,63159,1,Female,1
1,43,43986,0,Female,1
2,42,91858,0,Female,3
3,62,42666,0,Male,3
4,58,68660,0,Male,3


In [None]:
# Target Encoding: replace each category with the mean of the target for it.
# Pros: handles high-cardinality features without adding columns.
# Cons: leaks the target into the features unless the means are computed
#       properly; here they are taken over the full frame.
mean_target = df['Purchased'].groupby(df['City']).mean()
df['City_encoded'] = df['City'].map(mean_target)
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City,City_encoded
0,41,63159,1,Female,1,0.333333
1,43,43986,0,Female,1,0.333333
2,42,91858,0,Female,3,0.0
3,62,42666,0,Male,3,0.0
4,58,68660,0,Male,3,0.0


# When you need to scale features

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Standardization: rescale each feature to mean 0 and variance 1.
# Pros: a good default for roughly normal data and for many models.
# Cons: mean and variance are both sensitive to outliers.
scaler = StandardScaler()
# Fit on the two numeric columns, then overwrite them with the scaled values.
scaler.fit(df[['Age', 'Salary']])
df[['Age', 'Salary']] = scaler.transform(df[['Age', 'Salary']])
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City,City_encoded
0,0.026802,-0.496908,1,Female,1,0.333333
1,0.16081,-1.213082,0,Female,1,0.333333
2,0.093806,0.575092,0,Female,3,0.0
3,1.433886,-1.262388,0,Male,3,0.0
4,1.16587,-0.291428,0,Male,3,0.0


In [None]:
# Regenerate the frame so min-max scaling is shown on unscaled values.
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,50,98840,1,Male,Los Angeles
1,18,84384,1,Male,Los Angeles
2,44,81005,1,Female,New York
3,69,76576,1,Female,New York
4,30,69353,1,Male,Los Angeles


In [None]:
# Normalization: rescale each feature into the range 0-1.
# Pros: bounded output; a good fit for distance-based models.
# Cons: extreme outliers squeeze the remaining values into a narrow band.
normalizer = MinMaxScaler()
age_salary_scaled = normalizer.fit_transform(df[['Age', 'Salary']])
df[['Age', 'Salary']] = age_salary_scaled
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,0.627451,0.767163,1,Male,Los Angeles
1,0.0,0.60404,1,Male,Los Angeles
2,0.509804,0.565911,1,Female,New York
3,1.0,0.515933,1,Female,New York
4,0.235294,0.434428,1,Male,Los Angeles


In [None]:
# Regenerate the frame so robust scaling is shown on unscaled values.
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,19,62711,1,Male,New York
1,18,35539,1,Female,New York
2,65,83351,1,Female,San Francisco
3,29,91267,0,Male,New York
4,22,78354,0,Male,San Francisco


In [None]:
# Robust Scaling: centre on the median and scale by the IQR.
# Pros: outliers have far less influence than with mean/variance scaling.
# Cons: less suited than standardization to normally distributed data.
robust_scaler = RobustScaler()
robust_cols = ['Age', 'Salary']
df[robust_cols] = robust_scaler.fit_transform(df[robust_cols])
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,-0.8,-0.228719,1,Male,New York
1,-0.84,-0.849573,1,Female,New York
2,1.04,0.242885,1,Female,San Francisco
3,-0.4,0.423758,0,Male,New York
4,-0.68,0.128709,0,Male,San Francisco


# When you have outliers in data

In [None]:
# Regenerate the frame so the outlier demos start from raw Salary values.
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,49,75017,1,Female,San Francisco
1,40,96320,1,Female,New York
2,50,57751,0,Male,San Francisco
3,20,108069,0,Female,Los Angeles
4,35,84748,0,Female,Los Angeles


In [None]:
# Outlier detection and treatment with the IQR rule (capping).
# Pros: simple, and effective for mild outliers.
# Cons: can flatten real variation when there are many extreme values.
Q1 = df['Salary'].quantile(0.25)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1
# Fences sit 1.5 * IQR outside the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# The first matching condition wins: values above the upper fence are capped
# to it, values below the lower fence are raised to it, the rest are kept.
df['Salary'] = np.select(
    [df['Salary'] > upper_bound, df['Salary'] < lower_bound],
    [upper_bound, lower_bound],
    default=df['Salary'],
)

In [None]:
# Regenerate the frame so the z-score demo starts from uncapped Salary values.
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,40,97215,1,Female,Los Angeles
1,26,99042,1,Male,New York
2,29,43284,1,Female,New York
3,18,102789,0,Male,San Francisco
4,18,114664,1,Female,Los Angeles


In [None]:
# Z-score method: flag values more than 3 standard deviations from the mean.
# Pros: works well when the data is roughly normal.
# Cons: unreliable on non-normal data; can miss outliers in skewed distributions.
from scipy import stats
df['Salary_zscore'] = stats.zscore(df['Salary'])
is_outlier = df['Salary_zscore'].abs() > 3
# Cast to float so the column can hold NaN, then blank out the flagged rows.
df['Salary'] = df['Salary'].astype(float).mask(is_outlier)

In [None]:
# Regenerate the frame so median replacement is shown on raw Salary values.
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,61,56432,1,Male,Los Angeles
1,34,113285,1,Female,New York
2,55,87854,1,Male,San Francisco
3,24,70262,1,Female,Los Angeles
4,63,67080,1,Female,New York


In [None]:
# Median replacement: overwrite z-score outliers with the column median.
# Pros: keeps the distribution's shape; useful when capping isn’t feasible.
# Cons: distorts the data if the outliers are genuine observations.
# `stats` is scipy.stats, imported in the z-score cell earlier in the notebook.
df['Salary_zscore'] = stats.zscore(df['Salary'])
median_salary = df['Salary'].median()
extreme_rows = df['Salary_zscore'].abs() > 3
# Salary becomes float regardless of whether any row is replaced.
df['Salary'] = df['Salary'].astype(float)
df.loc[extreme_rows, 'Salary'] = median_salary
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City,Salary_zscore
0,61,56432.0,1,Male,Los Angeles,-0.583328
1,34,113285.0,1,Female,New York,1.465523
2,55,87854.0,1,Male,San Francisco,0.549048
3,24,70262.0,1,Female,Los Angeles,-0.084926
4,63,67080.0,1,Female,New York,-0.199598


# Feature Engineering

In [None]:
# Regenerate the frame so feature engineering starts without the helper
# Salary_zscore column added by the outlier cells.
# createdata() draws from the global RNG again, so the rows differ from before.
df = createdata()
df.head(5)

Unnamed: 0,Age,Salary,Purchased,Gender,City
0,25,57532,1,Male,New York
1,45,64349,0,Male,New York
2,53,75445,0,Male,San Francisco
3,43,35713,1,Female,New York
4,25,86178,0,Male,New York


In [None]:
# Binning: derive a categorical Income_Level feature from Salary.
# Pros: simplifies a continuous feature; useful in non-linear models.
# Cons: discards granularity, which can cost model accuracy.
# Three bins between the four edges; salaries outside 0-150000 become NaN.
salary_bins = [0, 50000, 100000, 150000]
income_labels = ['Low', 'Medium', 'High']
df['Income_Level'] = pd.cut(df['Salary'], bins=salary_bins, labels=income_labels)
df.head(10)

Unnamed: 0,Age,Salary,Purchased,Gender,City,Income_Level
0,25,57532,1,Male,New York,Medium
1,45,64349,0,Male,New York,Medium
2,53,75445,0,Male,San Francisco,Medium
3,43,35713,1,Female,New York,Low
4,25,86178,0,Male,New York,Medium
5,67,79407,0,Male,San Francisco,Medium
6,45,100340,0,Male,Los Angeles,High
7,45,80990,0,Male,Los Angeles,Medium
8,54,105672,0,Male,San Francisco,High
9,58,95545,1,Male,San Francisco,Medium


In [None]:
# Polynomial Features
#Pros: Captures complex relationships between variables.
#Cons: Increases dimensionality, risking overfitting.
from sklearn.preprocessing import PolynomialFeatures
poly_input_cols = ['Age', 'Salary']
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[poly_input_cols])
# Reuse df's index so the concat below aligns row-for-row even when df does
# not carry a default 0..n-1 index.
df_poly = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(poly_input_cols),
    index=df.index,
)
# The degree-1 terms repeat the existing Age and Salary columns. Appending
# them would leave df with duplicate column names (df['Age'] would then
# return two columns), so only the new higher-order terms are added.
df = pd.concat([df, df_poly.drop(columns=poly_input_cols)], axis=1)
df.head()

Unnamed: 0,Age,Salary,Purchased,Gender,City,Income_Level,Age.1,Salary.1,Age^2,Age Salary,Salary^2
0,25,57532,1,Male,New York,Medium,25.0,57532.0,625.0,1438300.0,3309931000.0
1,45,64349,0,Male,New York,Medium,45.0,64349.0,2025.0,2895705.0,4140794000.0
2,53,75445,0,Male,San Francisco,Medium,53.0,75445.0,2809.0,3998585.0,5691948000.0
3,43,35713,1,Female,New York,Low,43.0,35713.0,1849.0,1535659.0,1275418000.0
4,25,86178,0,Male,New York,Medium,25.0,86178.0,625.0,2154450.0,7426648000.0
