In [1]:
import pandas as pd


data = {
'age': [25, 27, None, 35, 40, 23, 36, 120], # 120 is a likely outlier
'salary': [50000, 60000, 55000, 80000, None, 45000, 70000, 1e7],
'city': ['Lahore', 'Karachi', 'Islamabad', 'Karachi', 'Lahore', 'Karachi', 'Islamabad', 'Lahore'],
'gender': ['M', 'F', 'F', 'M', 'M', None, 'F', 'M'],
'purchased': [0,1,0,1,1,0,1,0]
}


df = pd.DataFrame(data)
print(df)

     age      salary       city gender  purchased
0   25.0     50000.0     Lahore      M          0
1   27.0     60000.0    Karachi      F          1
2    NaN     55000.0  Islamabad      F          0
3   35.0     80000.0    Karachi      M          1
4   40.0         NaN     Lahore      M          1
5   23.0     45000.0    Karachi   None          0
6   36.0     70000.0  Islamabad      F          1
7  120.0  10000000.0     Lahore      M          0


In [3]:
# Basic checks
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        7 non-null      float64
 1   salary     7 non-null      float64
 2   city       8 non-null      object 
 3   gender     7 non-null      object 
 4   purchased  8 non-null      int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 452.0+ bytes
None


In [4]:
print(df.describe())

              age        salary  purchased
count    7.000000  7.000000e+00   8.000000
mean    43.714286  1.480000e+06   0.500000
std     34.223356  3.756986e+06   0.534522
min     23.000000  4.500000e+04   0.000000
25%     26.000000  5.250000e+04   0.000000
50%     35.000000  6.000000e+04   0.500000
75%     38.000000  7.500000e+04   1.000000
max    120.000000  1.000000e+07   1.000000


In [5]:
print(df.isna().sum()) # counts missing values

age          1
salary       1
city         0
gender       1
purchased    0
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer
import numpy as np


num_cols = ['age', 'salary']
cat_cols = ['city', 'gender']


# Numeric imputer: use median for robustness to outliers
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])


# Categorical imputer: fill missing with a new category 'Unknown' or the mode
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [7]:
# drop duplicates
df = df.drop_duplicates()

In [8]:
# IQR-based filtering example
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

outliers = df[(df['age'] < lower) | (df['age'] > upper)]
print('Outliers by age:\n', outliers)

# Options: drop, clip, or transform (e.g., log)
df['age_clipped'] = df['age'].clip(lower=lower, upper=upper)

Outliers by age:
      age      salary    city gender  purchased
7  120.0  10000000.0  Lahore      M          0


In [9]:
# Example: age binning, salary log, and an interaction term
import numpy as np

# age bins
df['age_group'] = pd.cut(df['age'], bins=[0,25,35,60,200], labels=['young','mid','senior','very_old'])

# salary log to reduce skewness (add small constant to avoid log(0))
df['salary_log'] = np.log1p(df['salary'])

# interaction
df['age_salary_ratio'] = df['age'] / (df['salary'] / 1e3) # age per thousand salary

In [11]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Example: One-hot encode city (low cardinality)
onehot = OneHotEncoder(handle_unknown='ignore')
encoded = onehot.fit_transform(df[['city']])
print(onehot.get_feature_names_out(['city']))

['city_Islamabad' 'city_Karachi' 'city_Lahore']


In [12]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

num_cols = ['age', 'salary']

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# If you need MinMax for NN input:
mm = MinMaxScaler()
df[num_cols] = mm.fit_transform(df[num_cols])

In [13]:
from sklearn.decomposition import PCA
# Suppose X_scaled is your scaled numeric matrix
# pca = PCA(n_components=0.95) # keep 95% variance
# X_pca = pca.fit_transform(X_scaled)
# print('Explained variance ratio:', pca.explained_variance_ratio_.cumsum())

In [14]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE


# Univariate selection
selector = SelectKBest(score_func=f_classif, k=5)
# selector.fit(X_train, y_train)
# X_selected = selector.transform(X_train)


# RFE with logistic regression (wrapper method)
rfe = RFE(estimator=LogisticRegression(max_iter=200), n_features_to_select=5)
# rfe.fit(X_train, y_train)

In [15]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['purchased'])
y = df['purchased']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.