
## 1) Preprocessing Task: Imputation

In [6]:
import pandas as pd
# Load the sample dataset from the local CSV file into a DataFrame.
df_data=pd.read_csv('./csv_files/sample_data.csv')

In [7]:
df_data.head()

Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,60000.0,Female
2,35.0,55000.0,Male
3,40.0,,
4,,65000.0,Female


In [10]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     38 non-null     float64
 1   Salary  37 non-null     float64
 2   Gender  37 non-null     object 
dtypes: float64(2), object(1)
memory usage: 1.3+ KB


In [9]:
# Count the missing values in each column.

df_data.isna().sum()

Age       12
Salary    13
Gender    13
dtype: int64

In [11]:
# Collect the names of the categorical (object-dtype) columns.

categorical_data = [
    column
    for column in df_data.columns
    if df_data[column].dtypes == "object"
]
categorical_data

['Gender']

In [13]:
# Impute missing values of the categorical feature with its most frequent
# value (the first entry returned by mode()).

gender_mode = df_data['Gender'].mode()[0]
df_data['Gender'] = df_data['Gender'].fillna(gender_mode)

In [15]:
df_data['Gender'].isna().sum()

0

In [14]:
# Collect the names of the continuous (non-object dtype) columns.

continous_data = [
    column
    for column in df_data.columns
    if df_data[column].dtypes != "object"
]
continous_data

['Age', 'Salary']

In [16]:
# Impute missing values of the continuous features with each column's mean.

for column in ['Age', 'Salary']:
    column_mean = df_data[column].mean()
    df_data[column] = df_data[column].fillna(column_mean)

In [21]:
# Confirm that no missing values remain in the imputed numeric columns.
for column in ['Age', 'Salary']:
    print(df_data[column].isna().sum())

0
0


## 2) Preprocessing Task: Standardization

In [81]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
# Load the dataset used for the standardization example.
df=pd.read_csv('./csv_files/sample_data_1.csv')

In [82]:
df.head()

Unnamed: 0,Age,Salary,Gender
0,25.0,50000.0,Male
1,30.0,60000.0,Female
2,35.0,55000.0,Male
3,40.0,,
4,,65000.0,Female


In [83]:
# Drop the categorical column; only 'Age' and 'Salary' are used from here on.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run after 'Gender' has already been removed.
df = df.drop(columns=['Gender'], errors='ignore')

In [84]:
df.head()

Unnamed: 0,Age,Salary
0,25.0,50000.0
1,30.0,60000.0
2,35.0,55000.0
3,40.0,
4,,65000.0


In [85]:
# Count the missing records in each column.

df.isna().sum()

Age       5
Salary    6
dtype: int64

In [86]:
# Fill the missing records of each numeric column with that column's mean.

for column in ['Age', 'Salary']:
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [87]:
# Confirm that no missing values remain after filling.
for column in ['Age', 'Salary']:
    print(df[column].isna().sum())

0
0


In [88]:
# Standardize the numeric columns with scikit-learn's StandardScaler.

numeric_columns = ['Age', 'Salary']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [89]:
df.head()

Unnamed: 0,Age,Salary
0,-1.330969,-1.351853
1,-0.440532,0.096561
2,0.449905,-0.627646
3,1.340342,0.0
4,0.0,0.820768


## 3) Preprocessing Task: Handling Categorical Variables

In [75]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Load the dataset whose categorical columns are encoded below.
df_encoding=pd.read_csv('./csv_files/one_hot_encoding_data.csv')

In [72]:
df_encoding.head()

Unnamed: 0,Color,Size
0,Red,Small
1,Blue,Medium
2,Green,Large
3,Blue,Medium
4,Green,Small


In [73]:
df_encoding.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Color   10 non-null     object
 1   Size    10 non-null     object
dtypes: object(2)
memory usage: 292.0+ bytes


In [80]:
# One-hot encode the categorical columns into integer 0/1 indicator columns.

categorical_columns = ['Color', 'Size']
df_encoded = pd.get_dummies(df_encoding, columns=categorical_columns, dtype=int)
df_encoded

Unnamed: 0,Color_Blue,Color_Green,Color_Red,Size_Large,Size_Medium,Size_Small
0,0,0,1,0,0,1
1,1,0,0,0,1,0
2,0,1,0,1,0,0
3,1,0,0,0,1,0
4,0,1,0,0,0,1
5,0,0,1,0,1,0
6,1,0,0,1,0,0
7,0,1,0,0,0,1
8,0,0,1,1,0,0
9,1,0,0,0,1,0


## 4) Preprocessing Task: Outlier Management

In [108]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris measurements into a DataFrame with one column per feature name.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [109]:
df.shape

(150, 4)

In [96]:
def detect_outliers(df):
    """Return the index labels of rows that are IQR outliers in any numeric column.

    A value is treated as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` of its own column, where ``Q1``/``Q3`` are the
    25th/75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan. Only numeric columns are examined; other columns are
        skipped because quantiles are not defined for them.

    Returns
    -------
    list
        Index labels of the outlier rows, each label appearing once, in the
        order in which the labels were first flagged (column by column).
    """
    # A dict is used as an insertion-ordered set: a row that is an outlier in
    # several columns must be reported only once, otherwise the outlier count
    # is inflated and ``df.loc[...]`` shows the same row repeatedly.
    flagged = {}
    for feature in df.select_dtypes(include="number").columns:
        q1 = df[feature].quantile(0.25)
        q3 = df[feature].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        is_outlier = (df[feature] < lower_bound) | (df[feature] > upper_bound)
        flagged.update(dict.fromkeys(df.index[is_outlier]))
    return list(flagged)

In [97]:
# Flag the outlier rows, then report how many were found and show them.
outlier_indices = detect_outliers(df)
outlier_rows = df.loc[outlier_indices]
print(f"\nNumber of outliers detected: {len(outlier_indices)}")
print(outlier_rows)


Number of outliers detected: 4
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
15                5.7               4.4                1.5               0.4
32                5.2               4.1                1.5               0.1
33                5.5               4.2                1.4               0.2
60                5.0               2.0                3.5               1.0


In [110]:
# Build a new frame without the flagged rows; `df` itself is not modified.
df_no_outliers = df.drop(labels=outlier_indices, axis=0)
df_no_outliers.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [111]:
df_no_outliers.shape

(146, 4)

## 5) Preprocessing Task: Cross-Validation

In [113]:
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

def perform_cross_validation(k, random_state=42):
    """Cross-validate a random forest on the digits dataset and print the scores.

    Parameters
    ----------
    k : int
        Value passed to ``cross_val_score`` as ``cv``.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so that repeated runs
        print the same scores. Pass ``None`` to get the previous unseeded
        behaviour.

    Returns
    -------
    None
        The per-fold scores and their mean are printed, not returned.
    """
    digits = load_digits()
    X, y = digits.data, digits.target
    # Seed the forest: without it every run trains different trees and the
    # reported scores cannot be reproduced.
    model = RandomForestClassifier(random_state=random_state)
    scores = cross_val_score(model, X, y, cv=k)
    print(f"Cross-Validation Scores for {k}-fold: {scores}")
    print(f"Mean Cross-Validation Score: {scores.mean():.2f}")

k = 5  # value handed to cross_val_score as `cv` inside perform_cross_validation
perform_cross_validation(k)


Cross-Validation Scores for 5-fold: [0.91944444 0.91388889 0.94707521 0.96657382 0.92479109]
Mean Cross-Validation Score: 0.93
