In [145]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.model_selection import train_test_split

from google.colab import files
x = files.upload()

# Load the dataset
file_path = list(x.keys())[0]
df = pd.read_excel(file_path)

# Preview the dataset
print("\nDataset Head:")
print(df.head())

Saving Book3.xlsx to Book3 (16).xlsx

Dataset Head:
   ID   Age   Salary  Gender   Department  Score
0   1  25.0  50000.0    Male        Sales   85.0
1   2  30.0  60000.0  Female  Engineering   90.0
2   3  28.0  55000.0    Male           HR  105.0
3   4  22.0      NaN  Female        Sales   70.0
4   5  35.0  75000.0     NaN  Engineering    NaN


Upload and Load the Data :

This code uploads a file using files.upload() from Google Colab and loads it into a Pandas DataFrame. So I upload the file and load it. Then displays the first few rows to preview the data.

In [146]:
print("Dataset Info:")
print(df.info())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          10 non-null     int64  
 1   Age         9 non-null      float64
 2   Salary      9 non-null      float64
 3   Gender      9 non-null      object 
 4   Department  10 non-null     object 
 5   Score       9 non-null      float64
dtypes: float64(3), int64(1), object(2)
memory usage: 608.0+ bytes
None


Check data types and missing values.

In [147]:
# 1. Handling Missing Values:
# Identify missing values
print("Missing values before imputation:\n", df.isnull().sum())

Missing values before imputation:
 ID            0
Age           1
Salary        1
Gender        1
Department    0
Score         1
dtype: int64


In [148]:
# Impute missing values for numerical features with mean
df['Age_mean'] = df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary_mean'] = df['Salary'].fillna(df['Salary'].mean(), inplace=True)
df['Score_mean'] = df['Score'].fillna(df['Score'].mean(), inplace=True)

# Impute missing values for categorical features with mode
df['Gender_mode'] = df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

df.head()
print("Missing values after imputation:\n", df.isnull().sum())

Missing values after imputation:
 ID              0
Age             0
Salary          0
Gender          0
Department      0
Score           0
Age_mean       10
Salary_mean    10
Score_mean     10
Gender_mode    10
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age_mean'] = df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary_mean'] = df['Salary'].fillna(df['Salary'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

1. Handling Missing Values:

Checks for missing values in each column then identify and impute missing values with mean for numerical features and mode for categorical features.

In [149]:
# 2. Scaling Data:
scaler = StandardScaler()

# Scale numerical features using Z-score:
for col in num_features:
    df[f"{col}_Z"] = scaler.fit_transform(df[[col]])

# Display results
print("Data after standardization (Z-score):")
print(df.head())

# Scale numerical features using Min-Max scaling:
minmax_scaler = MinMaxScaler()
for col in num_features:
    df[f"{col}_MinMax"] = minmax_scaler.fit_transform(df[[col]])

# Display results
print("\nData after Min-Max Scaling:")
print(df.head())

Data after standardization (Z-score):
   ID   Age        Salary  Gender   Department       Score Age_mean  \
0   1  25.0  50000.000000    Male        Sales   85.000000     None   
1   2  30.0  60000.000000  Female  Engineering   90.000000     None   
2   3  28.0  55000.000000    Male           HR  105.000000     None   
3   4  22.0  62222.222222  Female        Sales   70.000000     None   
4   5  35.0  75000.000000    Male  Engineering   85.333333     None   

  Salary_mean Score_mean Gender_mode      ID_Z     Age_Z      Salary_Z  \
0        None       None        None -1.566699 -1.004472 -4.788959e-01   
1        None       None        None -1.218544  0.022829 -8.707198e-02   
2        None       None        None -0.870388 -0.388091 -2.829839e-01   
3        None       None        None -0.522233 -1.620852 -2.850894e-16   
4        None       None        None -0.174078  1.050130  5.006639e-01   

    Score_Z  
0 -0.029743  
1  0.416401  
2  1.754834  
3 -1.368175  
4  0.000000  

Data 

2. Scaling Data:

Scale numerical features using standardization with Z-score and normalization with Min-Max scaling.

In [150]:
# 3. Handling Noise:
# Inject random noise into one of the numerical features:
noise_col = num_features[0]
noise = np.random.normal(0, 0.5, df.shape[0])
df[f"{noise_col}_Noise"] = df[noise_col] + noise

# Smooth noise using a rolling mean:
df[f"{noise_col}_Smoothed"] = df[f"{noise_col}_Noise"].rolling(window=3, center=True).mean().fillna(df[f"{noise_col}_Noise"])

# Display results
print("Noisy Data Head:")
print(df.head())

Noisy Data Head:
   ID   Age        Salary  Gender   Department       Score Age_mean  \
0   1  25.0  50000.000000    Male        Sales   85.000000     None   
1   2  30.0  60000.000000  Female  Engineering   90.000000     None   
2   3  28.0  55000.000000    Male           HR  105.000000     None   
3   4  22.0  62222.222222  Female        Sales   70.000000     None   
4   5  35.0  75000.000000    Male  Engineering   85.333333     None   

  Salary_mean Score_mean Gender_mode      ID_Z     Age_Z      Salary_Z  \
0        None       None        None -1.566699 -1.004472 -4.788959e-01   
1        None       None        None -1.218544  0.022829 -8.707198e-02   
2        None       None        None -0.870388 -0.388091 -2.829839e-01   
3        None       None        None -0.522233 -1.620852 -2.850894e-16   
4        None       None        None -0.174078  1.050130  5.006639e-01   

    Score_Z  ID_MinMax  Age_MinMax  Salary_MinMax  Score_MinMax  ID_Noise  \
0 -0.029743   0.000000    0.166667

3. Handling Noise:

Random noise is added to one numerical column to simulate noisy data.
Noise is generated from a normal distribution with mean 0 and standard deviation 0.5. A new column is created with the noisy version of the original column. A rolling mean with a window size of 3 is applied to smooth the noise. Missing values introduced by rolling are filled with original noisy values.

In [151]:
# 4. Handling Outliers:
# Detect outliers using Z-score
for col in num_features:
    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
    df[f"{col}_Outlier"] = z_scores > 3

print("Data with Outliers Head:")
print(df.head())

# Remove outliers for demonstration:
# Applying all() to the boolean condition (z_scores <= 3) to filter the DataFrame:
df_no_outliers = df[(z_scores <= 3)]

# Display results
print("\nData without Outliers Head:")
print(df_no_outliers.head())

Data with Outliers Head:
   ID   Age        Salary  Gender   Department       Score Age_mean  \
0   1  25.0  50000.000000    Male        Sales   85.000000     None   
1   2  30.0  60000.000000  Female  Engineering   90.000000     None   
2   3  28.0  55000.000000    Male           HR  105.000000     None   
3   4  22.0  62222.222222  Female        Sales   70.000000     None   
4   5  35.0  75000.000000    Male  Engineering   85.333333     None   

  Salary_mean Score_mean Gender_mode  ...  ID_MinMax  Age_MinMax  \
0        None       None        None  ...   0.000000    0.166667   
1        None       None        None  ...   0.111111    0.444444   
2        None       None        None  ...   0.222222    0.333333   
3        None       None        None  ...   0.333333    0.000000   
4        None       None        None  ...   0.444444    0.722222   

   Salary_MinMax  Score_MinMax  ID_Noise  ID_Smoothed  ID_Outlier  \
0       0.363636      0.500000  0.375199     0.375199       False   
1

4. Handling Outliers:

Detect outliers using Z-score then Remove outliers for demonstration and then Applying all() to the boolean condition (z_scores <= 3) to filter the DataFrame.

In [136]:
# 5. Feature Selection:
Score_col = 'Score'

# Convert categorical features to numerical using get_dummies
# Check the current columns in the DataFrame
print(df.columns)

cat_features = [col for col in df.columns if col in ['Gender', 'Department'] and col != Score_col]
df = pd.get_dummies(df, columns=cat_features, drop_first=True)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Impute missing values using SimpleImputer before feature selection
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# i) Filter Method - Mutual Information (using f_regression for continuous target)
from sklearn.feature_selection import f_regression, SelectKBest
selector_filter = SelectKBest(score_func=f_regression, k=3)
X_train_filter = selector_filter.fit_transform(X_train, y_train)

# ii) Wrapper Method - Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
model = LinearRegression()

n_features_to_select = 5
selector_rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
# Initialize RFE with the model
X_train_rfe = selector_rfe.fit_transform(X_train, y_train)

# iii) Embedded Method - Lasso Regression
from sklearn.linear_model import LassoCV
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
selected_features = np.where(lasso.coef_ != 0)[0]

# Display results
print("\nSelected features by Mutual Information:", selector_filter.get_support(indices=True))
print("\nSelected features by RFE:", selector_rfe.get_support(indices=True))
print("\nSelected features by Lasso Regression:", selected_features)

Index(['ID', 'Age', 'Salary', 'Score', 'Age_mean', 'Salary_mean', 'Score_mean',
       'Gender_mode', 'ID_Z', 'Age_Z', 'Salary_Z', 'Score_Z', 'ID_MinMax',
       'Age_MinMax', 'Salary_MinMax', 'Score_MinMax', 'ID_Noise',
       'ID_Smoothed', 'ID_Outlier', 'Age_Outlier', 'Salary_Outlier',
       'Score_Outlier', 'Gender_Male', 'Department_HR',
       'Department_Management', 'Department_Sales'],
      dtype='object')

Selected features by Mutual Information: [ 6 18 20]

Selected features by RFE: [ 6 10 17 18 20]

Selected features by Lasso Regression: [2]


