In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
data = pd.read_csv('noisy_dataset.csv')
data.head()

Unnamed: 0,Age,Income,Height,Pets,Hobby,RandomScore,Buy_House
0,63,126200,165,Dog,Traveling,61.273929,1
1,20,58607,194,Dog,Traveling,-105.161162,0
2,46,47550,198,,Reading,-23.066379,0
3,52,71366,162,Dog,Traveling,-61.197526,1
4,56,121372,153,Cat,Reading,-85.085226,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          5000 non-null   int64  
 1   Income       5000 non-null   int64  
 2   Height       5000 non-null   int64  
 3   Pets         3282 non-null   object 
 4   Hobby        5000 non-null   object 
 5   RandomScore  5000 non-null   float64
 6   Buy_House    5000 non-null   int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 273.6+ KB


In [5]:
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Age,5000.0,,,,43.5932,14.937615,18.0,31.0,43.0,57.0,69.0
Income,5000.0,,,,84769.2102,37663.269954,20000.0,51632.0,83704.5,117799.25,149978.0
Height,5000.0,,,,174.4274,14.323874,150.0,162.0,175.0,187.0,199.0
Pets,3282.0,2.0,Cat,1667.0,,,,,,,
Hobby,5000.0,3.0,Traveling,1697.0,,,,,,,
RandomScore,5000.0,,,,-1.870536,99.817755,-365.568956,-68.284933,-1.592768,66.365399,409.350075
Buy_House,5000.0,,,,0.5314,0.499063,0.0,0.0,1.0,1.0,1.0


#### 🔎 Step 2 — Identifying irrelevant/noisy features without domain knowledge
2A. Mutual Information (MI)

- MI measures how much knowing a feature reduces uncertainty about the target (Buy_House).

- If MI ≈ 0 → feature provides no info.

In [6]:
from sklearn.feature_selection import mutual_info_classif

x = pd.get_dummies(data.drop(columns= "Buy_House"), drop_first= True)
y = data['Buy_House']

mi = mutual_info_classif(x, y, random_state=0)
mi_series = pd.Series(mi, index = x.columns).sort_values(ascending=False)
print(mi_series)

Age                0.035910
Income             0.032610
Hobby_Traveling    0.006457
RandomScore        0.004206
Height             0.003817
Pets_Dog           0.000104
Hobby_Reading      0.000000
dtype: float64


**2B. Correlation with Target (for numeric features only)**

- Helps to see linear relationship with target.

- Absolute correlation near 0 → weak influence.

In [8]:
num_cols = data.select_dtypes(include='number').drop(columns='Buy_House')
corr_with_target = num_cols.corrwith(y).sort_values(ascending= False)
print(corr_with_target)


Age            0.192126
Income         0.177889
RandomScore   -0.002046
Height        -0.038634
dtype: float64


**2C. Feature Importance from Tree-based Models**

- Train a quick RandomForest or DecisionTree.

- The model computes how much each feature helped reduce impurity.

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators= 200, random_state=0)
rf.fit(x,y)
importances = pd.Series(rf.feature_importances_, index=x.columns)
print(importances)

Age                0.238731
Income             0.307816
Height             0.171017
RandomScore        0.231860
Pets_Dog           0.018028
Hobby_Reading      0.015794
Hobby_Traveling    0.016755
dtype: float64


**2D. Recursive Feature Elimination (RFE)**

- Wrapper method: iteratively removes least important features and checks performance.

In [12]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter= 1000)
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(x,y)

selected_features = x.columns[fit.support_]
print("Selected Features: ", list(selected_features))

Selected Features:  ['Age', 'Pets_Dog', 'Hobby_Reading']
