# Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

# Load Dataset

In [2]:
ad = pd.read_csv('adult_with_headers.csv')
ad

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


# EDA

In [3]:
ad.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
ad.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [5]:
ad.shape

(32561, 15)

In [6]:
ad.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

- There is no missing valuse so no need to handle missing value

# 	Applying scaling techniques to numerical features:

- Standard Scalar

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
ad = pd.DataFrame(ad)

In [9]:
ad.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
scaler = StandardScaler()

In [11]:
numfeat = ad.select_dtypes(include=['int64']).columns

In [12]:
columns_to_scale = ['age','education_num','capital_gain','capital_loss','hours_per_week']

In [13]:
# Fit and transform the data
ad[columns_to_scale] = scaler.fit_transform(ad[columns_to_scale])

In [14]:
ad

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,77516,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,83311,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.145920,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,215646,HS-grad,-0.420060,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.145920,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,234721,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.145920,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,338409,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.145920,-0.21666,-0.035429,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849080,Private,257302,Assoc-acdm,0.746039,Married-civ-spouse,Tech-support,Wife,White,Female,-0.145920,-0.21666,-0.197409,United-States,<=50K
32557,0.103983,Private,154374,HS-grad,-0.420060,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,-0.145920,-0.21666,-0.035429,United-States,>50K
32558,1.423610,Private,151910,HS-grad,-0.420060,Widowed,Adm-clerical,Unmarried,White,Female,-0.145920,-0.21666,-0.035429,United-States,<=50K
32559,-1.215643,Private,201490,HS-grad,-0.420060,Never-married,Adm-clerical,Own-child,White,Male,-0.145920,-0.21666,-1.655225,United-States,<=50K


- MINMAX Scalar

In [15]:
from sklearn.preprocessing import MinMaxScaler

In [16]:
Scaler = MinMaxScaler()

In [17]:
columns_to_scale = ['age','education_num','capital_gain','capital_loss','hours_per_week']

In [18]:
# Fit and transform the data
ad[columns_to_scale] = Scaler.fit_transform(ad[columns_to_scale])

In [19]:
ad

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.301370,State-gov,77516,Bachelors,0.800000,Never-married,Adm-clerical,Not-in-family,White,Male,0.021740,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,83311,Bachelors,0.800000,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.000000,0.0,0.122449,United-States,<=50K
2,0.287671,Private,215646,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.000000,0.0,0.397959,United-States,<=50K
3,0.493151,Private,234721,11th,0.400000,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.000000,0.0,0.397959,United-States,<=50K
4,0.150685,Private,338409,Bachelors,0.800000,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.000000,0.0,0.397959,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,Private,257302,Assoc-acdm,0.733333,Married-civ-spouse,Tech-support,Wife,White,Female,0.000000,0.0,0.377551,United-States,<=50K
32557,0.315068,Private,154374,HS-grad,0.533333,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.000000,0.0,0.397959,United-States,>50K
32558,0.561644,Private,151910,HS-grad,0.533333,Widowed,Adm-clerical,Unmarried,White,Female,0.000000,0.0,0.397959,United-States,<=50K
32559,0.068493,Private,201490,HS-grad,0.533333,Never-married,Adm-clerical,Own-child,White,Male,0.000000,0.0,0.193878,United-States,<=50K


## Discuss the scenarios where each scaling technique is preferred and why.

 1. Standard Scaling (Z-score Normalization)
 Preferred when:
 Data is normally distributed or approximately normal.
 Features have different units or scales (e.g., age in years, income in dollars).
 Working with algorithms sensitive to distance or covariance (e.g., SVM, K-means, PCA).
 You don’t want to distort relationships between features.
 Why: Centers data around 0 with unit variance, ensuring features with larger magnitudes don't dominate.

 2. Min-Max Scaling (Normalization)
 Preferred when:
 Data is not normally distributed or has outliers.
 You need features on a fixed range (e.g., [0, 1]), especially for neural networks or algorithms like KNN.
 Preserving relationships between features is important.
Why: Scales features to a fixed range, ensuring the model treats all features equally, but sensitive to outliers.

# Encoding Techniques:

- One-Hot Encoding to categorical variables with less than 5 categories

In [20]:
# Get unique values for each column
u = ad.apply(pd.Series.unique)
u

age               [0.30136986301369867, 0.452054794520548, 0.287...
workclass         [ State-gov,  Self-emp-not-inc,  Private,  Fed...
fnlwgt            [77516, 83311, 215646, 234721, 338409, 284582,...
education         [ Bachelors,  HS-grad,  11th,  Masters,  9th, ...
education_num     [0.8, 0.5333333333333333, 0.4, 0.8666666666666...
marital_status    [ Never-married,  Married-civ-spouse,  Divorce...
occupation        [ Adm-clerical,  Exec-managerial,  Handlers-cl...
relationship      [ Not-in-family,  Husband,  Wife,  Own-child, ...
race              [ White,  Black,  Asian-Pac-Islander,  Amer-In...
sex                                                [ Male,  Female]
capital_gain      [0.02174021740217402, 0.0, 0.1408414084140841,...
capital_loss      [0.0, 0.4687786960514234, 0.3232323232323233, ...
hours_per_week    [0.3979591836734694, 0.12244897959183676, 0.15...
native_country    [ United-States,  Cuba,  Jamaica,  India,  ?, ...
income                                          

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
categorical_columns = ad.select_dtypes(include=['object']).nunique()
columns_to_encode = categorical_columns[categorical_columns < 5].index

# Apply One-Hot Encoding to these columns
ad = pd.get_dummies(ad, columns=columns_to_encode, drop_first=False)

# Ensure output is in 0/1 format
ad

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0.301370,State-gov,77516,Bachelors,0.800000,Never-married,Adm-clerical,Not-in-family,White,0.021740,0.0,0.397959,United-States,False,True,True,False
1,0.452055,Self-emp-not-inc,83311,Bachelors,0.800000,Married-civ-spouse,Exec-managerial,Husband,White,0.000000,0.0,0.122449,United-States,False,True,True,False
2,0.287671,Private,215646,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,0.000000,0.0,0.397959,United-States,False,True,True,False
3,0.493151,Private,234721,11th,0.400000,Married-civ-spouse,Handlers-cleaners,Husband,Black,0.000000,0.0,0.397959,United-States,False,True,True,False
4,0.150685,Private,338409,Bachelors,0.800000,Married-civ-spouse,Prof-specialty,Wife,Black,0.000000,0.0,0.397959,Cuba,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,Private,257302,Assoc-acdm,0.733333,Married-civ-spouse,Tech-support,Wife,White,0.000000,0.0,0.377551,United-States,True,False,True,False
32557,0.315068,Private,154374,HS-grad,0.533333,Married-civ-spouse,Machine-op-inspct,Husband,White,0.000000,0.0,0.397959,United-States,False,True,False,True
32558,0.561644,Private,151910,HS-grad,0.533333,Widowed,Adm-clerical,Unmarried,White,0.000000,0.0,0.397959,United-States,True,False,True,False
32559,0.068493,Private,201490,HS-grad,0.533333,Never-married,Adm-clerical,Own-child,White,0.000000,0.0,0.193878,United-States,False,True,True,False


### LabelEncoder with more the 5 catogery

In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
df = pd.DataFrame(ad)

In [25]:
ad.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0.30137,State-gov,77516,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,0.02174,0.0,0.397959,United-States,False,True,True,False
1,0.452055,Self-emp-not-inc,83311,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,0.0,0.0,0.122449,United-States,False,True,True,False
2,0.287671,Private,215646,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,0.0,0.0,0.397959,United-States,False,True,True,False
3,0.493151,Private,234721,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,0.0,0.0,0.397959,United-States,False,True,True,False
4,0.150685,Private,338409,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,0.0,0.0,0.397959,Cuba,True,False,True,False


In [26]:
label_encoder = LabelEncoder()

In [27]:
columns_to_encode = ['workclass', 'education','marital_status','occupation','relationship','race','native_country']

In [28]:
for col in columns_to_encode:
    ad[col] = label_encoder.fit_transform(ad[col])


In [29]:
ad

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K
0,0.301370,7,77516,9,0.800000,4,1,1,4,0.021740,0.0,0.397959,39,False,True,True,False
1,0.452055,6,83311,9,0.800000,2,4,0,4,0.000000,0.0,0.122449,39,False,True,True,False
2,0.287671,4,215646,11,0.533333,0,6,1,4,0.000000,0.0,0.397959,39,False,True,True,False
3,0.493151,4,234721,1,0.400000,2,6,0,2,0.000000,0.0,0.397959,39,False,True,True,False
4,0.150685,4,338409,9,0.800000,2,10,5,2,0.000000,0.0,0.397959,5,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,257302,7,0.733333,2,13,5,4,0.000000,0.0,0.377551,39,True,False,True,False
32557,0.315068,4,154374,11,0.533333,2,7,0,4,0.000000,0.0,0.397959,39,False,True,False,True
32558,0.561644,4,151910,11,0.533333,6,1,4,4,0.000000,0.0,0.397959,39,True,False,True,False
32559,0.068493,4,201490,11,0.533333,4,1,3,4,0.000000,0.0,0.193878,39,False,True,True,False


### Discuss the pros and cons of One-Hot Encoding and Label Encoding.

### *One-Hot Encoding*  
*Pros*:
- No implicit ordering, suitable for nominal data.
- Works well for small to medium categories.
- Prevents models from assuming category order.

*Cons*:
- Increases dimensionality, leading to sparse matrices.
- Can be inefficient for high-cardinality features (many categories).

*Best For*:
- Nominal data (unordered categories).
- Small to medium-sized categorical features.

### *Label Encoding*  
*Pros*:
- Compact (uses one column).
- Efficient for ordinal data (ordered categories).
- Faster computation for large datasets.

*Cons*:
- Imposes an ordinal relationship on nominal data.
- Can mislead models by implying order where none exists.

*Best For*:
- Ordinal data (ordered categories).
- Large datasets with high-cardinality features.

# 3. Feature Engineering:

In [30]:
ad['age_group'] = pd.cut(ad['age'], bins=[0,30,50,100], labels=['young','Middle-aged','Senior'])
ad['hours_per_week_log'] = np.log1p(ad['hours_per_week'])

In [31]:
ad

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age_group,hours_per_week_log
0,0.301370,7,77516,9,0.800000,4,1,1,4,0.021740,0.0,0.397959,39,False,True,True,False,young,0.335013
1,0.452055,6,83311,9,0.800000,2,4,0,4,0.000000,0.0,0.122449,39,False,True,True,False,young,0.115513
2,0.287671,4,215646,11,0.533333,0,6,1,4,0.000000,0.0,0.397959,39,False,True,True,False,young,0.335013
3,0.493151,4,234721,1,0.400000,2,6,0,2,0.000000,0.0,0.397959,39,False,True,True,False,young,0.335013
4,0.150685,4,338409,9,0.800000,2,10,5,2,0.000000,0.0,0.397959,5,True,False,True,False,young,0.335013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,257302,7,0.733333,2,13,5,4,0.000000,0.0,0.377551,39,True,False,True,False,young,0.320307
32557,0.315068,4,154374,11,0.533333,2,7,0,4,0.000000,0.0,0.397959,39,False,True,False,True,young,0.335013
32558,0.561644,4,151910,11,0.533333,6,1,4,4,0.000000,0.0,0.397959,39,True,False,True,False,young,0.335013
32559,0.068493,4,201490,11,0.533333,4,1,3,4,0.000000,0.0,0.193878,39,False,True,True,False,young,0.177206


# Skewmess

In [42]:
# Identify numerical columns
numerical_columns = ad.select_dtypes(include=[np.number]).columns
print("Numerical columns:", numerical_columns)

# Check skewness
for col in numerical_columns:
    skewness = ad[col].skew()
    print(f"Skewness of {col}: {skewness}")

Numerical columns: Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country',
       'hours_per_week_log', 'outlier'],
      dtype='object')
Skewness of age: 0.5587433694130481
Skewness of workclass: -0.7520240119541499
Skewness of fnlwgt: 1.4469800945789826
Skewness of education: -0.9340424374279611
Skewness of education_num: -0.31167586791022756
Skewness of marital_status: -0.01350813802823499
Skewness of occupation: 0.11458331643295894
Skewness of relationship: 0.7868177781306165
Skewness of race: -2.4353862665623387
Skewness of capital_gain: 11.953847687699781
Skewness of capital_loss: 4.594629121679695
Skewness of hours_per_week: 0.22764253680450075
Skewness of native_country: -3.658303294717141
Skewness of hours_per_week_log: -0.37765229618573604
Skewness of outlier: -9.843749744977995


# Feature Selection:

# Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.

In [33]:
from sklearn.ensemble import IsolationForest

In [34]:
numfeats = ad.select_dtypes(include=['int64','float64']).columns

In [35]:
# Applying isolationForest

isolationforest = IsolationForest(contamination=0.01, random_state=42)
outliers = isolationforest.fit_predict(ad[numfeats])

In [36]:
ad['outlier'] = outliers
print(ad[['outlier']].value_counts())

outlier
 1         32235
-1           326
Name: count, dtype: int64


In [37]:
# Filter the outliers

data_no_outliers = ad[ad['outlier']==1]
data_no_outliers

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age_group,hours_per_week_log,outlier
0,0.301370,7,77516,9,0.800000,4,1,1,4,0.021740,0.0,0.397959,39,False,True,True,False,young,0.335013,1
1,0.452055,6,83311,9,0.800000,2,4,0,4,0.000000,0.0,0.122449,39,False,True,True,False,young,0.115513,1
2,0.287671,4,215646,11,0.533333,0,6,1,4,0.000000,0.0,0.397959,39,False,True,True,False,young,0.335013,1
3,0.493151,4,234721,1,0.400000,2,6,0,2,0.000000,0.0,0.397959,39,False,True,True,False,young,0.335013,1
4,0.150685,4,338409,9,0.800000,2,10,5,2,0.000000,0.0,0.397959,5,True,False,True,False,young,0.335013,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,257302,7,0.733333,2,13,5,4,0.000000,0.0,0.377551,39,True,False,True,False,young,0.320307,1
32557,0.315068,4,154374,11,0.533333,2,7,0,4,0.000000,0.0,0.397959,39,False,True,False,True,young,0.335013,1
32558,0.561644,4,151910,11,0.533333,6,1,4,4,0.000000,0.0,0.397959,39,True,False,True,False,young,0.335013,1
32559,0.068493,4,201490,11,0.533333,4,1,3,4,0.000000,0.0,0.193878,39,False,True,True,False,young,0.177206,1


In [38]:
# view outlier only

outliers_only = ad[ad['outlier']==1]
outliers_only

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Female,sex_ Male,income_ <=50K,income_ >50K,age_group,hours_per_week_log,outlier
0,0.301370,7,77516,9,0.800000,4,1,1,4,0.021740,0.0,0.397959,39,False,True,True,False,young,0.335013,1
1,0.452055,6,83311,9,0.800000,2,4,0,4,0.000000,0.0,0.122449,39,False,True,True,False,young,0.115513,1
2,0.287671,4,215646,11,0.533333,0,6,1,4,0.000000,0.0,0.397959,39,False,True,True,False,young,0.335013,1
3,0.493151,4,234721,1,0.400000,2,6,0,2,0.000000,0.0,0.397959,39,False,True,True,False,young,0.335013,1
4,0.150685,4,338409,9,0.800000,2,10,5,2,0.000000,0.0,0.397959,5,True,False,True,False,young,0.335013,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,4,257302,7,0.733333,2,13,5,4,0.000000,0.0,0.377551,39,True,False,True,False,young,0.320307,1
32557,0.315068,4,154374,11,0.533333,2,7,0,4,0.000000,0.0,0.397959,39,False,True,False,True,young,0.335013,1
32558,0.561644,4,151910,11,0.533333,6,1,4,4,0.000000,0.0,0.397959,39,True,False,True,False,young,0.335013,1
32559,0.068493,4,201490,11,0.533333,4,1,3,4,0.000000,0.0,0.193878,39,False,True,True,False,young,0.177206,1


In [39]:
# Drop the "outlier" columns if it is no longer needed

data_no_outliers = data_no_outliers.drop(columns=['outlier'])

In [45]:
#!pip install ppscore

In [46]:
#!pip install --upgrade pip setuptools wheel

In [47]:
# Import the PPS library
#import ppscore as pps

# Compute the PPS matrix for the dataset
#pps_matrix = pps.matrix(cleaned_data)

# Display the top 10 feature relationships with the highest PPS
#pps_matrix[['x', 'y', 'ppscore']].sort_values(by='ppscore', ascending=False).head(10)

# How Outliers Can Affect Model Performance

Outliers are data points that differ significantly from the rest of the data and can negatively impact the performance of machine learning models in the following ways:

Biasing Model Parameters:

Many machine learning algorithms, like Linear Regression and Logistic Regression, rely on the distribution of the data. Outliers can skew the results by distorting the true relationships between features and target variables, leading to biased model coefficients or predictions.
Model Overfitting:

Outliers can cause a model to overfit the data, especially if the algorithm doesn't differentiate well between outliers and normal data points. The model might "learn" the noise (outliers) as meaningful patterns, resulting in poor generalization to new data.
Reduced Model Accuracy:

Algorithms like K-Nearest Neighbors or Decision Trees can be heavily affected by outliers, as they may distort distance metrics (in KNN) or lead to splits that don't represent the true structure of the data (in Decision Trees).
Misleading Evaluation Metrics:

Outliers can distort model evaluation metrics like Mean Squared Error (MSE) or R-squared. These metrics can be inflated or deflated by outliers, leading to misleading conclusions about model performance.

# Conclusion
- Encoding: Use One-Hot Encoding for non-ordinal categories and Label Encoding for ordinal ones or to avoid high dimensionality.
- Feature Engineering: Domain-specific feature creation and handling skewed data improve model performance.
- Feature Selection: Isolation Forests remove noise, and PPS identifies meaningful relationships, providing a robust preprocessing pipeline.