In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import numpy as np
import ppscore as pps

# Load dataset
# NOTE(review): this is an absolute path into a temporary zip-extraction
# directory on one machine; the notebook will not run anywhere else.
# Point DATA_PATH at a stable copy of adult_with_headers.csv.
DATA_PATH = "C:\\Users\\user\\AppData\\Local\\Temp\\2976f667-1c67-4d68-b2e1-2de44c2234ef_EDA2.zip.4ef\\EDA2\\adult_with_headers.csv"
df = pd.read_csv(DATA_PATH)

# Display basic information
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only adds a stray "None" line after the report.
df.info()

print("\nDataset Summary Statistics:")
print(df.describe(include='all'))

# Check for missing values (counts NaN/None per column)
print("\nMissing Values:")
print(df.isnull().sum())

# Handle missing values
# Numeric columns are filled with their median, categorical columns with
# their most frequent value (first mode). Only the four columns listed here
# are touched; nulls in any other column are left as they are.
# The info() output recorded for this run reports 32561 non-null values in
# every column, so on this data the call changes nothing.
# NOTE(review): the category labels appear with a leading space (e.g.
# "education_ Prof-school"), so missing entries may be encoded as a
# placeholder string rather than NaN -- confirm against the raw file.
fill_values = {
    'age': df['age'].median(),
    'workclass': df['workclass'].mode()[0],
    'education': df['education'].mode()[0],
    'hours_per_week': df['hours_per_week'].median(),
}
df.fillna(value=fill_values, inplace=True)

# Apply scaling techniques
numerical_cols = ['age', 'hours_per_week', 'capital_gain', 'capital_loss']

# Snapshot the unscaled values first. Scaling below overwrites these columns
# in df, and the engineered features at the end of this section must be
# computed from the original values, not from numbers squashed into [0, 1].
raw_numerical = df[numerical_cols].copy()

# Standard Scaling
# Kept in its own frame: writing it into df and then min-max scaling the same
# columns (as was done before) simply discards the standardized values.
standard_scaler = StandardScaler()
df_standard_scaled = pd.DataFrame(
    standard_scaler.fit_transform(raw_numerical),
    columns=numerical_cols,
    index=df.index,
)

# Min-Max Scaling
# df keeps the min-max scaled columns. Min-max scaling is unaffected by a prior
# standardization (up to floating-point rounding), so these values match
# what the previous standard-then-min-max sequence left in df.
minmax_scaler = MinMaxScaler()
df[numerical_cols] = minmax_scaler.fit_transform(raw_numerical)

# Apply One-Hot Encoding
categorical_cols_small = ['workclass', 'education']
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
one_hot_encoded = one_hot_encoder.fit_transform(df[categorical_cols_small])
# Reuse df's index so the concat below aligns row-for-row even if df no
# longer carries a default RangeIndex.
df_encoded = pd.DataFrame(
    one_hot_encoded,
    columns=one_hot_encoder.get_feature_names_out(categorical_cols_small),
    index=df.index,
)

# Replace the original categorical columns with their one-hot encoded columns
df = pd.concat([df.drop(columns=categorical_cols_small), df_encoded], axis=1)

# Apply Label Encoding
categorical_cols_large = ['occupation', 'native_country']
# One encoder per column, kept by name, so every integer code can later be
# mapped back with label_encoders[col].inverse_transform(...). After the loop
# `label_encoder` still refers to the encoder of the last column.
label_encoders = {}
for col in categorical_cols_large:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Feature Engineering
# Computed from the unscaled snapshot: subtracting the raw education_num from
# a [0, 1]-scaled age, or taking log1p of an already scaled capital_gain,
# yields values with no interpretable meaning.
df['work_experience'] = raw_numerical['age'] - df['education_num']
# NOTE(review): this divides hours by (capital_gain + 1), which is not an
# income-per-hour quantity despite the name -- formula left unchanged, confirm
# the intended definition.
df['income_per_hour'] = raw_numerical['hours_per_week'] / (raw_numerical['capital_gain'] + 1)
df['log_capital_gain'] = np.log1p(raw_numerical['capital_gain'])

# Outlier removal (Isolation Forest)
# This step filters rows, not features: rows that the forest flags as
# anomalous on the numerical columns are dropped.
# random_state is fixed so the same rows are removed on every run; without it
# the forest is re-randomized each time and downstream results drift.
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# fit_predict returns 1 for inliers and -1 for outliers, one label per row.
outliers = iso_forest.fit_predict(df[numerical_cols])

# Keep inliers only. Taking an explicit copy (instead of adding a helper
# column and dropping it in place on a filtered slice) avoids pandas'
# SettingWithCopyWarning and leaves df as an independent frame.
df = df.loc[outliers == 1].copy()

# PPS Matrix
# Predictive Power Score for every (x, y) column pair of the cleaned frame,
# returned in long format with one row per pair.
pps_matrix = pps.matrix(df)
print("\nPPS Matrix:", pps_matrix, sep="\n")



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

Dataset Summary Statistics:
                 age workclass        fnlwgt 




PPS Matrix:
                     x                        y   ppscore            case  \
0                  age                      age  1.000000  predict_itself   
1                  age                   fnlwgt  0.000000      regression   
2                  age            education_num  0.000000      regression   
3                  age           marital_status  0.322879  classification   
4                  age               occupation  0.000000      regression   
...                ...                      ...       ...             ...   
1516  log_capital_gain   education_ Prof-school  0.000000      regression   
1517  log_capital_gain  education_ Some-college  0.000000      regression   
1518  log_capital_gain          work_experience  0.000000      regression   
1519  log_capital_gain          income_per_hour  0.000000      regression   
1520  log_capital_gain         log_capital_gain  1.000000  predict_itself   

      is_valid_score               metric  baseline_score   mo