<a href="https://colab.research.google.com/github/Ranjith-A13/Python/blob/main/EDA2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:


import pandas as pd


import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest

import ppscore as pps

# Load dataset
# NOTE(review): the recorded output of this cell shows every listed column as
# fully non-null, so the later dropna() removes nothing. If this CSV marks
# unknown values with a placeholder string (e.g. "?"), pass `na_values=` here
# so they are parsed as NaN -- confirm against the file.
df = pd.read_csv("adult_with_headers.csv")
print(df.columns.tolist())

# Basic data exploration
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())


# Cleaning + derived numeric columns, built as one chained expression:
#   1. drop every row that contains a missing value;
#   2. add two rescaled copies of `age` (standardised and min-max scaled);
#   3. add the capital gain/loss sum and the log1p of capital gain.
# The scalers keep their own names so the fitted objects stay available
# after this cell.
scaler_std, scaler_mm = StandardScaler(), MinMaxScaler()
df = (
    df.dropna()
    .assign(
        age_std=lambda frame: scaler_std.fit_transform(frame[['age']]),
        age_mm=lambda frame: scaler_mm.fit_transform(frame[['age']]),
        capital_total=lambda frame: frame['capital_gain'] + frame['capital_loss'],
        log_capital_gain=lambda frame: np.log1p(frame['capital_gain']),
    )
)

# Encoding
#   * `sex` (assumed to have fewer than 5 categories) is one-hot encoded:
#     get_dummies replaces the column with one indicator column per category.
#   * `education` (assumed to have 5 or more categories) is label encoded into
#     a new `education_encoded` column; the original column is kept.
# NOTE(review): scikit-learn documents LabelEncoder as a target-label encoder;
# the integer codes carry no ordinal meaning -- confirm that is acceptable
# for how `education_encoded` is used downstream.
le = LabelEncoder()
df = pd.get_dummies(data=df, columns=['sex']).assign(
    education_encoded=lambda frame: le.fit_transform(frame['education'])
)






['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 no

In [10]:
# Feature engineering: three derived columns.
#   capital_total    - capital_gain plus capital_loss
#   age_bucket       - age binned into (0, 30], (30, 50], (50, 100]
#   log_capital_gain - log(1 + capital_gain)
# capital_total and log_capital_gain are also computed in the preprocessing
# cell above; they are recomputed here with the same formulas.
df = df.assign(
    capital_total=df['capital_gain'] + df['capital_loss'],
    age_bucket=pd.cut(
        df['age'],
        bins=[0, 30, 50, 100],
        labels=['Young', 'Middle', 'Old'],
    ),
    log_capital_gain=np.log1p(df['capital_gain']),
)


# Outlier detection using Isolation Forest
# The forest is fitted on the dataset's original numeric measurements only.
# Selecting every numeric dtype also picked up the derived columns created
# earlier (age_std and age_mm are rescalings of age; capital_total and
# log_capital_gain are built from capital_gain / capital_loss;
# education_encoded is an integer label code). Those duplicates over-weighted
# the underlying features in the forest's random feature sampling and made the
# selected column set depend on the dtype pandas assigns to one-hot columns.
outlier_features = [
    'age', 'fnlwgt', 'education_num',
    'capital_gain', 'capital_loss', 'hours_per_week',
]
iso = IsolationForest(contamination=0.01, random_state=42)

# fit_predict labels inliers 1 and outliers -1; keep only the inliers.
# Filtering with a mask avoids adding and then dropping a temporary column.
inlier_mask = iso.fit_predict(df[outlier_features]) != -1
df = df.loc[inlier_mask].copy()
# NOTE(review): `df` is rebound in place, so re-running this cell without
# re-running the cells above flags and removes a further share of rows.

# Predictive Power Score for the column pairs of `df`, printed with the
# highest scores first.
pps_matrix = pps.matrix(df)
print(pps_matrix.sort_values('ppscore', ascending=False))

# Pairwise correlation (DataFrame.corr with its default method), restricted
# to the numeric columns because non-numeric columns cannot be correlated.
numeric_df = df.select_dtypes(include='number')
print(numeric_df.corr())



                     x                  y  ppscore            case  \
483         age_bucket         age_bucket      1.0  predict_itself   
0                  age                age      1.0  predict_itself   
460  education_encoded  education_encoded      1.0  predict_itself   
444  education_encoded      education_num      1.0      regression   
23           workclass          workclass      1.0  predict_itself   
..                 ...                ...      ...             ...   
231       capital_loss     hours_per_week      0.0      regression   
232       capital_loss     native_country      0.0  classification   
234       capital_loss            age_std      0.0      regression   
235       capital_loss             age_mm      0.0      regression   
83           education   log_capital_gain      0.0      regression   

     is_valid_score               metric  baseline_score  model_score  \
483            True                 None        0.000000     1.000000   
0            