In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import numpy as np


In [2]:
# Load the dataset
df = pd.read_csv('/content/adult_with_headers.csv')


In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# Handle missing values (if any)
imputer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
df['age'] = imputer.fit_transform(df[['age']])

In [7]:
# Apply scaling techniques to numerical features
numerical_features = ['age', 'hours_per_week', 'capital_gain', 'capital_loss']

In [8]:
# Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[numerical_features] = scaler_standard.fit_transform(df[numerical_features])

In [9]:
# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_features] = scaler_minmax.fit_transform(df[numerical_features])

In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [11]:

# One-Hot Encoding for categorical variables with less than 5 categories
categorical_less_than_5 = ['sex', 'native_country']
one_hot = OneHotEncoder(drop='first', sparse_output=False)  # Updated parameter name
df_one_hot_encoded = pd.DataFrame(one_hot.fit_transform(df[categorical_less_than_5]),
                                   columns=one_hot.get_feature_names_out(categorical_less_than_5))

In [12]:
# creating instance of one-hot-encoder
OHE = OneHotEncoder()# Specifies the way unknown categories are handled during transform.

In [13]:
# Adding the one-hot encoded columns back to the original dataframe
df = pd.concat([df, df_one_hot_encoded], axis=1)
df.drop(columns=categorical_less_than_5, inplace=True)  # Dropping original columns after encoding

In [14]:
df_one_hot_encoded

Unnamed: 0,sex_ Male,native_country_ Cambodia,native_country_ Canada,native_country_ China,native_country_ Columbia,native_country_ Cuba,native_country_ Dominican-Republic,native_country_ Ecuador,native_country_ El-Salvador,native_country_ England,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32557,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32559,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
# Label Encoding for categorical variables with more than 5 categories
categorical_more_than_5 = ['occupation', 'workclass']
label_encoder = LabelEncoder()
df_label_encoded = df.copy()
for col in categorical_more_than_5:
    df_label_encoded[col] = label_encoder.fit_transform(df[col])


In [16]:
# Feature 1: Age Group (young, middle-aged, senior)
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 60, 100], labels=['young', 'middle-aged', 'senior'])


In [17]:
# Feature 2: Income per hour (assuming income is related to hours per week)
df['income_per_hour'] = df['capital_gain'] / df['hours_per_week']

In [18]:
# Apply transformation (Log transformation for skewed data)
df['capital_gain_log'] = df['capital_gain'].apply(lambda x: np.log1p(x))  # Log transform to reduce skewness

In [19]:
from sklearn.ensemble import IsolationForest

In [20]:
# Applying Isolation Forest to detect outliers
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df['outlier'] = iso_forest.fit_predict(df[numerical_features])

In [21]:
# Remove outliers (keeping only non-outliers)
df_clean = df[df['outlier'] == 1].drop(columns=['outlier'])

In [2]:
!pip install ppscore



In [12]:
import ppscore as pps
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Assuming 'numerical_feature' is the column you want to check for outliers
# Replace 'numerical_feature' with the actual column name
numerical_feature = 'age'

# Calculate z-scores for the numerical feature
z = np.abs(stats.zscore(df[numerical_feature]))

# Define a threshold for outlier detection (e.g., z-score > 3)
threshold = 3

# Create a new column 'outlier' and mark outliers as 1
df['outlier'] = np.where(z > threshold, 1, 0)

# Remove outliers (keeping only non-outliers)
df_clean = df[df['outlier'] == 0].drop(columns=['outlier'])

# Apply the PPS (Predictive Power Score)
pps_matrix = pps.matrix(df_clean)

In [19]:
# Apply the PPS (Predictive Power Score)
pps_matrix = pps.matrix(df_clean)


In [20]:
# Check if df_clean is defined correctly
if not df_clean.empty:
    # Compute PPS matrix
    pps_matrix = pps.matrix(df_clean)

    # Show important PPS relationships
    pps_high_score = pps_matrix[pps_matrix['ppscore'] > 0.2]
    print("\nHigh Predictive Power Score Relationships:")
    print(pps_high_score[['x', 'y', 'ppscore']])
else:
    print("The cleaned DataFrame is empty. Please check for outliers removal.")


High Predictive Power Score Relationships:
                  x               y   ppscore
0               age             age  1.000000
5               age  marital_status  0.318257
7               age    relationship  0.206589
16        workclass       workclass  1.000000
32           fnlwgt          fnlwgt  1.000000
48        education       education  1.000000
49        education   education_num  1.000000
59        education          income  0.230341
63    education_num       education  1.000000
64    education_num   education_num  1.000000
74    education_num          income  0.230341
75   marital_status             age  0.207451
80   marital_status  marital_status  1.000000
82   marital_status    relationship  0.411424
84   marital_status             sex  0.319634
91       occupation       workclass  0.230126
96       occupation      occupation  1.000000
99       occupation             sex  0.326918
110    relationship  marital_status  0.643609
112    relationship    relationship 

In [21]:
# Compare with correlation matrix
corr_matrix = df_clean.corr()
print("\nCorrelation Matrix:")
print(corr_matrix)


Correlation Matrix:
                     age    fnlwgt  education_num  capital_gain  capital_loss  \
age             1.000000 -0.075121       0.041005      0.079492      0.057136   
fnlwgt         -0.075121  1.000000      -0.043339      0.000459     -0.009475   
education_num   0.041005 -0.043339       1.000000      0.122713      0.080487   
capital_gain    0.079492  0.000459       0.122713      1.000000     -0.031598   
capital_loss    0.057136 -0.009475       0.080487     -0.031598      1.000000   
hours_per_week  0.082650 -0.019604       0.147309      0.078142      0.054863   

                hours_per_week  
age                   0.082650  
fnlwgt               -0.019604  
education_num         0.147309  
capital_gain          0.078142  
capital_loss          0.054863  
hours_per_week        1.000000  
