In [21]:
# 1. Data Exploration and Preprocessing:
import pandas as pd
df = pd.read_csv("adult_with_headers.csv")
df 

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [22]:
# Conduct basic data exploration (summary statistics, missing values, data types):

# Summary statistics
print(df.describe())

# Missing values
print(df.isnull().sum())

# Data types
print(df.dtypes)

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  
age               0
workclass         0
fnlwgt            0
edu

In [23]:
print(df.dropna()) 

       age          workclass  fnlwgt    education  education_num  \
0       39          State-gov   77516    Bachelors             13   
1       50   Self-emp-not-inc   83311    Bachelors             13   
2       38            Private  215646      HS-grad              9   
3       53            Private  234721         11th              7   
4       28            Private  338409    Bachelors             13   
...    ...                ...     ...          ...            ...   
32556   27            Private  257302   Assoc-acdm             12   
32557   40            Private  154374      HS-grad              9   
32558   58            Private  151910      HS-grad              9   
32559   22            Private  201490      HS-grad              9   
32560   52       Self-emp-inc  287927      HS-grad              9   

            marital_status          occupation    relationship    race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1       Married-civ-spo

In [24]:
# Handle missing values as per the best practices:

df_dropped = df.dropna()

# Impute missing values with the mean (for numerical features)
numerical_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
for col in numerical_cols:
  df[col] = df[col].fillna(df[col].mean())

# Impute missing values with the mode (for categorical features)
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']
for col in categorical_cols:
  df[col] = df[col].fillna(df[col].mode()[0])

print(df_dropped.isnull().sum())
print(df.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [25]:
# Apply scaling techniques to numerical features:

# 1. Standard Scaling
X_cont = df[['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']]

from sklearn.preprocessing import StandardScaler, MinMaxScaler

SS = StandardScaler()
X_cont_SS = SS.fit_transform(X_cont)
X_cont_SS = pd.DataFrame(X_cont_SS)
X_cont_SS.columns = list(X_cont)
X_cont_SS.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [26]:
# 2. Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
MM = MinMaxScaler()
X_cont_MM = MM.fit_transform(X_cont) 
X_cont_MM = pd.DataFrame(X_cont_MM)
X_cont_MM.columns = list(X_cont)
X_cont_MM.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


In [27]:
# Discuss the scenarios where each scaling technique is preferred and why

# Standard Scaling:

# Preferred when the data follows a normal distribution or when algorithms assume a normal distribution (e.g., linear regression, logistic regression, support vector machines).
# Centers the data around zero with a standard deviation of one.
# Useful when features have different scales and you want to give them equal importance.
# Robust to outliers as it scales based on standard deviation.

# Min-Max Scaling:

# Preferred when the data does not necessarily follow a normal distribution.
# Scales the data to a specific range (usually between 0 and 1).
# Useful when you want to preserve the relative relationships between data points.
# Sensitive to outliers as it scales based on minimum and maximum values.

In [28]:
# 2. Encoding Techniques:
# Apply One-Hot Encoding to categorical variables with less than 5 categories

X_cate = df[['workclass', 'education', 'occupation', 'native_country', 'income']]

X_cate_dummies = pd.get_dummies(X_cate)
X_cate_dummies.head() 
list(X_cate_dummies)
X_cate_dummies.head() 

Unnamed: 0,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,...,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,income_ <=50K,income_ >50K
0,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,False
1,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,True,False,False,True,False
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [38]:
# Use Label Encoding for categorical variables with more than 5 categories.

from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

X_cate_2 = df[['income', 'workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']]

X_cate_2['income']=LE.fit_transform(X_cate_2['income'])
X_cate_2['workclass']=LE.fit_transform(X_cate_2['workclass'])
X_cate_2['education']=LE.fit_transform(X_cate_2['education'])
X_cate_2['marital_status']=LE.fit_transform(X_cate_2['marital_status'])
X_cate_2['occupation']=LE.fit_transform(X_cate_2['occupation'])
X_cate_2['relationship']=LE.fit_transform(X_cate_2['relationship'])
X_cate_2['race']=LE.fit_transform(X_cate_2['race'])
X_cate_2['sex']=LE.fit_transform(X_cate_2['sex'])
X_cate_2['native_country']=LE.fit_transform(X_cate_2['native_country'])

X_cate_2.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cate_2['income']=LE.fit_transform(X_cate_2['income'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cate_2['workclass']=LE.fit_transform(X_cate_2['workclass'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cate_2['education']=LE.fit_transform(X_cate_2['education'])
A value is trying to be s

Unnamed: 0,income,workclass,education,marital_status,occupation,relationship,race,sex,native_country
0,0,7,9,4,1,1,4,1,39
1,0,6,9,2,4,0,4,1,39
2,0,4,11,0,6,1,4,1,39
3,0,4,1,2,6,0,2,1,39
4,0,4,9,2,10,5,2,0,5


In [39]:
# Concatenation
X = pd.concat([X_cont_SS, X_cate_2], axis=1)
print(X.shape) 

Y = X["income"]
print(Y.shape)

(32561, 15)
(32561,)


In [40]:
X

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass,education,marital_status,occupation,relationship,race,sex,native_country
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,0,7,9,4,1,1,4,1,39
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153,0,6,9,2,4,0,4,1,39
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429,0,4,11,0,6,1,4,1,39
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429,0,4,1,2,6,0,2,1,39
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429,0,4,9,2,10,5,2,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409,0,4,7,2,13,5,4,0,39
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429,1,4,11,2,7,0,4,1,39
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429,0,4,11,6,1,4,4,0,39
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225,0,4,11,4,1,3,4,1,39


In [41]:
# Pros and Cons of One-Hot Encoding and Label Encoding

# 1. One-Hot Encoding:

# Pros:
# - Creates binary features for each category, avoiding ordinal relationships.
# - Suitable for nominal categorical variables where there is no inherent order.

# Cons:
# - Can significantly increase the dimensionality of the dataset.
# - Not suitable for ordinal categorical variables where order matters.

# 2. Label Encoding:

# Pros:
# - Does not increase the dimensionality of the dataset.
# - Suitable for ordinal categorical variables where order is important.

# Cons:
# - Can mislead some algorithms that assume equal intervals between encoded values.
# - Not suitable for nominal categorical variables where there is no inherent order.

In [42]:
# 3. Feature Engineering:
# Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices

# 1. Combined Capital:
df['combined_capital'] = df['capital_gain'] - df['capital_loss']

# Rationale: This feature combines the capital gain and loss into a single metric, representing the net capital change.

# 2. Hours per Week Category:
def categorize_hours(hours):
  if hours <= 30:
    return 'Part-time'
  elif hours <= 40:
    return 'Full-time'
  else:
    return 'Overtime'

df['hours_category'] = df['hours_per_week'].apply(categorize_hours)
df['hours_category']

# Rationale: Creating categories for hours worked per week might capture non-linear relationships with income.

0        Full-time
1        Part-time
2        Full-time
3        Full-time
4        Full-time
           ...    
32556    Full-time
32557    Full-time
32558    Full-time
32559    Part-time
32560    Full-time
Name: hours_category, Length: 32561, dtype: object

In [43]:
# Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice

# Check for skewness in numerical features
for col in numerical_cols:
  print(f"Skewness of {col}: {df[col].skew()}")

Skewness of age: 0.5587433694130484
Skewness of fnlwgt: 1.4469800945789826
Skewness of education_num: -0.3116758679102297
Skewness of capital_gain: 11.953847687699799
Skewness of capital_loss: 4.594629121679692
Skewness of hours_per_week: 0.22764253680450092


In [44]:
import numpy as np

# Applying log transformation to 'capital_gain' due to high skewness
df['capital_gain_log'] = np.log1p(df['capital_gain'])
df['capital_gain_log']

0        7.684784
1        0.000000
2        0.000000
3        0.000000
4        0.000000
           ...   
32556    0.000000
32557    0.000000
32558    0.000000
32559    0.000000
32560    9.617471
Name: capital_gain_log, Length: 32561, dtype: float64

In [45]:
# 4. Feature Selection:
# Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance

import numpy as np
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.01)  # Expect 1% of data to be outliers
outlier_predictions = iso_forest.fit_predict(X)

# Identify outliers (predicted as -1)
outlier_indices = np.where(outlier_predictions == -1)[0]

# Remove outliers from the dataset
X_outlier = X.drop(outlier_indices)
Y_outlier = Y.drop(outlier_indices)

In [46]:
# How Outliers Affect Model Performance:

# Impacts:
# - Bias: Outliers can significantly bias the model's parameters.
# - Overfitting: Models might overfit to outliers.
# - Reduced Accuracy: Outliers can lead to decreased accuracy and other performance metrics on unseen data.
# - Transformation: Apply transformations (e.g.,log transformation) to reduce the impact of extreme values.
# - Evaluation: Always evaluate the impact of outlier handling on model performance using appropriate metrics and validation techniques.

In [47]:
# Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix
#!pip install ppscore 

In [49]:
import ppscore as pps

# Calculate the Predictive Power Score matrix
pps_matrix = pps.matrix(X)
print(pps_matrix)

# Calculate the correlation matrix
corr_matrix = X.corr()
print(corr_matrix)

                  x               y  ppscore            case  is_valid_score  \
0               age             age      1.0  predict_itself            True   
1               age          fnlwgt      0.0      regression            True   
2               age   education_num      0.0      regression            True   
3               age    capital_gain      0.0      regression            True   
4               age    capital_loss      0.0      regression            True   
..              ...             ...      ...             ...             ...   
220  native_country      occupation      0.0      regression            True   
221  native_country    relationship      0.0      regression            True   
222  native_country            race      0.0      regression            True   
223  native_country             sex      0.0      regression            True   
224  native_country  native_country      1.0  predict_itself            True   

                  metric  baseline_scor

In [None]:
# Comparison:

# The Predictive Power Score (PPS) and correlation matrix provide different perspectives on feature relationships:
# - Correlation: Measures the linear association between two variables. It ranges from -1 (perfect negative correlation) to 1 (perfect positive correlation), with 0 indicating no linear relationship.
# - PPS: Quantifies the ability of one feature to predict another, considering both linear and non-linear relationships. It ranges from 0 (no predictive power) to 1 (perfect predictive power).

# Conclusion:
# The PPS provides a more comprehensive view of feature relationships compared to the correlation matrix, especially when dealing with non-linear dependencies. It can guide feature selection for machine learning models.