In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest


# Load the dataset
df = pd.read_csv('adult_with_headers.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [28]:
# Data Exploration
print(df.head())
print(df.info())
print(df.describe())
print(df.isnull().sum())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [29]:
# Handle Missing Values
# Replace '?' with NaN for proper handling
df.replace(' ?', np.nan, inplace=True)

In [30]:
# Identify numerical and categorical columns for separate imputation
numerical_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(exclude=np.number).columns

In [31]:
# Impute missing numerical values with the mean
numerical_imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])

In [32]:
# Impute missing categorical values with the mode
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

In [33]:
# Verify missing values
print("\nMissing values after imputation:\n", df.isnull().sum())


Missing values after imputation:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [34]:
# Apply Scaling Techniques
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

In [35]:
# Standard Scaling
std_scaler = StandardScaler()
df_std = df.copy()
df_std[numerical_features] = std_scaler.fit_transform(df_std[numerical_features])
print("\nDataframe after Standard Scaling:\n", df_std.head())


Dataframe after Standard Scaling:
         age          workclass    fnlwgt   education  education_num  \
0  0.030671          State-gov -1.063611   Bachelors       1.134739   
1  0.837109   Self-emp-not-inc -1.008707   Bachelors       1.134739   
2 -0.042642            Private  0.245079     HS-grad      -0.420060   
3  1.057047            Private  0.425801        11th      -1.197459   
4 -0.775768            Private  1.408176   Bachelors       1.134739   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country 

In [36]:
# Min-Max Scaling
min_max_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numerical_features] = min_max_scaler.fit_transform(df_minmax[numerical_features])
print("\nDataframe after Min-Max Scaling:\n", df_minmax.head())

# Discussion of Scaling Techniques
print("\nDiscussion of Scaling Techniques:")
print("Standard Scaling (Z-score normalization):")
print("- Centers the data around zero with a standard deviation of one.")
print("- Useful when features have different scales and the algorithm assumes normally distributed data (e.g., linear regression, logistic regression, k-nearest neighbors).")
print("- Sensitive to outliers.")

print("\nMin-Max Scaling:")
print("- Scales features to a specific range (usually 0 to 1).")
print("- Suitable for algorithms that are sensitive to feature scales but are not affected by the distribution (e.g., KNN, SVM, neural networks).")
print("- Sensitive to outliers; outliers can shrink the range of other values.")



Dataframe after Min-Max Scaling:
         age          workclass    fnlwgt   education  education_num  \
0  0.301370          State-gov  0.044302   Bachelors       0.800000   
1  0.452055   Self-emp-not-inc  0.048238   Bachelors       0.800000   
2  0.287671            Private  0.138113     HS-grad       0.533333   
3  0.493151            Private  0.151068        11th       0.400000   
4  0.150685            Private  0.221488   Bachelors       0.800000   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  

In [37]:
# Apply One-Hot Encoding to categorical variables with less than 5 categories.
# Use Label Encoding for categorical variables with more than 5 categories.
# Discuss the pros and cons of One-Hot Encoding and Label Encoding.

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Assuming 'df' is already loaded and preprocessed as in your provided code

# Identify categorical columns
categorical_cols = df.select_dtypes(exclude=np.number).columns

# Iterate through categorical columns
for col in categorical_cols:
    if df[col].nunique() < 5:  # Apply One-Hot Encoding
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        ohe_data = ohe.fit_transform(df[[col]])
        ohe_df = pd.DataFrame(ohe_data, columns=ohe.get_feature_names_out([col]))
        df = df.drop(col, axis=1)  # Drop original column
        df = pd.concat([df, ohe_df], axis=1) #Concatenate the encoded data to the original df
    else:  # Apply Label Encoding
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Display the head of the modified dataframe to see the changes.
print(df.head())

# Discussion of Encoding Techniques (as requested)
print("\nDiscussion of Encoding Techniques:")
print("One-Hot Encoding:")
print("- Converts categorical variables into numerical representations using binary vectors.")
print("- Creates a new binary feature for each category in the variable.")
print("- Pros: Avoids imposing ordinal relationships, works well with algorithms that assume numerical input.")
print("- Cons: Can lead to high dimensionality (many new features) if there are many categories, potentially causing performance issues for some algorithms.")

print("\nLabel Encoding:")
print("- Assigns a unique integer to each category in the variable.")
print("- Pros: Simple, reduces dimensionality.")
print("- Cons: May introduce ordinal relationships where none exist, potentially misleading some algorithms (e.g., linear models).")

    age  workclass    fnlwgt  education  education_num  marital_status  \
0  39.0          6   77516.0          9           13.0               4   
1  50.0          5   83311.0          9           13.0               2   
2  38.0          3  215646.0         11            9.0               0   
3  53.0          3  234721.0          1            7.0               2   
4  28.0          3  338409.0          9           13.0               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           0             1     4        2174.0           0.0            40.0   
1           3             0     4           0.0           0.0            13.0   
2           5             1     4           0.0           0.0            40.0   
3           5             0     2           0.0           0.0            40.0   
4           9             5     2           0.0           0.0            40.0   

   native_country  sex_ Female  sex_ Male  income_ <=50K  income_ >5

In [38]:
# Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.
# Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.

# Feature Engineering: Create new features
# 1. Age Group:  Categorize age into groups (e.g., young, middle-aged, senior)
# Rationale: Age might have non-linear relationships with income. Grouping can capture these better.
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 50, 100], labels=['Young', 'Middle-aged', 'Senior'])

# 2. Capital Total: Combine capital gain and capital loss.
# Rationale:  They represent financial gains and losses, so a combined measure might be more informative.
df['capital_total'] = df['capital_gain'] - df['capital_loss']

# Transformation: Apply log transformation to 'capital_total'
# Rationale: 'capital_gain' and 'capital_loss' are highly skewed, log transform will reduce the skew.
df['capital_total_log'] = np.log1p(df['capital_total']) #Using log1p to handle zeros

print(df.head())

    age  workclass    fnlwgt  education  education_num  marital_status  \
0  39.0          6   77516.0          9           13.0               4   
1  50.0          5   83311.0          9           13.0               2   
2  38.0          3  215646.0         11            9.0               0   
3  53.0          3  234721.0          1            7.0               2   
4  28.0          3  338409.0          9           13.0               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           0             1     4        2174.0           0.0            40.0   
1           3             0     4           0.0           0.0            13.0   
2           5             1     4           0.0           0.0            40.0   
3           5             0     2           0.0           0.0            40.0   
4           9             5     2           0.0           0.0            40.0   

   native_country  sex_ Female  sex_ Male  income_ <=50K  income_ >5

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [45]:
# Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.
# Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.

from sklearn.ensemble import IsolationForest



# Transformation: Apply log transformation to 'capital_total'
# Rationale: 'capital_gain' and 'capital_loss' are highly skewed, log transform will reduce the skew.
# Handle potential negative values by taking absolute value before log
df['capital_total_log'] = np.log1p(np.abs(df['capital_total']))  

# Outlier Detection and Removal using Isolation Forest
# Select numerical features, including the newly engineered ones
numerical_features = df.select_dtypes(include=np.number).columns
# Impute NaN values in numerical features with the median to avoid influence of outliers
imputer = SimpleImputer(strategy='median')
df[numerical_features] = imputer.fit_transform(df[numerical_features])
# Now apply IsolationForest
model = IsolationForest(contamination=0.05)  # Adjust contamination as needed
df['outlier'] = model.fit_predict(df[numerical_features]) #Applying on numerical features

# Remove outliers (outlier == -1)
df = df[df['outlier'] == 1]
df.drop('outlier', axis=1, inplace=True)

print("\nDataframe after outlier removal:\n", df.head())

# Discussion on outliers
print("\nDiscussion on Outliers:")
print("Outliers can significantly affect model performance in several ways:")
print("- Skewing model parameters: They can pull regression lines or decision boundaries away from the true underlying patterns in the data.")
print("- Increasing variance: They can inflate the variance of the data, making it harder for models to identify meaningful relationships.")
print("- Misleading model assumptions: Many statistical models assume normally distributed data; outliers violate this assumption.")
print("Isolation Forest is an effective method to detect outliers by isolating anomalies using random partitioning.")

# # PPS (Predictive Power Score)
# matrix_pps = pps.matrix(df)
# print("\nPPS Matrix:")
# print(matrix_pps)

# # Correlation Matrix
# correlation_matrix = df.corr()
# print("\nCorrelation Matrix:")
# print(correlation_matrix)

print("\nComparison of PPS and Correlation:")
print("PPS measures the predictive power of one variable on another, considering non-linear relationships.")
print("Correlation measures the linear relationship between two variables.")
print("PPS can uncover relationships that correlation might miss (e.g., non-linear patterns).")
print("Comparing the two matrices side by side gives a more holistic understanding of feature relationships.")


Dataframe after outlier removal:
     age  workclass    fnlwgt  education  education_num  marital_status  \
0  39.0        6.0   77516.0        9.0           13.0             4.0   
1  50.0        5.0   83311.0        9.0           13.0             2.0   
2  38.0        3.0  215646.0       11.0            9.0             0.0   
3  53.0        3.0  234721.0        1.0            7.0             2.0   
5  37.0        3.0  284582.0       12.0           14.0             2.0   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0         0.0           1.0   4.0        2174.0           0.0            40.0   
1         3.0           0.0   4.0           0.0           0.0            13.0   
2         5.0           1.0   4.0           0.0           0.0            40.0   
3         5.0           0.0   2.0           0.0           0.0            40.0   
5         3.0           5.0   4.0           0.0           0.0            40.0   

   native_country  sex_ Female  s

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('outlier', axis=1, inplace=True)
