# Assignment 12

**EDA2**

In [25]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler

In [37]:
# Read the CSV from the working directory into the frame used by every later cell
DATASET_PATH = 'adult_with_headers.csv'
data = pd.read_csv(DATASET_PATH)

In [40]:
# Preview the first five rows of the freshly loaded frame
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [49]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns
numeric_summary = data.describe()
numeric_summary

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


Use `describe(include=['O'])` on the categorical features to get their counts, number of unique values, most frequent value, and the frequency of that value.


In [50]:
# Same summary restricted to the text (object-dtype) columns
data.describe(include=['object'])

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [42]:
# Number of NaN entries per column
data.isna().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


**Handling Missing Values:**

Removing and Imputation

In [45]:
# Impute missing values for numerical and categorical features.
# The column groups are read straight from the current frame, so they always
# match data.columns and need no further reconciliation.
num_features = data.select_dtypes(include=['int64', 'float64']).columns
cat_features = data.select_dtypes(include=['object']).columns

# Unfitted transformer describing the imputation policy; kept so it can be
# reused as a step of a modelling Pipeline.
imputer = ColumnTransformer([
    ('num_imputer', SimpleImputer(strategy='median'), num_features),
    ('cat_imputer', SimpleImputer(strategy='most_frequent'), cat_features)
])

# Actually fill the gaps in `data` (previously the imputer was built but never
# applied). Only columns that contain NaN are rewritten, so complete columns
# are left untouched and the cell is a no-op on a frame without missing values.
for columns, strategy in ((num_features, 'median'), (cat_features, 'most_frequent')):
    incomplete = [col for col in columns if data[col].isna().any()]
    if incomplete:
        data[incomplete] = SimpleImputer(strategy=strategy).fit_transform(data[incomplete])

**Scaling Numerical Features:**

*Standard scaling is recommended when the data follows an approximately normal (Gaussian) distribution. It works well with methods such as linear discriminant analysis, logistic regression, and linear regression, which assume or benefit from centred features on a comparable scale.*

Standard scaling centers the data by subtracting the mean and scales it by the standard deviation, $z = (x - \mu) / \sigma$, resulting in a distribution with a mean of 0 and a standard deviation of 1.

In [51]:
# Standardise age: learn its mean and standard deviation, then apply them
age_values = data[['age']]
scaler = StandardScaler().fit(age_values)
data['age_standard_scaled'] = scaler.transform(age_values)

*Min-max scaling is effective where the data fits inside a specific range and does not follow a normal distribution. It is often used with algorithms such as k-nearest neighbors and neural networks, which do not assume any particular distribution of the data.*


Min-max scaling transforms each feature to a given range, typically between 0 and 1: $x' = (x - x_{\min}) / (x_{\max} - x_{\min})$.

In [52]:
# Rescale age into MinMaxScaler's default [0, 1] range.
# ravel() flattens the (n_samples, 1) result so a plain 1-D column is assigned.
scaler = MinMaxScaler()
data['age_min_max_scaled'] = scaler.fit_transform(data[['age']]).ravel()

**Encoding Techniques:**

One-Hot Encoding:

In [53]:
# One-hot encode the listed columns; drop_first removes the first level of each
# so the remaining indicator columns are not perfectly collinear
one_hot_columns = ['sex', 'marital_status']
data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)

In [54]:
# Preview the first five rows after scaling and one-hot encoding
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,occupation,relationship,race,capital_gain,capital_loss,...,income,age_standard_scaled,age_min_max_scaled,sex_ Male,marital_status_ Married-AF-spouse,marital_status_ Married-civ-spouse,marital_status_ Married-spouse-absent,marital_status_ Never-married,marital_status_ Separated,marital_status_ Widowed
0,39,State-gov,77516,Bachelors,13,Adm-clerical,Not-in-family,White,2174,0,...,<=50K,0.030671,0.30137,True,False,False,False,True,False,False
1,50,Self-emp-not-inc,83311,Bachelors,13,Exec-managerial,Husband,White,0,0,...,<=50K,0.837109,0.452055,True,False,True,False,False,False,False
2,38,Private,215646,HS-grad,9,Handlers-cleaners,Not-in-family,White,0,0,...,<=50K,-0.042642,0.287671,True,False,False,False,False,False,False
3,53,Private,234721,11th,7,Handlers-cleaners,Husband,Black,0,0,...,<=50K,1.057047,0.493151,True,False,True,False,False,False,False
4,28,Private,338409,Bachelors,13,Prof-specialty,Wife,Black,0,0,...,<=50K,-0.775768,0.150685,False,False,True,False,False,False,False


Label Encoding:

**Pros and Cons of Encoding Techniques:**

One-Hot Encoding:

Pros:

*Makes no assumption about an ordinal relationship between categories.
Prevents the model from inferring an ordering that does not exist.*

Cons:

*The dataset's dimensionality increases, which may lead to the "curse of dimensionality." This can result in sparse data, especially when high-cardinality attributes are encoded.*

Label Encoding:


Pros:

*Simple and efficient, with little computational overhead.
Maintains the dimensionality of the dataset.*

Cons:

*Implies an ordinal relationship among categories that may not exist, which can cause the model to make inaccurate assumptions.
Not suitable for nominal categorical variables that have no inherent order.*

In [79]:
# NOTE(review): X and y are not defined in any cell visible here — confirm they
# are created before this cell when the notebook is run on a fresh kernel.

# Label-encode every remaining text column so the selector receives numeric input.
categorical_features = X.select_dtypes(include=['object']).columns

# One fitted encoder per column is kept so the integer codes can be mapped back
# to the original categories later (a single shared encoder only remembers the
# last column it was fitted on).
label_encoders = {}
for feature in categorical_features:
    label_encoder = LabelEncoder()
    X[feature] = label_encoder.fit_transform(X[feature])
    label_encoders[feature] = label_encoder

# Now apply SelectKBest. mutual_info_classif is a randomised estimator, so its
# seed is pinned (42, the value used for the split and the classifier) to make
# the selected feature set reproducible across runs.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=10)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['age', 'education_num', 'occupation', 'relationship', 'capital_gain',
       'age_standard_scaled', 'age_min_max_scaled',
       'marital_status_ Married-civ-spouse', 'marital_status_ Never-married',
       'education_hours_interaction'],
      dtype='object')


In [82]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
split_parts = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [83]:
# Initialize and train the classifier on the selected features.
# The seed is fixed so the fitted forest is the same on every run.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model. classification_report returns a pre-formatted string,
# so it is printed rather than left as the cell's last expression.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       <=50K       0.89      0.90      0.89      4903
        >50K       0.63      0.61      0.62      1416

    accuracy                           0.83      6319
   macro avg       0.76      0.75      0.76      6319
weighted avg       0.83      0.83      0.83      6319

