In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns


In [37]:

column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]


train_data = pd.read_csv(
    'adult.data',
    header=None,
    names=column_names,
    na_values=' ?',
    skipinitialspace=True
)


test_data = pd.read_csv(
    'adult.test',
    header=None,
    names=column_names,
    na_values=' ?',
    skipinitialspace=True,
    skiprows=1
)


data = pd.concat([train_data, test_data], ignore_index=True)

print(f"Total instances: {data.shape[0]}")
print(f"Total features: {data.shape[1]}")


Total instances: 48842
Total features: 15


In [41]:

print(data.head())


print(data.info())


print(data.describe(include='all'))


print("Missing values per column:")
print(data.isnull().sum())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [43]:

categorical_features = data.select_dtypes(include=['object']).columns.tolist()
categorical_features.remove('income')  # Exclude target variable
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


Categorical features: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
Numerical features: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [45]:

data['income'] = data['income'].apply(lambda x: x.strip().strip('.'))

label_encoder = LabelEncoder()
data['income'] = label_encoder.fit_transform(data['income'])

print(data['income'].value_counts())


income
0    37155
1    11687
Name: count, dtype: int64


In [47]:
X = data.drop('income', axis=1)
y = data['income']


In [49]:

train_size = 32561
test_size = 16281

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_test = X.iloc[train_size:train_size + test_size]
y_test = y.iloc[train_size:train_size + test_size]

print(f"Training instances: {X_train.shape[0]}")
print(f"Testing instances: {X_test.shape[0]}")


Training instances: 32561
Testing instances: 16281


In [51]:
from sklearn.linear_model import LogisticRegression


In [53]:
from sklearn.tree import DecisionTreeClassifier


In [55]:
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, solver='liblinear', random_state=42))
])


In [57]:
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])


In [59]:

logreg_param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2']
}


dt_param_grid = {
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__criterion': ['gini', 'entropy']
}


In [61]:

logreg_grid_search = GridSearchCV(
    logreg_pipeline,
    logreg_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

start_time = time.time()
logreg_grid_search.fit(X_train, y_train)
logreg_training_time = time.time() - start_time
print(f"Logistic Regression best params: {logreg_grid_search.best_params_}")
print(f"Logistic Regression training time: {logreg_training_time:.2f} seconds")


dt_grid_search = GridSearchCV(
    dt_pipeline,
    dt_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1
)

start_time = time.time()
dt_grid_search.fit(X_train, y_train)
dt_training_time = time.time() - start_time
print(f"Decision Tree best params: {dt_grid_search.best_params_}")
print(f"Decision Tree training time: {dt_training_time:.2f} seconds")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Logistic Regression best params: {'classifier__C': 1, 'classifier__penalty': 'l2'}
Logistic Regression training time: 11.84 seconds
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Decision Tree best params: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_split': 2}
Decision Tree training time: 9.91 seconds


In [63]:

start_time = time.time()
logreg_predictions = logreg_grid_search.predict(X_test)
logreg_prediction_time = time.time() - start_time


start_time = time.time()
dt_predictions = dt_grid_search.predict(X_test)
dt_prediction_time = time.time() - start_time

print(f"Logistic Regression prediction time: {logreg_prediction_time:.4f} seconds")
print(f"Decision Tree prediction time: {dt_prediction_time:.4f} seconds")


Logistic Regression prediction time: 0.0300 seconds
Decision Tree prediction time: 0.0325 seconds


In [65]:
def get_metrics(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_pred)
    }

logreg_metrics = get_metrics(y_test, logreg_predictions)
dt_metrics = get_metrics(y_test, dt_predictions)


In [67]:
import pandas as pd

results = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
    'Logistic Regression': list(logreg_metrics.values()),
    'Decision Tree': list(dt_metrics.values())
})

time_results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree'],
    'Training Time (s)': [logreg_training_time, dt_training_time],
    'Prediction Time (s)': [logreg_prediction_time, dt_prediction_time]
})

print("Performance Metrics:")
print(results)

print("\nTraining and Prediction Times:")
print(time_results)


Performance Metrics:
      Metric  Logistic Regression  Decision Tree
0   Accuracy             0.853019       0.860205
1  Precision             0.730124       0.760451
2     Recall             0.599324       0.595944
3   F1-Score             0.658289       0.668222
4    ROC-AUC             0.765404       0.768941

Training and Prediction Times:
                 Model  Training Time (s)  Prediction Time (s)
0  Logistic Regression          11.843578             0.030000
1        Decision Tree           9.912246             0.032521
