In [13]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
# '?' is read as a missing value; skipinitialspace drops the blanks that
# follow each delimiter so the '?' sentinel and the labels are matched cleanly.
url = '../files/adult_dataset.csv'
df = pd.read_csv(url, na_values='?', skipinitialspace=True)

# Step m: Data Cleaning
df = df.dropna()  # Remove rows containing any missing value
# Keep only the rows whose numeric fields are all non-negative.
df = df[df.select_dtypes(include=[np.number]).ge(0).all(axis=1)]

# Step n: Outlier Detection & Removal (Z-Score method)
from scipy.stats import zscore

# Absolute z-score of every numeric cell, as a plain float array so the
# mask below behaves the same regardless of what container zscore returns.
z_scores = np.abs(
    np.asarray(zscore(df.select_dtypes(include=[np.number])), dtype=float)
)
# A zero-variance column yields NaN z-scores, and `NaN < 3` is False, which
# would discard every row. A constant column has no outliers, so NaN is
# treated as "within the limit".
within_limit = (z_scores < 3) | np.isnan(z_scores)
# Keep a row only when all of its numeric fields are within 3 std devs.
df = df[within_limit.all(axis=1)]

# Step o: Data Transformation
# df arrives here as a filtered selection of the loaded frame; take an
# explicit copy so the column assignments below write to an independent
# frame instead of triggering pandas' chained-assignment warning.
df = df.copy()

# Encode categorical variables
# Every object-dtype column (the 'income' target included, when it is stored
# as text) is replaced by integer codes. The fitted encoders are kept per
# column so the codes can be mapped back with inverse_transform.
cat_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split features and labels
X = df.drop('income', axis=1)
y = df['income']

# Train-test split
# Split BEFORE scaling: fitting the scaler on the full data would let the
# test rows influence the mean/variance used to transform the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
# The scaler learns its statistics from the training portion only and the
# same transform is then applied to the test portion. `df` and `X` keep
# their unscaled values; only X_train / X_test are standardized.
scaler = StandardScaler()
num_cols = X.select_dtypes(include=[np.number]).columns
X_train = X_train.copy()
X_test = X_test.copy()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Step p: Model Building
# Two classifiers are fitted on the same training split and scored on the
# same held-out split so their numbers are directly comparable.

# Logistic Regression
lr_model = LogisticRegression().fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# Naive Bayes
nb_model = GaussianNB().fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)

# Print results
# One section per model: heading, raw accuracy, then the per-class report.
model_reports = (
    ("=== Logistic Regression ===", lr_acc, lr_preds),
    ("=== Naive Bayes ===", nb_acc, nb_preds),
)
for heading, accuracy, predictions in model_reports:
    print(heading)
    print("Accuracy:", accuracy)
    print(classification_report(y_test, predictions))

# Accuracy Comparison
# Same accuracies again, side by side as percentages.
print("Logistic Regression Accuracy: {:.2f}%".format(100 * lr_acc))
print("Naive Bayes Accuracy: {:.2f}%".format(100 * nb_acc))


=== Logistic Regression ===
Accuracy: 0.8215968112090832
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      6389
           1       0.68      0.41      0.51      1890

    accuracy                           0.82      8279
   macro avg       0.76      0.68      0.70      8279
weighted avg       0.81      0.82      0.80      8279

=== Naive Bayes ===
Accuracy: 0.8011837178403188
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      6389
           1       0.55      0.71      0.62      1890

    accuracy                           0.80      8279
   macro avg       0.73      0.77      0.74      8279
weighted avg       0.83      0.80      0.81      8279

Logistic Regression Accuracy: 82.16%
Naive Bayes Accuracy: 80.12%
