In [None]:
# Part 2 - Classification

import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Silence every warning category for the remainder of the run
warnings.filterwarnings("ignore")

# Column layout applied to both CSV files; the names are passed explicitly
# to read_csv rather than being taken from a header row.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

data_directory = "Ass4Data/Part 2 - classification/"

# Read the training and test splits (the first row of the test file is skipped)
train_data = pd.read_csv(f"{data_directory}adult.data", sep=',', names=column_names)
test_data = pd.read_csv(f"{data_directory}adult.test", sep=',', names=column_names, skiprows=1)

# Encode the 'income' target as 0 / 1 in both frames. Surrounding whitespace is
# stripped first, and the label is accepted with or without a trailing period.
# Any value that is not a key of the mapping becomes NaN.
for frame in (train_data, test_data):
    frame['income'] = frame['income'].str.strip().map(
        {'<=50K': 0, '<=50K.': 0, '>50K': 1, '>50K.': 1}
    )

# Feature frames: every column except the 'income' target
X_train = train_data.drop('income', axis=1)
X_test = test_data.drop('income', axis=1)

# Standardize the numeric columns. The scaler is fitted on the training
# features only and then applied unchanged to both splits.
numeric_features = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
scaler = StandardScaler().fit(X_train[numeric_features])
for features in (X_train, X_test):
    features[numeric_features] = scaler.transform(features[numeric_features])

# Perform one-hot encoding for categorical columns
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was later removed,
# so `OneHotEncoder(sparse=False, ...)` raises TypeError on current releases.
# Leaving the output at its sparse default and densifying with `.toarray()`
# yields the same dense arrays on old and new versions alike.
encoder = OneHotEncoder(drop='first')  # Drop first column to avoid multicollinearity

# Categories are learned from the training split only; with the encoder's
# default settings a category seen only in the test split raises an error.
X_train_encoded = encoder.fit_transform(X_train[categorical_features]).toarray()
X_test_encoded = encoder.transform(X_test[categorical_features]).toarray()

# Combine encoded features with numeric features (encoded columns come first)
X_train = np.concatenate((X_train_encoded, X_train.drop(columns=categorical_features).values), axis=1)
X_test = np.concatenate((X_test_encoded, X_test.drop(columns=categorical_features).values), axis=1)

# Replace NaN entries with the per-column mean, using means learned from the
# training matrix for both splits.
# NOTE(review): the CSV files are read without `na_values`, so only pandas'
# default NA markers can show up as NaN here - confirm that this step has
# anything to impute for this dataset.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Target vectors: the already-encoded 'income' column of each split
y_train, y_test = train_data['income'], test_data['income']

# One (name, accuracy, precision, recall, f1, auc) tuple per classifier
results = []

# (display name, estimator) pairs, evaluated in this order
classifiers = [
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Support Vector Machine', SVC(probability=True)),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('Multi-layer Perceptron', MLPClassifier(max_iter=1000)),
    ('Logistic Regression', LogisticRegression()),
]

for name, clf in classifiers:
    # Train on the training split, then predict labels for the test split
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Label-based metrics, computed in the order they are stored in the row
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # AUC uses the second predict_proba column; estimators that do not
    # expose predict_proba get None instead
    auc = (
        roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        if hasattr(clf, 'predict_proba')
        else None
    )

    results.append((name, accuracy, precision, recall, f1, auc))

# Display results in a table
result_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC'])
print(result_df)

# Identify the best algorithm for each label-based metric. The tuple index is
# the metric's position in a result row (index 0 is the model name); AUC is
# not ranked.
best_algorithms = {
    metric: max(results, key=lambda row, position=position: row[position])[0]
    for position, metric in enumerate(['Accuracy', 'Precision', 'Recall', 'F1-Score'], start=1)
}

# Report Findings
print("Performance Metrics:")
header = ["Algorithm", "Accuracy", "Precision", "Recall", "F1-Score", "AUC"]
print("{:<35} {:<10} {:<10} {:<10} {:<10} {:<10}".format(*header))
for result in results:
    # The last field (AUC) may be None when a model has no probability
    # estimates; applying '{:.2f}' to None raises TypeError, so it is rendered
    # as text first. Numeric AUC values print exactly as before.
    *label_fields, auc_value = result
    auc_text = "N/A" if auc_value is None else "{:.2f}".format(auc_value)
    print("{:<35} {:.2f}       {:.2f}       {:.2f}       {:.2f}       {}".format(*label_fields, auc_text))

print("\nBest Algorithms:")
for metric, algorithm in best_algorithms.items():
    print(f"Best algorithm for {metric}: {algorithm}")