### Library Setup

In [1]:
def separate():
    """Print a 40-character row of asterisks as a visual divider in cell output."""
    divider = "*" * 40
    print(divider)

# --- Data analysis and wrangling -------------------------------------------
# Each section prints a heading, imports its packages, and reports the
# installed version so the saved cell output records the environment used.
print("Data Analysis and Wrangling Packages")
import pandas as pd # Library for data processing and analysis.
print("- pandas version: {}". format(pd.__version__))
separate()

# --- Modelling --------------------------------------------------------------
print("Modelling Packages")
import sklearn # Collection of machine learning algorithms.
print("- scikit-learn version: {}". format(sklearn.__version__))
# Evaluation metric, train/test splitting and cross-validation helpers.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (train_test_split, KFold, cross_val_score)
# Not imported here: GridSearchCV, RandomizedSearchCV, StratifiedKFold, learning_curve.
# Models
# Tree-based and ensemble classifiers.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
# Instance-based, linear, probabilistic and kernel classifiers.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# Clustering estimators (unsupervised).
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, Birch, DBSCAN, KMeans, OPTICS
# Multi-layer perceptron classifier.
from sklearn.neural_network import MLPClassifier
separate()

# Track time taken to run (for modelling).
import time

Data Analysis and Wrangling Packages
- pandas version: 2.1.4
****************************************
Modelling Packages
- scikit-learn version: 1.3.0
****************************************


### Modelling Setup

In [2]:
# Seed shared by the train/test split and by the stochastic estimators below,
# so that "Restart Kernel -> Run All" reproduces the same scores.
SEED = 2024

# Acquiring data.
adult_data = pd.read_csv("./data/adult_processed.csv")

# X consists of all the features (target is excluded).
X = adult_data.drop(["income"], axis=1)

# y consists of the target column `income`.
y = adult_data["income"]

# Splitting the data 7:3.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Supervised classifiers that are compared by cross-validated accuracy.
# `random_state` is passed wherever the estimator's default configuration is
# stochastic; the remaining estimators are left at their defaults.
model_list = [
    ("Perceptron", Perceptron(random_state=SEED)),
    ("Random Forest", RandomForestClassifier(random_state=SEED)),
    ("Decision Tree", DecisionTreeClassifier(random_state=SEED)),
    ("Neural Network", MLPClassifier(random_state=SEED)),
    ("Adaptive Boosting", AdaBoostClassifier(random_state=SEED)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=SEED)),
    ("Logistic Regression", LogisticRegression()),
    ("K-Nearest Neighbours", KNeighborsClassifier()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Linear Support Vector", LinearSVC(random_state=SEED)),
    ("Bagging Classification", BaggingClassifier(random_state=SEED)),
    ("Support Vector Classifier", SVC()),
    ("Stochastic Gradient Descent", SGDClassifier(random_state=SEED)),
    ("Passive Aggressive Classifier", PassiveAggressiveClassifier(random_state=SEED)),
]

# Unsupervised clustering estimators, deliberately kept OUT of `model_list`:
# - They ignore the `income` labels during fitting, and the cluster ids they
#   emit are arbitrary, so classification accuracy against `income` is not a
#   meaningful score for them.
# - DBSCAN, OPTICS and AgglomerativeClustering expose no `predict` method, so
#   an accuracy scorer cannot score them at all.
# - NOTE(review): AffinityPropagation and OPTICS scale poorly with the number
#   of samples and are the likely cause of the interrupted training run --
#   confirm before evaluating them on the full training set.
# Evaluate these separately with a clustering metric if they are needed.
clustering_models = [
    ("BIRCH", Birch()),
    ("DBSCAN", DBSCAN()),
    ("OPTICS", OPTICS()),
    ("K-Means Clustering", KMeans()),
    ("Affinity Propagation", AffinityPropagation()),
    ("Agglomerative Clustering", AgglomerativeClustering()),
]

# List to store all performances of the models.
results = []

# Store the name of each model for visualization purposes.
model_names = []

### Training the 20 Models

In [3]:
# Reset the accumulators so this cell is idempotent: re-running it (for
# example after an interrupted run) must not append a second copy of every
# model's score to lists created in an earlier cell.
results = []
model_names = []

# Cross-validation, 5 folds. The splitter does not depend on the model, so it
# is built once instead of on every loop iteration.
kfold = KFold(n_splits=5)

# Start tracking time
start_time = time.time()

for name, model in model_list:
    # Standard of measure: "accuracy" (suitable for this classification problem).
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    # Append the mean accuracy over the 5 folds, as a percentage rounded to
    # two decimal places.
    results.append(round(cv_results.mean() * 100, 2))
    # Attach the model name (kept in step with `results`, index for index).
    model_names.append(name)

# Calculate time taken.
end_time = time.time()
elapsed_time = end_time - start_time

# Indication when training is complete.
print(f"Training complete. Time taken: {elapsed_time} seconds")

KeyboardInterrupt: 

### Scores and Decision

In [None]:
# One row per evaluated model: its display name and the score recorded for it
# during training.
compare_models = pd.DataFrame(data=dict(Model=model_names, Score=results))
# Display the table with the highest score first; `compare_models` itself
# keeps its original row order.
compare_models.sort_values('Score', ascending=False)

### Hyperparameter Tuning