### 6.2.0. Library Setup

In [16]:
def separate():
    print('*' * 40)

print("Data Analysis and Wrangling Packages")
import pandas as pd # Library for data processing and analysis.
print("- pandas version: {}". format(pd.__version__))
separate()

print("Modelling Packages")
import sklearn # Collection of machine learning algorithms.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
import joblib # To export model.
print("- scikit-learn version: {}". format(sklearn.__version__))

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.cluster import AffinityPropagation, Birch, DBSCAN, KMeans
from sklearn.neural_network import MLPClassifier
separate()

# Ignore depreciation warnings.
import warnings
warnings.filterwarnings('ignore') 

# Track time taken to run (for modelling).
import time

Data Analysis and Wrangling Packages
- pandas version: 2.1.4
****************************************
Modelling Packages
- scikit-learn version: 1.3.0
****************************************


### 6.2.1. Modelling Setup

In [2]:
# Aquiring data.
adult_data = pd.read_csv("./data/adult_processed.csv") 

# X consists of all the features (target is excluded).
X = adult_data.drop(["income"], axis=1) 

# y consists of the target column `income`.
y = adult_data["income"] 

# Splitting the data 7:3.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.3, random_state = 2024) 

# All 17 machine learning algorithms.
model_list = [
    ("BIRCH", Birch()),
    ("Perceptron", Perceptron()),
    ("Random Forest", RandomForestClassifier()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Neural Network", MLPClassifier()),
    ("Adaptive Boosting", AdaBoostClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("K-Means Clustering", KMeans()),
    ("Logistic Regression", LogisticRegression()),
    ("K-Nearest Neighbours", KNeighborsClassifier()),
    ("Affinity Propagation", AffinityPropagation()),
    ("Gaussian Naive Bayes", GaussianNB()), 
    ("Linear Support Vector", LinearSVC()),  
    ("Bagging Classification", BaggingClassifier()),
    ("Support Vector Classifier", SVC()),
    ("Stochastic Gradient Descent", SGDClassifier()),
    ("Passive Aggressive Classifier", PassiveAggressiveClassifier()) 
]

# List to store all performances of the models.
results = []

# Store the name of each model for visualization purposes.
model_names = [] 

### 6.2.2. Training the 17 Models

In [3]:
# Start tracking time
start_time = time.time()

for name, model in model_list:
    # Cross-validation, 5 folds.
    kfold = KFold(n_splits=5) 
    # Standard of measure: "accuracy" (suitable for this classification problem).
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy') 
    # Append the mean of 10 folds for each model.
    results.append(round(cv_results.mean() * 100, 2)) 
    # Attach the model name.
    model_names.append(name) 

# Calculate time taken.
end_time = time.time()
elapsed_time = end_time - start_time

# Indication when training is complete.
print(f"Training complete. Time taken: {elapsed_time} seconds")

Training complete. Time taken: 3783.7183842658997 seconds


### 6.2.3. Scores and Decision

In [6]:
compare_models = pd.DataFrame({
    'Model': model_names,
    'Score': results})
compare_models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
6,Gradient Boosting,84.26
5,Adaptive Boosting,84.21
4,Neural Network,84.07
14,Support Vector Classifier,83.91
12,Linear Support Vector,83.61
8,Logistic Regression,83.58
2,Random Forest,83.05
13,Bagging Classification,82.55
15,Stochastic Gradient Descent,82.45
3,Decision Tree,82.08


#### Observations:

- Out of 17 models tested, 12 achieved an accuracy of at least 80% on our binary classification task, which is considered exemplary.
- Although the Perceptron and Passive Aggressive Classifier performed well, achieving accuracy scores in the 70s range, they fell slightly short of the 80% mark.
- The BIRCH model's accuracy of 40% indicates that it technically performs worse than a coin-flip, although it's worth noting that the binary target distribution tends to be skewed toward 0 rather than 1.
- The K-Means Clustering and Affinity Propagation models yielded particularly low accuracies, suggesting that these models are not suitable for our specific problem and dataset.

#### Due to resource constraints, we will only conduct GridSearch on the best-performing model: Gradient Boosting.

Note: While we are using GridSearch, the number of iterations will be limited to approximately 400. Consider this a proof-of-concept, but we will select appropriate parameters and ranges.

### 6.3.1. GridSearch Setup

In [7]:
# The best performing algorithm. 
gbc = GradientBoostingClassifier(random_state=2048) 

param_grid = { 
    'n_estimators': [100, 200, 300],  # Number of trees in the forest (higher will increase accuracy, but risk overfitting).
    'learning_rate': [0.05, 0.1, 0.2],  # Learning rate (lower requires more trees to capture data relationships but can help prevent overfitting
    'max_depth': [3, 4, 5],  # Maximum depth of the trees (Deeper trees can capture more complex patterns but are also more prone to overfitting).
    'min_samples_split': [2, 3, 4]  # Minimum number of samples required to split a node (higher results in simpler trees and  less overfitting).
}

# Number of iterations: 3 (n_estimators) * 3 (learning_rate) * 3 (max_depth) * 3 (min_samples_split) * 5 (CV) = 405 iterations.
grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

### 6.3.2. Running GridSearch

In [8]:
# Calculate time taken.
end_time = time.time()
elapsed_time = end_time - start_time

# Perform GridSearch.
grid_search.fit(X_train, Y_train)

# Indication when training is complete.
print(f"Training complete. Time taken: {elapsed_time} seconds")

Training complete. Time taken: 3926.420565843582 seconds


### 6.3.3. GridSearch Results

In [10]:
best_params = grid_search.best_params_
print("Best Parameters: ", best_params)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print("Accuracy: ", accuracy)

Best Parameters:  {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 4, 'n_estimators': 100}
Accuracy:  0.8483810365786275


In [None]:
## YAP

### 6.4. Optimal Parameters vs Base Model

In [22]:
# Train a new base Gradient Boosting Model.
gbc = GradientBoostingClassifier()
gbc.fit(X_train, Y_train)
acc_gbc = round(gbc.score(X_train, Y_train) * 100, 2)
print("Base Model: ", acc_gbc)


# Train a new Gradient Boosting Model with the best parameters from GridSearchCV.
gbc_gs = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, min_samples_split=4, n_estimators=100)
gbc_gs.fit(X_train, Y_train)
acc_gbc_gs = round(gbc_gs.score(X_train, Y_train) * 100, 2)
print("GridSearch-ed Parameters: ", acc_gbc_gs)

Base Model:  84.66
GridSearch-ed Parameters:  84.96


#### YAP

### 6.5. Export Model

In [18]:
joblib.dump(gbc_gs, "models/trained_adult_income_classifier.pkl") 

['models/trained_adult_income_classifier.pkl']

### 6.Sample Prediction

In [31]:
sample_data = [ # Row 30161, actual `income` => 1.
    4, # `age`
    2,	# `workclass`
    0,	# `education-num`
    1,	# `marital-status`
    4,	# `occupation`
    1,	# `relationship`
    1,	# `race`
    1,	# `sex`
    2,	# `capital-gain`
    0,	# `capital-loss`
    2,	# `hours-per-week`
    1,   # `native-country`
]

sample_data = [sample_data]

# Load model.
model = joblib.load('models/trained_adult_income_classifier.pkl') 

# Result.
result = model.predict(sample_data)
print(result)

[1]


### End of Modelling