In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, make_scorer, recall_score, f1_score
from scipy.stats import randint
from imblearn.combine import SMOTETomek
from sklearn.feature_selection import RFECV

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

import joblib
import random
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 16, 25"

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == "cuda":
    print('GPU available')
else:
    print('GPU not available')
device

GPU available


device(type='cuda')

In [13]:
# Setup seed for Reproduciblity
seed = 42
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [14]:
task2_dev_set = "610_ps4_training/trainingT2HAR/har_train.csv"
task2_df = pd.read_csv(task2_dev_set, header=None)

In [15]:
column_names = task2_df.columns
label = column_names[-1]

# Everything except the last element are features
X = task2_df.drop(columns=[label])
y = task2_df[label] 

# Make sure all columns are numeric
X = X.apply(pd.to_numeric)

# change the data structure from pandas to numpy 
X_numpy_arr = X.to_numpy()
y_numpy_arr = y.to_numpy()

In [16]:
# Define and fit the model
def determine_important_features():
    # From bell curves of learning Curving 75 trees is the approximatied peak of the curve.
    num_of_trees = [75]    
    for i in num_of_trees:
        model = RandomForestClassifier(n_estimators = i, random_state=seed)
        model.fit(X, y)

        # Calulate relevant value of all data define as important value which max as 1 and min as 0 
        feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['importance']).sort_values('importance', ascending=False)
        print(feature_importances)

In [20]:
#Hard Coding important 34 features
important_features = [63, 6, 64, 46, 15, 68, 101, 132, 110, 7, 24, 94, 9, 151, 161, 36, 98, 119, 12, 65, 79, 43, 138, 178, 141, 30, 134, 148, 66, 133, 34, 19, 77, 169]
print(len(important_features))

34


In [21]:
# Getting the important feature columns only
X_important_features = X[important_features]

# changing the data structure from pandas to numpy 
X_numpy_arr = X_important_features.to_numpy()
y_numpy_arr = y.to_numpy()

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_numpy_arr, y_numpy_arr, test_size=0.2, random_state=seed, stratify=y_numpy_arr)

In [22]:
# Applying SMOTE-Tomek to the training data BALANCING
smote_tomek = SMOTETomek(random_state=seed)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

In [23]:
# Stratified K-Fold Cross-Validation setup
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

clf = RandomForestClassifier(random_state=seed)

# Use RFECV to select optimal number of features with cross-validation
rfecv = RFECV(estimator=clf, step=5, cv=StratifiedKFold(5), scoring='recall_macro')
models = []

In [34]:
# Cross-validation
for fold, (train_idx, valid_idx) in enumerate(kfold.split(X_train_resampled, y_train_resampled)):
    X_fold_train, X_fold_valid = X_train_resampled[train_idx], X_train_resampled[valid_idx]
    y_fold_train, y_fold_valid = y_train_resampled[train_idx], y_train_resampled[valid_idx]

    # print("before len(X_train_optimal):", len(X_fold_train))
    # print("before len(y_fold_train):", len(y_fold_train))

    # Fit RFECV to the data
    rfecv.fit(X_fold_train, y_fold_train)
    
    # IDENTIFY the Optimal number of features
    optimal_num_of_features = rfecv.n_features_
    print("Optimal number of features: %d" % optimal_num_of_features)
    # print("list of booleans:", rfecv.support_)
    
    # TRAIN your model with those features
    X_train_optimal = X_fold_train[:, rfecv.support_]  # Select the optimal features

    # print("len(X_train_optimal):", len(X_train_optimal))
    # print("len(y_fold_train):", len(y_fold_train))

    clf.fit(X_train_optimal, y_fold_train)
    
    # Training the model
    # clf.fit(X_fold_train, y_fold_train)
    
    # Predicting on the validation fold
    # y_pred = clf.predict(X_fold_valid)
    X_valid_optimal = X_fold_valid[:, rfecv.support_]  # Select the optimal features
    y_pred = clf.predict(X_valid_optimal)
    
    # Metrics
    accuracy = accuracy_score(y_fold_valid, y_pred)
    precision = precision_score(y_fold_valid, y_pred, average='macro')
    recall = recall_score(y_fold_valid, y_pred, average='macro')
    f1 = f1_score(y_fold_valid, y_pred, average='macro')
    
    print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Save metrics
    temp_list = [train_idx, valid_idx, recall, accuracy, f1]
    models.append(temp_list)

Optimal number of features: 34
Fold 1 - Accuracy: 0.9486, Precision: 0.9508, Recall: 0.9476, F1 Score: 0.9465
Optimal number of features: 34
Fold 2 - Accuracy: 0.9486, Precision: 0.9493, Recall: 0.9473, F1 Score: 0.9453
Optimal number of features: 34
Fold 3 - Accuracy: 0.9577, Precision: 0.9587, Recall: 0.9569, F1 Score: 0.9561
Optimal number of features: 29
Fold 4 - Accuracy: 0.9519, Precision: 0.9523, Recall: 0.9509, F1 Score: 0.9489
Optimal number of features: 34
Fold 5 - Accuracy: 0.9386, Precision: 0.9411, Recall: 0.9370, F1 Score: 0.9341


In [39]:
# We want the greatest recall value
recalls = []
# models is a list of lists
for i, list_of_metrics in enumerate(models):
    r = list_of_metrics[2]
    recalls.append(r)

index_max = np.argmax(recalls)
best_recall = recalls[index_max]
print("best recall:", best_recall)
print(index_max)

best recall: 0.9568863772193591
2


In [40]:
# models[index_max] = [train_idx, valid_idx, recall, accuracy, f1]
train_idx = models[index_max][0]
valid_idx = models[index_max][1]

In [42]:
# Evaluation on the test set
# y_test_pred = clf.predict(X_test)
X_fold_train, X_fold_valid = X_train_resampled[train_idx], X_train_resampled[valid_idx]
y_fold_train, y_fold_valid = y_train_resampled[train_idx], y_train_resampled[valid_idx]

# Fit RFECV to the data
rfecv.fit(X_fold_train, y_fold_train)

# IDENTIFY the Optimal number of features
optimal_num_of_features = rfecv.n_features_

In [43]:
# TRAIN your model with those features
X_train_optimal = X_fold_train[:, rfecv.support_]  # Select the optimal features
clf.fit(X_train_optimal, y_fold_train)

In [49]:
# VALIDATE
X_valid_optimal = X_fold_valid[:, rfecv.support_]  # Select the optimal features
y_pred = rf_model_loaded.predict(X_valid_optimal)

test_accuracy = accuracy_score(y_fold_valid, y_pred)
test_precision = precision_score(y_fold_valid, y_pred, average='macro')
test_recall = recall_score(y_fold_valid, y_pred, average='macro')
test_f1 = f1_score(y_fold_valid, y_pred, average='macro')

# Fold 3 - Accuracy: 0.9577, Precision: 0.9587, Recall: 0.9569, F1 Score: 0.9561
print(f"\nTest Set Evaluation - Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1 Score: {test_f1:.4f}")
#determine_important_features()


Test Set Evaluation - Accuracy: 0.9577, Precision: 0.9587, Recall: 0.9569, F1 Score: 0.9561


In [50]:
final_model_name = "task2_randomforest_smote_balanced_rfecv_model.joblib"
joblib.dump(clf, final_model_name)
# Load the saved model
rf_model_loaded = joblib.load(final_model_name)

In [51]:
print(f"Model was trained with {clf.n_features_in_} features.")
print(f"Prediction data has {X_numpy_arr.shape[1]} features.")

Model was trained with 34 features.
Prediction data has 34 features.
