## We will train the Random Forest Classifier model in 3 different ways
1. Unweighted
2. Class-Weighted
3. SMOTE-Augmented

### Unweighted Random Forest Classifier

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# To maintain uniformity across multiple iterations
RANDOM_SEED = 42

In [3]:
pos_df = pd.read_csv("../dataset/combined_extracted/custom_feature/positive.csv")
neg_df = pd.read_csv("../dataset/combined_extracted/custom_feature/negative.csv")

pos_df['label'] = 1
neg_df['label'] = 0

In [4]:
# Dividing the dataset in 3 parts - Train set, Validation set ans Test set

full_df = pd.concat([pos_df, neg_df], ignore_index=True)

train_df, temp_df = train_test_split(
    full_df, test_size=0.3, stratify=full_df['label'], random_state=RANDOM_SEED
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['label'], random_state=RANDOM_SEED
)

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")
base = "../dataset/model_data"
os.makedirs(base, exist_ok=True)
train_df.to_csv(os.path.join(base, "train_set.csv"), index=False)
val_df.to_csv(os.path.join(base, "validation_set.csv"), index=False)
test_df.to_csv(os.path.join(base, "test_set.csv"), index=False)

Train: 3134, Validation: 672, Test: 672


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [7]:
# Training the model

features = ["hankel_mean", "hankel_std", "hankel_skew", "hankel_kurtosis", "hankel_peak"]

X_train = train_df[features]
y_train = train_df["label"]

clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

train_preds = clf.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, train_preds))

Training Accuracy: 1.0


In [9]:
# Validating and fine tuning

X_val = val_df[features]
y_val = val_df["label"]

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_SEED),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Validation Accuracy: 0.9122023809523809
              precision    recall  f1-score   support

           0       0.81      0.32      0.46        78
           1       0.92      0.99      0.95       594

    accuracy                           0.91       672
   macro avg       0.86      0.66      0.71       672
weighted avg       0.90      0.91      0.89       672



In [10]:
# Testing

X_test = test_df[features]
y_test = test_df["label"]

test_preds = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds))


Test Accuracy: 0.8913690476190477
              precision    recall  f1-score   support

           0       0.59      0.25      0.35        79
           1       0.91      0.98      0.94       593

    accuracy                           0.89       672
   macro avg       0.75      0.61      0.65       672
weighted avg       0.87      0.89      0.87       672



### Class-Weighted Random Forest Classifier

In [11]:
# Training the model

features = ["hankel_mean", "hankel_std", "hankel_skew", "hankel_kurtosis", "hankel_peak"]

X_train = train_df[features]
y_train = train_df["label"]

clf = RandomForestClassifier(class_weight="balanced", random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

train_preds = clf.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, train_preds))

Training Accuracy: 1.0


In [12]:
# Validating and fine tuning

X_val = val_df[features]
y_val = val_df["label"]

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(class_weight="balanced", random_state=RANDOM_SEED),
    param_grid,
    cv=3,
    scoring='balanced_accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Validation Accuracy: 0.8645833333333334
              precision    recall  f1-score   support

           0       0.43      0.53      0.47        78
           1       0.94      0.91      0.92       594

    accuracy                           0.86       672
   macro avg       0.68      0.72      0.70       672
weighted avg       0.88      0.86      0.87       672



In [13]:
# Testing

X_test = test_df[features]
y_test = test_df["label"]

test_preds = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds))

Test Accuracy: 0.8735119047619048
              precision    recall  f1-score   support

           0       0.47      0.58      0.52        79
           1       0.94      0.91      0.93       593

    accuracy                           0.87       672
   macro avg       0.71      0.75      0.72       672
weighted avg       0.89      0.87      0.88       672



### Random Forest Clkassifier with SMOTE-Augmentation

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
# Training the model

features = ["hankel_mean", "hankel_std", "hankel_skew", "hankel_kurtosis", "hankel_peak"]

X_train = train_df[features]
y_train = train_df["label"]

smote = SMOTE(random_state=RANDOM_SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train_resampled, y_train_resampled)

train_preds = clf.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, train_preds))

Training Accuracy: 1.0


In [16]:
# Validating and fine tuning

X_val = val_df[features]
y_val = val_df["label"]

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_SEED),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train_resampled, y_train_resampled)

best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Validation Accuracy: 0.8764880952380952
              precision    recall  f1-score   support

           0       0.48      0.63      0.54        78
           1       0.95      0.91      0.93       594

    accuracy                           0.88       672
   macro avg       0.71      0.77      0.74       672
weighted avg       0.89      0.88      0.88       672



In [17]:
# Testing

X_test = test_df[features]
y_test = test_df["label"]

test_preds = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, test_preds))
print(classification_report(y_test, test_preds))

Test Accuracy: 0.8511904761904762
              precision    recall  f1-score   support

           0       0.41      0.59      0.48        79
           1       0.94      0.89      0.91       593

    accuracy                           0.85       672
   macro avg       0.68      0.74      0.70       672
weighted avg       0.88      0.85      0.86       672

