In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import joblib

class RandomForestClassifierModel:
    """Random-forest workflow for predicting the ``Status`` column of a CSV file.

    Constructing an instance loads the CSV, label-encodes the categorical
    columns in place, and computes balanced class weights for ``Status``.
    """

    # Columns replaced by integer codes during preprocessing, in encoding order.
    CATEGORICAL_COLUMNS = ('Status', 'Firmware_Version', 'Antenna_ID')

    def __init__(self, data_path: str):
        """Load the dataset and run the one-time preparation steps.

        Args:
            data_path: Path of the CSV file, passed to ``pandas.read_csv``.
        """
        # The full dataset; categorical columns are overwritten with integer
        # codes by preprocess_data().
        self.df = pd.read_csv(data_path)
        self.preprocess_data()
        self.setup_class_weights()

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.df`` in place.

        The fitted encoders are kept in ``self.label_encoders`` (keyed by
        column name) so integer codes can be mapped back to the original
        labels, and the original ``Status`` labels are stored, in code order,
        in ``self.status_classes``.

        Intended to run once from ``__init__``: running it again would fit the
        encoders on the integer codes and lose the original label names.

        Raises:
            KeyError: If one of ``CATEGORICAL_COLUMNS`` is not in the dataset.
        """
        self.label_encoders = {}
        for column in self.CATEGORICAL_COLUMNS:
            encoder = LabelEncoder()
            self.df[column] = encoder.fit_transform(self.df[column])
            self.label_encoders[column] = encoder

        # Index i holds the original label that was encoded as integer i.
        self.status_classes = [
            str(label) for label in self.label_encoders['Status'].classes_
        ]

    def setup_class_weights(self):
        """Compute balanced class weights for ``Status`` and print them.

        Stores the raw weight array in ``self.class_weights`` and a
        ``{class value: weight}`` mapping in ``self.class_weights_dict``.
        The weights are computed over the whole of ``self.df``.
        """
        classes = np.unique(self.df['Status'])
        self.class_weights = compute_class_weight('balanced',
                                                  classes=classes,
                                                  y=self.df['Status'])
        # Pair every weight with its actual class value (rather than its
        # position) and convert to native Python scalars.
        self.class_weights_dict = dict(
            zip(classes.tolist(), self.class_weights.tolist())
        )
        print("Class Weights:", self.class_weights_dict)

    def prepare_data(self, features, target='Status', *, test_size=0.2,
                     val_size=0.2, random_state=42, stratify=False):
        """Split the dataset into train, validation, and test parts.

        The data is first split into (train + validation) / test, then the
        first part is split again into train / validation. With the defaults
        this yields 64% train, 16% validation, and 20% test.

        Args:
            features: Column name or list of column names used as inputs.
            target: Name of the column to predict.
            test_size: ``test_size`` of the first split.
            val_size: ``test_size`` of the second split, i.e. the share of the
                train + validation part that becomes the validation set.
            random_state: Seed used for both splits.
            stratify: If true, both splits are stratified on the target.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

        Raises:
            KeyError: If a requested feature or the target is not a column of
                the dataset.
        """
        requested = [features] if isinstance(features, str) else list(features)
        missing = [
            column for column in [*requested, target]
            if column not in self.df.columns
        ]
        if missing:
            raise KeyError(f"Columns not found in dataset: {missing}")

        X = self.df[features]
        y = self.df[target]

        # First split: train+val / test
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state,
            stratify=y if stratify else None
        )

        # Second split: train / val
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=val_size,
            random_state=random_state,
            stratify=y_train_val if stratify else None
        )

        return X_train, X_val, X_test, y_train, y_val, y_test

    def train_and_evaluate_rf(self, X_train, X_val, X_test, y_train, y_val,
                              y_test, *,
                              model_path='random_forest_model.joblib'):
        """Train a random forest, save it, and print its evaluation.

        The fitted model is written to ``model_path`` with ``joblib`` and kept
        in ``self.model``. Validation and test accuracy, a classification
        report, and a confusion matrix (both on the test set) are printed.

        The report and matrix are labelled with the encoded ``Status``
        classes, so the targets are assumed to be the encoded ``Status``
        column.

        Args:
            X_train, X_val, X_test: Feature sets from ``prepare_data``.
            y_train, y_val, y_test: Matching targets from ``prepare_data``.
            model_path: File the fitted model is dumped to.

        Returns:
            ``(val_accuracy, test_accuracy)``.
        """
        model = RandomForestClassifier(
            n_estimators=50,
            max_depth=7,
            min_samples_split=10,
            random_state=42,
            class_weight=self.class_weights_dict
        )

        model.fit(X_train, y_train)
        self.model = model

        joblib.dump(model, model_path)
        print(f"Model saved to {model_path}")

        val_accuracy = model.score(X_val, y_val)
        print("Validation Accuracy:", val_accuracy)

        test_accuracy = model.score(X_test, y_test)
        print("Test Accuracy:", test_accuracy)

        # Take the row names from the fitted Status encoder instead of a
        # hard-coded list, and pass the codes explicitly so that every class
        # appears (in a fixed order) even if the test split lacks one.
        class_names = self.status_classes
        class_codes = list(range(len(class_names)))

        y_pred = model.predict(X_test)
        print("\nClassification Report on Test Dataset:")
        print(classification_report(y_test, y_pred, labels=class_codes,
                                    target_names=class_names))

        print("\nConfusion Matrix on Test Dataset:")
        print(confusion_matrix(y_test, y_pred, labels=class_codes))

        return val_accuracy, test_accuracy

# Candidate feature groups: a shared base plus optional extra measurements.
features_base = ['SINR', 'Signal_Strength', 'Traffic', 'Temperature']
features_with_power_downtime = [*features_base, 'Power_Consumption', 'Downtime']
features_with_humidity = [*features_base, 'Humidity']
features_all = [*features_with_power_downtime, 'Humidity']

# Display name -> feature list, registered in the order the groups are defined.
feature_sets = {}
feature_sets["Base Features"] = features_base
feature_sets["With Power Consumption and Downtime"] = features_with_power_downtime
feature_sets["With Humidity"] = features_with_humidity
feature_sets["With All Features"] = features_all

if __name__ == "__main__":
    # Constructing the model loads the CSV, encodes the categorical columns,
    # and prints the class weights.
    rf_classifier = RandomForestClassifierModel("../../mobilis_data_cleaned.csv")

    # Train on the base features plus humidity.
    best_features = features_with_humidity
    splits = rf_classifier.prepare_data(best_features)
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    # Fit, save, and evaluate; the two accuracies are returned for later use.
    val_acc, test_acc = rf_classifier.train_and_evaluate_rf(*splits)

Class Weights: {0: 0.5535342538064367, 1: 4.121625273294519, 2: 1.0517411064096536}
Model saved to random_forest_model.joblib
Validation Accuracy: 1.0
Test Accuracy: 1.0

Classification Report on Test Dataset:
              precision    recall  f1-score   support

     Failure       1.00      1.00      1.00     80817
      Normal       1.00      1.00      1.00     10865
     Warning       1.00      1.00      1.00     42542

    accuracy                           1.00    134224
   macro avg       1.00      1.00      1.00    134224
weighted avg       1.00      1.00      1.00    134224


Confusion Matrix on Test Dataset:
[[80817     0     0]
 [    0 10865     0]
 [    0     0 42542]]
