In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution strategy setup: try to attach to a TPU first, otherwise fall
# back to whatever strategy TensorFlow currently reports as the default.
# The module-level `strategy` chosen here is the one later entered with
# `strategy.scope()` when models are constructed.
try:
    # Both the resolver connection and the TPUStrategy construction are inside
    # the try, so a ValueError raised by either one triggers the fallback.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Running on TPU:", tpu.master())
except ValueError:
    # NOTE(review): only ValueError is treated as "no TPU available"; any other
    # exception type raised while connecting will propagate — confirm intended.
    print("TPU not found. Running on CPU/GPU.")
    strategy = tf.distribute.get_strategy()

class RandomForestClassifier:
    """Train and compare TF-DF random forest models on a CSV dataset.

    The CSV is loaded once at construction time, the 'Status',
    'Firmware_Version' and 'Antenna_ID' columns are label-encoded in place,
    and balanced class weights are computed for the encoded 'Status' column.

    Attributes set by ``__init__``:
        df: the loaded (and then encoded) DataFrame.
        status_encoder: the LabelEncoder fitted on 'Status'; its ``classes_``
            give the original label for each encoded integer.
        class_weights: array of balanced weights, one per encoded class.
        class_weights_dict: ``{encoded_class_index: weight}``.

    Model construction relies on the module-level ``strategy`` object.
    """

    def __init__(self, data_path):
        """Load the CSV at ``data_path``, encode it and compute class weights."""
        self.df = pd.read_csv(data_path)
        self.preprocess_data()
        self.setup_class_weights()

    def preprocess_data(self):
        """Label-encode the categorical columns of ``self.df`` in place.

        The encoder fitted on 'Status' is kept on ``self.status_encoder`` so
        that reports can recover the real class names instead of relying on a
        hard-coded list.
        """
        self.status_encoder = LabelEncoder()
        self.df['Status'] = self.status_encoder.fit_transform(self.df['Status'])
        # These encoders are not needed again, so they are not retained.
        for column in ('Firmware_Version', 'Antenna_ID'):
            self.df[column] = LabelEncoder().fit_transform(self.df[column])

    def setup_class_weights(self):
        """Compute and print 'balanced' class weights for the 'Status' column.

        The weights are computed over the whole of ``self.df`` (before any
        train/validation/test split).
        """
        self.class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(self.df['Status']),
            y=self.df['Status'],
        )
        # LabelEncoder produces the codes 0..n-1, so the position of a weight
        # in the array is also the encoded class it belongs to.
        self.class_weights_dict = dict(enumerate(self.class_weights))
        print("Class Weights:", self.class_weights_dict)

    def convert_to_tf_dataset(self, X, y):
        """Return a ``tf.data.Dataset`` of ``(X.values, y.values)`` in batches of 1024.

        Only the underlying arrays are passed on, so column names are not
        carried into the dataset.
        """
        dataset = tf.data.Dataset.from_tensor_slices((X.values, y.values))
        return dataset.batch(1024)

    def prepare_data(self, features, target='Status'):
        """Split ``self.df`` and build train, validation and test datasets.

        Args:
            features: column names to use as model inputs.
            target: name of the label column.

        Returns:
            ``(train_dataset, val_dataset, test_dataset, y_test_values)`` where
            the last item is the test labels as an array.
        """
        X = self.df[features]
        y = self.df[target]

        # Hold out 20% for testing, then 20% of the remainder for validation
        # (roughly 64% / 16% / 20% overall). Fixed seeds keep the rows
        # identical across feature sets so the comparison is like-for-like.
        X_train_val, X_test, y_train_val, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=0.2, random_state=42
        )

        train_dataset = self.convert_to_tf_dataset(X_train, y_train)
        val_dataset = self.convert_to_tf_dataset(X_val, y_val)
        test_dataset = self.convert_to_tf_dataset(X_test, y_test)

        return train_dataset, val_dataset, test_dataset, y_test.values

    def train_and_evaluate_rf(self, train_dataset, val_dataset, test_dataset, y_test):
        """Fit a random forest, print its metrics and return the accuracies.

        Args:
            train_dataset: batched dataset used for fitting.
            val_dataset: batched dataset passed as ``validation_data`` and
                evaluated after training.
            test_dataset: batched dataset used for the final evaluation and
                for the printed report.
            y_test: true encoded labels of ``test_dataset``, in dataset order.

        Returns:
            ``(validation_accuracy, test_accuracy)``.
        """
        with strategy.scope():
            model = tfdf.keras.RandomForestModel(
                num_trees=50,
                max_depth=7,
                min_examples=10,
                task=tfdf.keras.Task.CLASSIFICATION,
                random_seed=42
            )
            model.compile(metrics=["accuracy"])

        model.fit(train_dataset, validation_data=val_dataset,
                  class_weight=self.class_weights_dict, verbose=1)

        val_evaluation = model.evaluate(val_dataset, return_dict=True)
        print("Validation Accuracy:", val_evaluation["accuracy"])

        test_evaluation = model.evaluate(test_dataset, return_dict=True)
        print("Test Accuracy:", test_evaluation["accuracy"])

        # The highest-scoring output column is taken as the predicted class.
        y_pred = np.argmax(model.predict(test_dataset), axis=1)

        # Take the class names from the fitted encoder rather than a fixed
        # list, and pass the full label set explicitly so that the report rows
        # and the confusion-matrix shape do not depend on which classes happen
        # to occur in this particular test split.
        class_names = [str(name) for name in self.status_encoder.classes_]
        label_ids = np.arange(len(class_names))

        print("\nClassification Report on Test Dataset:")
        print(classification_report(y_test, y_pred,
                                    labels=label_ids,
                                    target_names=class_names))

        print("\nConfusion Matrix on Test Dataset:")
        print(confusion_matrix(y_test, y_pred, labels=label_ids))

        return val_evaluation["accuracy"], test_evaluation["accuracy"]

    def compare_feature_sets(self, feature_sets):
        """Train one model per feature set and collect the accuracies.

        Args:
            feature_sets: mapping of a display name to a list of feature
                column names.

        Returns:
            ``{name: {'Validation Accuracy': ..., 'Test Accuracy': ...}}`` in
            the iteration order of ``feature_sets``; empty if it is empty.
        """
        results = {}
        for key, features in feature_sets.items():
            print(f"\nTraining with feature set: {key}")
            train_ds, val_ds, test_ds, y_test = self.prepare_data(features)
            val_acc, test_acc = self.train_and_evaluate_rf(
                train_ds, val_ds, test_ds, y_test
            )
            results[key] = {
                'Validation Accuracy': val_acc,
                'Test Accuracy': test_acc
            }
        return results

# Candidate feature groups: every variant starts from the same base columns
# and appends extra columns to it.
features_base = ['SINR', 'Signal_Strength', 'Traffic', 'Temperature']
features_with_power_downtime = [*features_base, 'Power_Consumption', 'Downtime']
features_with_humidity = [*features_base, 'Humidity']
features_all = [*features_with_power_downtime, 'Humidity']

# Display name -> feature list; insertion order is the order of comparison.
feature_sets = {}
feature_sets["Base Features"] = features_base
feature_sets["With Power Consumption and Downtime"] = features_with_power_downtime
feature_sets["With Humidity"] = features_with_humidity
feature_sets["With All Features"] = features_all

# Entry point: run the feature-set comparison and print a summary.
if __name__ == "__main__":
    # Loading the CSV also encodes it and computes the class weights.
    rf_classifier = RandomForestClassifier("../mobilis_data_cleaned.csv")

    print("\nComparing feature sets...")
    results = rf_classifier.compare_feature_sets(feature_sets)

    # One heading per feature set, followed by its two accuracies.
    print("\nFeature Set Comparison Results:")
    for set_name in results:
        print(f"{set_name}:")
        for metric_name in ('Validation Accuracy', 'Test Accuracy'):
            print(f"  {metric_name}: {results[set_name][metric_name]:.4f}")

TPU not found. Running on CPU/GPU.
Class Weights: {0: 0.5535342538064367, 1: 4.121625273294519, 2: 1.0517411064096536}

Comparing feature sets...

Training with feature set: Base Features
Use /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmp51dmbhsx as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.568218. Found 429513 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(107379, shape=(), dtype=int32)
Validation dataset read in 0:00:00.185946. Found 107379 examples.
Training model...


I0000 00:00:1738059516.455788 1330287 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1738059516.456842 1330287 kernel.cc:783] Collect training examples
I0000 00:00:1738059516.456849 1330287 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738059516.458950 1330287 kernel.cc:401] Number of batches: 420
I0000 00:00:1738059516.458956 1330287 kernel.cc:402] Number of examples: 429513
I0000 00:00:1738059516.477332 1330287 kernel.cc:802] Training dataset:
Number of records: 429513
Number of columns: 6

Number of columns by type:
	NUMERICAL: 5 (83.3333%)
	CATEGORICAL: 1 (16.6667%)

Columns:

NUMERICAL: 5 (83.3333%)
	1: "__WEIGHTS" NUMERIC

Model trained in 0:00:01.824971
Compiling model...
Model compiled.


I0000 00:00:1738059518.256316 1333487 random_forest.cc:811] Training of tree  50/50 (tree index:49) done accuracy:0.982675 logloss:0.628853
I0000 00:00:1738059518.256371 1333475 random_forest.cc:891] Final OOB metrics: accuracy:0.982675 logloss:0.628853
I0000 00:00:1738059518.258873 1333475 kernel.cc:926] Export model in log directory: /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmp51dmbhsx with prefix c9a437b61349469b
I0000 00:00:1738059518.260942 1333475 kernel.cc:944] Save model in resources
I0000 00:00:1738059518.264140 1330287 abstract_model.cc:914] Model self evaluation:
Number of predictions (without weights): 429513
Number of predictions (with weights): 429296
Task: CLASSIFICATION
Label: __LABEL

Accuracy: 0.982675  CI95[W][0.982344 0.983001]
LogLoss: : 0.628853
ErrorRate: : 0.0173253

Default Accuracy: : 0.333599
Default LogLoss: : 1.09861
Default ErrorRate: : 0.666401

Confusion Table:
truth\prediction
                   1                  2                  3
1  141648.

Validation Accuracy: 0.9803686141967773
Test Accuracy: 0.9799812436103821

Classification Report on Test Dataset:
              precision    recall  f1-score   support

     Failure       1.00      0.99      0.99     80817
      Normal       0.85      1.00      0.92     10865

    accuracy                           0.98    134224
   macro avg       0.94      0.98      0.96    134224
weighted avg       0.98      0.98      0.98    134224


Confusion Matrix on Test Dataset:
[[79889   217   711]
 [    0 10865     0]
 [    0  1759 40783]]

Training with feature set: With Power Consumption and Downtime
Use /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmp8dvsbhcr as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.492359. Found 429513 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(107379, shape=(), dtype=int32)
Validation dataset read in 0:00:00.182065. Found 107379 examples.
Training model...


I0000 00:00:1738059520.043756 1330287 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1738059520.043767 1330287 kernel.cc:783] Collect training examples
I0000 00:00:1738059520.043771 1330287 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738059520.043814 1330287 kernel.cc:401] Number of batches: 420
I0000 00:00:1738059520.043817 1330287 kernel.cc:402] Number of examples: 429513
I0000 00:00:1738059520.067809 1330287 kernel.cc:802] Training dataset:
Number of records: 429513
Number of columns: 8

Number of columns by type:
	NUMERICAL: 7 (87.5%)
	CATEGORICAL: 1 (12.5%)

Columns:

NUMERICAL: 7 (87.5%)
	1: "__WEIGHTS" NUMERICAL mean:0

Model trained in 0:00:02.202203
Compiling model...
Model compiled.


I0000 00:00:1738059522.234242 1333651 random_forest.cc:811] Training of tree  50/50 (tree index:49) done accuracy:0.982672 logloss:0.632374
I0000 00:00:1738059522.234334 1333641 random_forest.cc:891] Final OOB metrics: accuracy:0.982672 logloss:0.632374
I0000 00:00:1738059522.235860 1333641 kernel.cc:926] Export model in log directory: /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmp8dvsbhcr with prefix 4404d1320713432c
I0000 00:00:1738059522.237027 1333641 kernel.cc:944] Save model in resources
I0000 00:00:1738059522.239043 1330287 abstract_model.cc:914] Model self evaluation:
Number of predictions (without weights): 429513
Number of predictions (with weights): 429296
Task: CLASSIFICATION
Label: __LABEL

Accuracy: 0.982672  CI95[W][0.982341 0.982998]
LogLoss: : 0.632374
ErrorRate: : 0.017328

Default Accuracy: : 0.333599
Default LogLoss: : 1.09861
Default ErrorRate: : 0.666401

Confusion Table:
truth\prediction
                   1                  2                  3
1  141647.7

Validation Accuracy: 0.9803686141967773
Test Accuracy: 0.9799812436103821

Classification Report on Test Dataset:
              precision    recall  f1-score   support

     Failure       1.00      0.99      0.99     80817
      Normal       0.85      1.00      0.92     10865

    accuracy                           0.98    134224
   macro avg       0.94      0.98      0.96    134224
weighted avg       0.98      0.98      0.98    134224


Confusion Matrix on Test Dataset:
[[79889   217   711]
 [    0 10865     0]
 [    0  1759 40783]]

Training with feature set: With Humidity
Use /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmp_3yl_w51 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.486068. Found 429513 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(107379, shape=(), dtype=int32)
Validation dataset read in 0:00:00.186834. Found 107379 examples.
Training model...


I0000 00:00:1738059524.030115 1330287 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1738059524.030124 1330287 kernel.cc:783] Collect training examples
I0000 00:00:1738059524.030128 1330287 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738059524.030181 1330287 kernel.cc:401] Number of batches: 420
I0000 00:00:1738059524.030184 1330287 kernel.cc:402] Number of examples: 429513
I0000 00:00:1738059524.051301 1330287 kernel.cc:802] Training dataset:
Number of records: 429513
Number of columns: 7

Number of columns by type:
	NUMERICAL: 6 (85.7143%)
	CATEGORICAL: 1 (14.2857%)

Columns:

NUMERICAL: 6 (85.7143%)
	1: "__WEIGHTS" NUMERIC

Model trained in 0:00:02.046539
Compiling model...
Model compiled.


I0000 00:00:1738059526.065278 1333826 random_forest.cc:811] Training of tree  50/50 (tree index:49) done accuracy:0.999974 logloss:0.00452913
I0000 00:00:1738059526.065351 1333815 random_forest.cc:891] Final OOB metrics: accuracy:0.999974 logloss:0.00452913
I0000 00:00:1738059526.066857 1333815 kernel.cc:926] Export model in log directory: /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmp_3yl_w51 with prefix b161877bf15f4d92
I0000 00:00:1738059526.068026 1333815 kernel.cc:944] Save model in resources
I0000 00:00:1738059526.069315 1330287 abstract_model.cc:914] Model self evaluation:
Number of predictions (without weights): 429513
Number of predictions (with weights): 429296
Task: CLASSIFICATION
Label: __LABEL

Accuracy: 0.999974  CI95[W][0.999957 0.999985]
LogLoss: : 0.00452913
ErrorRate: : 2.58088e-05

Default Accuracy: : 0.333599
Default LogLoss: : 1.09861
Default ErrorRate: : 0.666401

Confusion Table:
truth\prediction
                   1                 2                  3
1  

Validation Accuracy: 0.9999720454216003
Test Accuracy: 0.9999925494194031

Classification Report on Test Dataset:
              precision    recall  f1-score   support

     Failure       1.00      1.00      1.00     80817
      Normal       1.00      1.00      1.00     10865

    accuracy                           1.00    134224
   macro avg       1.00      1.00      1.00    134224
weighted avg       1.00      1.00      1.00    134224


Confusion Matrix on Test Dataset:
[[80816     0     1]
 [    0 10865     0]
 [    0     0 42542]]

Training with feature set: With All Features
Use /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmphl_y3o9p as temporary training directory
Reading training dataset...




Training dataset read in 0:00:00.509909. Found 429513 examples.
Reading validation dataset...




Num validation examples: tf.Tensor(107379, shape=(), dtype=int32)
Validation dataset read in 0:00:00.196453. Found 107379 examples.
Training model...


I0000 00:00:1738059527.854149 1330287 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1738059527.854160 1330287 kernel.cc:783] Collect training examples
I0000 00:00:1738059527.854165 1330287 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738059527.854205 1330287 kernel.cc:401] Number of batches: 420
I0000 00:00:1738059527.854208 1330287 kernel.cc:402] Number of examples: 429513
I0000 00:00:1738059527.883131 1330287 kernel.cc:802] Training dataset:
Number of records: 429513
Number of columns: 9

Number of columns by type:
	NUMERICAL: 8 (88.8889%)
	CATEGORICAL: 1 (11.1111%)

Columns:

NUMERICAL: 8 (88.8889%)
	1: "__WEIGHTS" NUMERIC

Model trained in 0:00:02.190994
Compiling model...
Model compiled.


I0000 00:00:1738059530.033342 1333993 random_forest.cc:811] Training of tree  50/50 (tree index:48) done accuracy:0.999304 logloss:0.0219263
I0000 00:00:1738059530.033563 1333979 random_forest.cc:891] Final OOB metrics: accuracy:0.999304 logloss:0.0219263
I0000 00:00:1738059530.035426 1333979 kernel.cc:926] Export model in log directory: /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmphl_y3o9p with prefix f8e450ac52274d88
I0000 00:00:1738059530.036549 1333979 kernel.cc:944] Save model in resources
I0000 00:00:1738059530.038171 1330287 abstract_model.cc:914] Model self evaluation:
Number of predictions (without weights): 429513
Number of predictions (with weights): 429296
Task: CLASSIFICATION
Label: __LABEL

Accuracy: 0.999304  CI95[W][0.999234 0.999369]
LogLoss: : 0.0219263
ErrorRate: : 0.000696123

Default Accuracy: : 0.333599
Default LogLoss: : 1.09861
Default ErrorRate: : 0.666401

Confusion Table:
truth\prediction
                   1                 2                  3
1  142

Validation Accuracy: 0.998500645160675
Test Accuracy: 0.9983832836151123

Classification Report on Test Dataset:
              precision    recall  f1-score   support

     Failure       1.00      1.00      1.00     80817
      Normal       1.00      1.00      1.00     10865

    accuracy                           1.00    134224
   macro avg       1.00      1.00      1.00    134224
weighted avg       1.00      1.00      1.00    134224


Confusion Matrix on Test Dataset:
[[80600     0   217]
 [    0 10865     0]
 [    0     0 42542]]

Feature Set Comparison Results:
Base Features:
  Validation Accuracy: 0.9804
  Test Accuracy: 0.9800
With Power Consumption and Downtime:
  Validation Accuracy: 0.9804
  Test Accuracy: 0.9800
With Humidity:
  Validation Accuracy: 1.0000
  Test Accuracy: 1.0000
With All Features:
  Validation Accuracy: 0.9985
  Test Accuracy: 0.9984


In [6]:
# Retrain a model on the chosen feature set and save it to disk.
best_features = features_with_humidity

# prepare_data and class_weights_dict are members of the classifier instance,
# not module-level names, so go through rf_classifier rather than depending on
# globals left over from earlier kernel state.
train_ds, val_ds, test_ds, y_test = rf_classifier.prepare_data(best_features)

# Same hyperparameters as the models built during the feature-set comparison.
with strategy.scope():
    final_model = tfdf.keras.RandomForestModel(
        num_trees=50,
        max_depth=7,
        min_examples=10,
        task=tfdf.keras.Task.CLASSIFICATION,
        random_seed=42
    )
    final_model.compile(metrics=["accuracy"])


final_model.fit(train_ds, validation_data=val_ds, verbose=1,
                class_weight=rf_classifier.class_weights_dict)

model_path = "random_forest_model"

final_model.save(model_path, save_format="tf")
print(f"Model saved to {model_path}")

Use /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmpv3knixu4 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.551546. Found 429513 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(107379, shape=(), dtype=int32)
Validation dataset read in 0:00:00.183959. Found 107379 examples.
Training model...


I0000 00:00:1738060207.201089 1330287 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1738060207.202079 1330287 kernel.cc:783] Collect training examples
I0000 00:00:1738060207.202087 1330287 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738060207.203830 1330287 kernel.cc:401] Number of batches: 420
I0000 00:00:1738060207.203836 1330287 kernel.cc:402] Number of examples: 429513
I0000 00:00:1738060207.225861 1330287 kernel.cc:802] Training dataset:
Number of records: 429513
Number of columns: 7

Number of columns by type:
	NUMERICAL: 6 (85.7143%)
	CATEGORICAL: 1 (14.2857%)

Columns:

NUMERICAL: 6 (85.7143%)
	1: "__WEIGHTS" NUMERIC

Model trained in 0:00:02.078005
Compiling model...
Model compiled.


I0000 00:00:1738060209.251812 1346397 random_forest.cc:811] Training of tree  50/50 (tree index:49) done accuracy:0.999974 logloss:0.00452913
I0000 00:00:1738060209.251903 1346389 random_forest.cc:891] Final OOB metrics: accuracy:0.999974 logloss:0.00452913
I0000 00:00:1738060209.254551 1346389 kernel.cc:926] Export model in log directory: /var/folders/4t/vgrmglh57ds018mbdc9875y80000gn/T/tmpv3knixu4 with prefix 2e7fac9e21ed46ef
I0000 00:00:1738060209.257124 1346389 kernel.cc:944] Save model in resources
I0000 00:00:1738060209.260204 1330287 abstract_model.cc:914] Model self evaluation:
Number of predictions (without weights): 429513
Number of predictions (with weights): 429296
Task: CLASSIFICATION
Label: __LABEL

Accuracy: 0.999974  CI95[W][0.999957 0.999985]
LogLoss: : 0.00452913
ErrorRate: : 2.58088e-05

Default Accuracy: : 0.333599
Default LogLoss: : 1.09861
Default ErrorRate: : 0.666401

Confusion Table:
truth\prediction
                   1                 2                  3
1  

INFO:tensorflow:Assets written to: random_forest_model/assets


INFO:tensorflow:Assets written to: random_forest_model/assets


Model saved to random_forest_model
