# Random Forest Classification

## Importing the libraries

In [105]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [106]:
dataset = pd.read_csv('synthetic_inspections_labeled.csv')
X = dataset.drop(['inspection_id', 'worker_id', 'inspector_id', 'lane_id', 'vendor_id', 'maintenance_due_date', 'submission_time', 'photo_hash', 'device_id', 'ip_address', 'app_version', 'anomaly_type', 'anomaly_flag'], axis=1)
y = dataset['anomaly_flag']


# Perform One-Hot Encoding on the 'status' column


In [107]:
X = pd.get_dummies(X, columns=['status'], drop_first=True)

## Splitting the dataset into the Training set and Test set

In [108]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (15000, 24)
y_train shape: (15000,)
X_test shape: (5000, 24)
y_test shape: (5000,)


## Feature Scaling

In [109]:
from sklearn.preprocessing import StandardScaler
feature_columns = list(X.columns) # Get column names from original DataFrame X
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [110]:
print(X_train)

[[ 0.40113044  0.5649031   0.65372114 ... -0.33419356 -0.33849474
  -0.23182058]
 [ 0.42068187  0.14413361 -0.15973333 ...  1.63967255 -0.33849474
  -0.23182058]
 [-1.09263277 -0.84103715 -0.28452298 ... -0.80298676 -0.33849474
   4.31368093]
 ...
 [-0.5856915  -0.20157398 -1.26264836 ...  0.15310464 -0.33849474
  -0.23182058]
 [-0.43543415 -1.02009226  1.50702215 ...  0.07908466 -0.33849474
   4.31368093]
 [-1.05966154  0.38845745 -1.25528854 ...  1.15237436 -0.33849474
  -0.23182058]]


In [111]:
print(X_test)

[[-0.15225084 -0.12551808 -0.90521211 ... -1.12374     2.95425565
  -0.23182058]
 [ 1.56538251  0.82041578 -1.09610229 ...  1.33125597  2.95425565
  -0.23182058]
 [-1.19667866 -0.02430618  1.92964975 ... -1.32112661 -0.33849474
  -0.23182058]
 ...
 [ 0.82833933 -0.17338782  0.88419859 ...  0.066748   -0.33849474
  -0.23182058]
 [ 0.60716804 -0.28260013 -0.79788811 ...  0.25179794 -0.33849474
  -0.23182058]
 [ 0.28719144 -0.54759091 -1.10286971 ... -1.12374    -0.33849474
  -0.23182058]]


## Training the Random Forest Classification model on the Training set

In [112]:
from imblearn.ensemble import BalancedRandomForestClassifier

classifier = BalancedRandomForestClassifier(
    n_estimators=200, random_state=42
)

# 1. Train the classifier
classifier.fit(X_train, y_train)

# 2. Get prediction probabilities on test set
y_proba = classifier.predict_proba(X_test)[:, 0]  # probability for class 0

# 3. Apply custom threshold (default is 0.5)
y_pred = (y_proba >= 0.35).astype(int)


## Predicting a new result

In [113]:
sample_data = X.iloc[0].values.reshape(1, -1)
predicted_value = classifier.predict(sc.transform(sample_data))
print("Prediction for a new sample:", predicted_value)

Prediction for a new sample: [0]




## Predicting the Test set results

In [114]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.values.reshape(len(y_test),1)),1))


[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


SMOTE Oversampling

In [115]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)      # Create SMOTE oversampler
X_res, y_res = sm.fit_resample(X_train, y_train)  # Apply SMOTE on training data

classifier.fit(X_res, y_res)  # Train RandomForest classifier on balanced dataset


## Making the Confusion Matrix

In [116]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

[[4839   11]
 [  18  132]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4850
           1       0.92      0.88      0.90       150

    accuracy                           0.99      5000
   macro avg       0.96      0.94      0.95      5000
weighted avg       0.99      0.99      0.99      5000



## Visualising the Training set results

In [117]:
from matplotlib.colors import ListedColormap
# Create a new scaler just for the plotting data
sc_plot = StandardScaler()
X_train_plot = sc_plot.fit_transform(X_train[:, 0:2])
X_set, y_set = sc_plot.inverse_transform(X_train_plot), y_train
X1_min, X1_max = X_set[:, 0].min() - 1, X_set[:, 0].max() + 1
X2_min, X2_max = X_set[:, 1].min() - 1, X_set[:, 1].max() + 1
X1, X2 = np.meshgrid(np.arange(start = X1_min, stop = X1_max, step = (X1_max - X1_min) / 1000),
                     np.arange(start = X2_min, stop = X2_max, step = (X2_max - X2_min) / 1000))
# The model classifier expects all 24 features, so we can't just use the 2 features here.
# Instead, we will train a separate classifier for the 2 features for visualization purpose.
# Create a new classifier just for the plotting data
classifier_plot = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_plot.fit(X_train[:, 0:2], y_train)
plt.contourf(X1, X2, classifier_plot.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(['#FA8072', '#1E90FF']))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(['#FA8072', '#1E90FF'])(i), label = j)
plt.title('Random Forest Classifier (Training set)')
plt.xlabel('gps_distance_worker_to_lane')
plt.ylabel('gps_distance_inspector_to_lane')
plt.legend()
plt.savefig('random_forest_training_set.png')
plt.close()


  plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(['#FA8072', '#1E90FF'])(i), label = j)


## Visualising the Test set results

In [118]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_test[:, 0:2], y_test
X1_min, X1_max = X_set[:, 0].min() - 1, X_set[:, 0].max() + 1
X2_min, X2_max = X_set[:, 1].min() - 1, X_set[:, 1].max() + 1
X1, X2 = np.meshgrid(np.arange(start=X1_min, stop=X1_max, step=(X1_max - X1_min) / 1000),
                     np.arange(start=X2_min, stop=X2_max, step=(X2_max - X2_min) / 1000))
# Predict for each point on the grid using the plot classifier
Z = classifier_plot.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
# Plot the decision boundary
plt.contourf(X1, X2, Z, alpha=0.75, cmap=ListedColormap(['#FA8072', '#1E90FF']))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# Define colors for scatter plot
colors = ['#FA8072', '#1E90FF']
# Plot the test set points
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(
        X_set[y_set == j, 0], X_set[y_set == j, 1],
        color=colors[i], label=j
    )
# Add titles and labels
plt.title('Random Forest Classifier (Test set)')
plt.xlabel('gps_distance_worker_to_lane')
plt.ylabel('gps_distance_inspector_to_lane')
plt.legend()
plt.savefig('random_forest_test_set.png')
plt.close()

In [119]:
import joblib
import os

artifact = {
    "model": classifier,                      # your trained RandomForest
    "feature_columns": list(X.columns), # exact feature columns after get_dummies
    "anomaly_label": 1                        # change to 0 if anomalies labeled as 0
}

os.makedirs("models", exist_ok=True)
joblib.dump(artifact, "models/anomaly_model_artifact.joblib")
print("Saved artifact -> models/anomaly_model_artifact.joblib")


Saved artifact -> models/anomaly_model_artifact.joblib


In [120]:
from google.colab import drive
drive.mount('/content/drive')   # Mounts your Google Drive at this path


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [124]:
import joblib

# Save the model
joblib.dump(classifier, "/content/drive/MyDrive/anomaly_rf.pkl")
joblib.dump(artifact, "/content/drive/MyDrive/anomaly_model_artifact.joblib")

<Figure size 640x480 with 0 Axes>