In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
from joblib import load
from sklearn.metrics import accuracy_score
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline

# Function to preprocess and scale numerical columns
def preprocess_and_scale(df, numerical_features):
    """Standardize the selected numerical columns of ``df`` in place.

    A new ``StandardScaler`` is fitted on the selected columns of the frame
    passed to this call, and those columns are overwritten with the scaled
    values. All other columns are left untouched.

    Args:
        df: DataFrame containing the columns to scale. It is modified in
            place.
        numerical_features: Column labels to standardize. When empty there
            is nothing to scale and ``df`` is returned unchanged.

    Returns:
        The same ``df`` object that was passed in.
    """
    # StandardScaler rejects input with zero feature columns, so an empty
    # selection is treated as an explicit no-op instead of an error.
    if len(numerical_features) == 0:
        return df

    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df

def print_classification_report(y_true, y_pred):
    """Print a result banner, the accuracy and the per-class report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    banner = "\n\n------------------result----------------\n\n"
    print(banner)

    # Accuracy is shown as a percentage with two decimals.
    score = accuracy_score(y_true, y_pred)
    print("Accuracy: {:.2f}%".format(score * 100))

    report = metrics.classification_report(y_true, y_pred)
    print(report)

def test_model(num_records=50):  # Specify the number of records for testing, default is 50
    # Load the trained model
    model = load("random_forest_model_smote_train1.pkl")

    # Load the test data
    print("Loading dataset...")
    df = pd.read_csv("clean_df.csv") # Load only the first 'num_records' rows
    df[['hour', 'minute']] = df['Time'].str.split(':', expand=True).astype('int32')

    features = ['longitude', 'latitude', 'Speed_limit', 'hour', 'minute','Number_of_Vehicles', 'Number_of_Casualties',
       'Day_of_Week','Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions','Carriageway_Hazards']
    X = df[features]
    y = df['Accident_Severity']


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

    # Define the pipeline
    pipeline = ImbPipeline([
        ('sampling', SMOTE(random_state=12)),
        ('classifier', model)
    ])

    # Fit the pipeline and make predictions on the test data
    start_time = time.time()
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    end_time = time.time()
    print(f"Prediction took {end_time - start_time:.2f} seconds")

    print_classification_report(y_test, y_pred)

# Script entry point: run the evaluation with test_model's default arguments
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    test_model()


Loading dataset...
Prediction took 25.61 seconds


------------------result----------------


Accuracy: 77.23%
              precision    recall  f1-score   support

           1       0.07      0.08      0.07       136
           2       0.20      0.25      0.22      1485
           3       0.88      0.86      0.87     10359

    accuracy                           0.77     11980
   macro avg       0.39      0.39      0.39     11980
weighted avg       0.79      0.77      0.78     11980

