In [None]:
import numpy as np
import pandas as pd
import time
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

def evaluate_classification(model, name, X_train, X_test, y_train, y_test):
    """Print accuracy, weighted precision and weighted recall for both splits.

    ``model.predict`` is called on ``X_train`` and ``X_test``; the predictions
    are scored against ``y_train`` and ``y_test`` and each score is printed as
    a percentage with two decimals, labelled with ``name``. Nothing is
    returned.
    """
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    splits = (
        ("Training", y_train, predicted_train),
        ("Test", y_test, predicted_test),
    )
    scorers = (
        ("Accuracy", accuracy_score, {}),
        ("Precision", precision_score, {"average": "weighted"}),
        ("Recall", recall_score, {"average": "weighted"}),
    )

    # Compute every score up front (metric by metric, training split first)
    # so that nothing is printed unless all of them could be calculated.
    scores = {
        (split, metric): scorer(y_true, y_pred, **options)
        for metric, scorer, options in scorers
        for split, y_true, y_pred in splits
    }

    for position, (split, _, _) in enumerate(splits):
        heading = f"{split} Set Metrics:"
        # Every section after the first is separated by a blank line.
        print(heading if position == 0 else "\n" + heading)
        for metric, _, _ in scorers:
            print(f"{split} {metric} {name}: {scores[split, metric] * 100:.2f}%")

def train_and_save_model(data_file, model_file, num_rows=None):
    """Train a scaled, SMOTE-balanced random forest and pickle the best model.

    The CSV is reduced to ``Accident_Severity`` (the target) plus five integer
    features derived from the ``Time`` and ``Date`` columns. A 70/30
    train/test split is made, a grid search over the forest's
    ``n_estimators`` and ``max_depth`` is run on the training part with 3-fold
    cross-validation, and the best pipeline is written to ``model_file`` and
    evaluated on both parts.

    Args:
        data_file: Path of the CSV file. It must provide the ``Time``,
            ``Date`` and ``Accident_Severity`` columns.
        model_file: Path the best pipeline is pickled to.
        num_rows: Optional maximum number of rows to read. ``None`` (or any
            other falsy value) reads the whole file.

    Returns:
        The best pipeline found by the grid search.
    """
    start_time = time.time()
    print("Loading the dataset...")
    # Only the columns that are actually used; nrows=None means "all rows",
    # so no separate code path is needed for the unlimited case.
    cols_to_load = ['Time', 'Date', 'Accident_Severity']
    df = pd.read_csv(data_file, usecols=cols_to_load, nrows=num_rows or None)

    # 'Time' is split on ':' into hour/minute and 'Date' on '-' into
    # day/month/year (in that order); both casts fail on missing values.
    df[['hour', 'minute']] = df['Time'].str.split(':', expand=True).astype('int32')
    df[['day', 'month', 'year']] = df['Date'].str.split('-', expand=True).astype(int)
    df = df.drop(columns=["Time", "Date"])

    X = df.drop(['Accident_Severity'], axis=1)
    y = df['Accident_Severity']

    # imblearn's pipeline applies the SMOTE step while fitting only, so
    # predictions are made on data that has not been resampled.
    pipeline = ImbPipeline([
        ('preprocess', StandardScaler()),
        ('sampling', SMOTE(random_state=20)),
        # Seeded like the sampler and the split so that reruns give the same model.
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # NOTE(review): the split is not stratified; consider stratify=y if the
    # severity classes are as imbalanced as the use of SMOTE suggests.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    print("Model Training...")
    # Define hyperparameters for tuning
    param_grid = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 10, 20],
        # Add more hyperparameters for tuning if needed
    }
    # Use GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_pipeline = grid_search.best_estimator_

    print("Saving the model...")
    # Save the trained model
    with open(model_file, "wb") as f:
        pickle.dump(best_pipeline, f)

    # The reported duration covers loading, tuning and saving, not evaluation.
    end_time = time.time()
    print(f"Model training and saving took {end_time - start_time:.2f} seconds")
    evaluate_classification(best_pipeline, "Random Forest", X_train, X_test, y_train, y_test)

    return best_pipeline

if __name__ == "__main__":
    # Script entry point: train on the cleaned CSV and pickle the result.
    data_file = "clean_df.csv"
    model_file = "random_forest_model_smote_train1.pkl"
    # None trains on every row; use an integer (e.g. 1000000) to cap the
    # number of rows read from the CSV.
    num_rows = None
    train_and_save_model(
        data_file=data_file,
        model_file=model_file,
        num_rows=num_rows,
    )


Loading the dataset...
Model Training...
