In [1]:
import psycopg2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Database connection parameters
db_params = {
    'host': '',
    'port': '',
    'database': 'postgres',
    'user': '',
    'password': ''
}

# Function to establish a database connection
def get_db_connection(db_params):
    try:
        conn = psycopg2.connect(
            host=db_params['host'],
            port=db_params['port'],
            database=db_params['database'],
            user=db_params['user'],
            password=db_params['password']
        )
        return conn
    except psycopg2.DatabaseError as e:
        print(f"Error: {e}")
        return None

# Function to retrieve data from a specific table
def get_table_data(db_params, table_name):
    conn = get_db_connection(db_params)
    if conn is None:
        return None
    
    try:
        query = f"SELECT * FROM {table_name};"
        df = pd.read_sql_query(query, conn)
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        conn.close()

In [3]:
# Function to retrieve data from a specific table
def get_table_data(db_params, table_name):
    conn = get_db_connection(db_params)
    if conn is None:
        return None
    
    try:
        query = f"SELECT * FROM {table_name};"
        df = pd.read_sql_query(query, conn)
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        conn.close()

In [4]:
# Load the training, testing, and validation datasets from the database
train_table_name = 'group12_warehouse.train_table'
test_table_name = 'group12_warehouse.test_table'
validation_table_name = 'group12_warehouse.validation_table'

train_data = get_table_data(db_params, train_table_name)
test_data = get_table_data(db_params, test_table_name)
validation_data = get_table_data(db_params, validation_table_name)

if train_data is not None and test_data is not None and validation_data is not None:
    print(train_data.head())
    print(test_data.head())
    print(validation_data.head())
else:
    print("Failed to retrieve data from one or more tables.")

  Accident severity First Mode of Transport           Area Type  \
0               0.0      -0.350029334776985  0.3316993365651348   
1               0.0      -1.810821489630903  0.3316993365651348   
2               0.0      -0.350029334776985  0.3316993365651348   
3               0.0      -0.350029334776985  0.3316993365651348   
4               0.0      -0.350029334776985   -3.01477841455867   

      Light condition        Road Location       Road condition  \
0  0.6027962867863099  -0.8738398047923384    1.513226708878669   
1  0.6027962867863099   1.1443745117992685  -0.6608395121052414   
2  0.6027962867863099  -0.8738398047923384  -0.6608395121052414   
3  0.6027962867863099   1.1443745117992685  -0.6608395121052414   
4  -1.658935235535878   1.1443745117992685  -0.6608395121052414   

          Road surface        Road situation          Speed limit  \
0    1.784021822931772  -0.42384880344171616   1.9706356443111297   
1  -0.7354366365307541    1.0394771034977446  -0.1099889

  df = pd.read_sql_query(query, conn)


In [5]:
# Split features and target variable
X_train = train_data.drop(columns=['Accident severity'])
y_train = train_data[['Accident severity']]
    
X_test = test_data.drop(columns=['Accident severity'])
y_test = test_data[['Accident severity']]
    
X_val = validation_data.drop(columns=['Accident severity'])
y_val = validation_data[['Accident severity']]

## Random Forest Model

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Initialize and train a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_test_pred = rf_classifier.predict(X_test)

# 2. Generate a classification report for the testing set
report_test = classification_report(y_test, y_test_pred)
print("Classification Report for Test Set:\n", report_test)

# 3. Make predictions on the validation set
y_pred_val = rf_classifier.predict(X_val)

# 4. Generate a classification report for the validation set
report_val = classification_report(y_val, y_pred_val)
print("\nClassification Report for Validation Set:\n", report_val)


  return fit_method(estimator, *args, **kwargs)


Classification Report for Test Set:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       251
         1.0       0.82      0.96      0.89       251
         2.0       0.95      0.77      0.85       251

    accuracy                           0.91       753
   macro avg       0.92      0.91      0.91       753
weighted avg       0.92      0.91      0.91       753


Classification Report for Validation Set:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       508
         1.0       0.84      0.94      0.89       508
         2.0       0.93      0.81      0.87       508

    accuracy                           0.92      1524
   macro avg       0.92      0.92      0.92      1524
weighted avg       0.92      0.92      0.92      1524



## Hypertuning

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize a Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit GridSearchCV to the data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Use the best parameters to initialize a new Random Forest Classifier
best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
best_rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_test_pred = best_rf_classifier.predict(X_test)

# Generate a classification report for the testing set
report_test = classification_report(y_test, y_test_pred)
print("Classification Report for Test Set:\n", report_test)

# Make predictions on the validation set
y_val_pred = best_rf_classifier.predict(X_val)

# Generate a classification report for the validation set
report_val = classification_report(y_val, y_val_pred)
print("\nClassification Report for Validation Set:\n", report_val)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   1.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   1.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_sa

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Best parameters found:  {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


  return fit_method(estimator, *args, **kwargs)


Classification Report for Test Set:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       251
         1.0       0.87      0.98      0.92       251
         2.0       0.97      0.83      0.89       251

    accuracy                           0.93       753
   macro avg       0.94      0.93      0.93       753
weighted avg       0.94      0.93      0.93       753


Classification Report for Validation Set:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       508
         1.0       0.86      0.96      0.91       508
         2.0       0.96      0.83      0.89       508

    accuracy                           0.93      1524
   macro avg       0.94      0.93      0.93      1524
weighted avg       0.94      0.93      0.93      1524

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.4s
[CV] END bootstrap=True, max_depth=Non

[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=15, n_estimators=300; total time=   1.5s
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_depth=40, max_features=log2, min_samples_leaf=6, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_depth=40, max_features=log2, min_samples_leaf=6, min_samples_split=2, n_estimators=300; total time=   1.5s
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_depth=40, max_features=log2, min_samples_leaf=6, min_samples_split=5, n_estimators=300; total time=   1.5s
[CV] END bootstrap=False, class_weight=balanced_subsample, criterion=entropy, max_depth=40, max_features=log2, min_samples_leaf=6, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END bootstrap=False, class_weight=balanced_subsa