<a href="https://colab.research.google.com/github/Sandra877/Data-Analysis-Class/blob/main/SWKTrafficAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display  # Importing display for better DataFrame visualization

In [None]:
# Load the dataset
url = 'https://docs.google.com/spreadsheets/d/1AxV1SaET7Jy78KqChEIvld56SWQvYx6OYB_rfjUUUGg/gviz/tq?tqx=out:csv&sheet=Sheet1'
data = pd.read_csv(url)

In [None]:
# Show the data after loading
print("Initial Data:")
display(data.head())  # Use display for better visualization


Initial Data:


Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,8/8/2023,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4.0,4.30 pm,,...,,,,,,,,,,
1,8/7/2023,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1.0,5.50 pm,,...,,,,,,,,,,
2,7/25/2023,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4.0,,,...,,,,,,,,,,
3,12/2/2022,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3.0,6.00 pm,,...,,,,,,,,,,
4,12/1/2022,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1.0,,,...,,,,,,,,,,


In [None]:
# Data Cleaning
# Remove rows where key columns like 'Road/Highway' or 'Total people confirmed dead' are missing
data_cleaned = data.dropna(subset=['Road/ Highway', 'Total people confirmed dead']).copy()


In [None]:
# Show the data after dropping NaN values
print("\nData After Dropping Missing Values in 'Road/Highway' and 'Total people confirmed dead':")
display(data_cleaned.head())  # Use display for better visualization



Data After Dropping Missing Values in 'Road/Highway' and 'Total people confirmed dead':


Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions,...,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,8/8/2023,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4.0,4.30 pm,,...,,,,,,,,,,
1,8/7/2023,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1.0,5.50 pm,,...,,,,,,,,,,
2,7/25/2023,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4.0,,,...,,,,,,,,,,
3,12/2/2022,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3.0,6.00 pm,,...,,,,,,,,,,
4,12/1/2022,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1.0,,,...,,,,,,,,,,


In [None]:
# Handling 'Time of the Accidents'
# Convert to datetime with specified format
data_cleaned.loc[:, 'Time of the Accidents'] = pd.to_datetime(data_cleaned['Time of the Accidents'], format='%I.%M %p', errors='coerce')

In [None]:
# Replace missing values with the median time
# Convert datetime to numerical format for median calculation
data_cleaned['Time of the Accidents'] = data_cleaned['Time of the Accidents'].astype('datetime64[ns]')
median_time = data_cleaned['Time of the Accidents'].median()  # Calculate median
data_cleaned.loc[:, 'Time of the Accidents'] = data_cleaned['Time of the Accidents'].fillna(median_time)

In [None]:
# Creating a binary classification column: 'Dangerous Road' (1 for dangerous, 0 for not dangerous)
data_cleaned['Dangerous Road'] = data_cleaned['Total people confirmed dead'].apply(lambda x: 1 if x >= 3 else 0)

In [None]:
# Show the data after adding the 'Dangerous Road' column
print("\nData After Adding 'Dangerous Road' Column:")
display(data_cleaned.head())  # Use display for better visualization


Data After Adding 'Dangerous Road' Column:


Unnamed: 0,Date,Accident Spot,Area,County,Road/ Highway,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions,...,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Dangerous Road
0,8/8/2023,Sobea,Sobea,Nakuru,Nakuru-Eldoret Highway,Head on Collision,Passengers,4.0,1900-01-01 16:30:00,,...,,,,,,,,,,1
1,8/7/2023,Maai-Mahiu,Naivasha,Nakuru,Maai-Mahiu Naivasha Highway,vehicle and motorcycle collision,Passengers,1.0,1900-01-01 17:50:00,,...,,,,,,,,,,0
2,7/25/2023,Ntulele,Ntulele,Narok,Narok Mai Mahiu road,Head on Collision,Drivers/Occupants,4.0,1900-01-01 16:30:00,,...,,,,,,,,,,1
3,12/2/2022,Suswa,Suswa,Narok,Narok Mai Mahiu road,Head on Collision,Driver and passengers,3.0,1900-01-01 18:00:00,,...,,,,,,,,,,1
4,12/1/2022,Mutira,Mutira,Kirinyaga,Kerugoya-Karatina Road,Run over,Pedestrian,1.0,1900-01-01 16:30:00,,...,,,,,,,,,,0


In [None]:
# Encoding categorical variables (Road/Highway)
data_cleaned = pd.get_dummies(data_cleaned, columns=['Road/ Highway'])

In [None]:
# Show the data after encoding categorical variables
print("\nData After Encoding 'Road/ Highway' Column:")
display(data_cleaned.head())  # Use display for better visualization



Data After Encoding 'Road/ Highway' Column:


Unnamed: 0,Date,Accident Spot,Area,County,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Weather conditions,Unnamed: 10,...,Road/ Highway_Nakuru Kericho Highway,Road/ Highway_Nakuru-Eldoret Highway,Road/ Highway_Namanga Road,Road/ Highway_Narok Mai Mahiu road,Road/ Highway_Naromoru Nanyuki Road,Road/ Highway_Rukenya Kimunye Road,Road/ Highway_Sagana Kagio Road,Road/ Highway_Sagana Kenol Road,Road/ Highway_Thika Kitui Highway,Road/ Highway_Thika Road
0,8/8/2023,Sobea,Sobea,Nakuru,Head on Collision,Passengers,4.0,1900-01-01 16:30:00,,,...,False,True,False,False,False,False,False,False,False,False
1,8/7/2023,Maai-Mahiu,Naivasha,Nakuru,vehicle and motorcycle collision,Passengers,1.0,1900-01-01 17:50:00,,,...,False,False,False,False,False,False,False,False,False,False
2,7/25/2023,Ntulele,Ntulele,Narok,Head on Collision,Drivers/Occupants,4.0,1900-01-01 16:30:00,,,...,False,False,False,True,False,False,False,False,False,False
3,12/2/2022,Suswa,Suswa,Narok,Head on Collision,Driver and passengers,3.0,1900-01-01 18:00:00,,,...,False,False,False,True,False,False,False,False,False,False
4,12/1/2022,Mutira,Mutira,Kirinyaga,Run over,Pedestrian,1.0,1900-01-01 16:30:00,,,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Dropping 'Weather conditions' column if exists
if 'Weather conditions' in data_cleaned.columns:
    data_cleaned = data_cleaned.drop(columns=['Weather conditions'])


In [None]:
# Show the data after dropping the irrelevant column
print("\nData After Dropping 'Weather conditions':")
display(data_cleaned.head())  # Use display for better visualization



Data After Dropping 'Weather conditions':


Unnamed: 0,Date,Accident Spot,Area,County,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Unnamed: 10,Unnamed: 11,...,Road/ Highway_Nakuru Kericho Highway,Road/ Highway_Nakuru-Eldoret Highway,Road/ Highway_Namanga Road,Road/ Highway_Narok Mai Mahiu road,Road/ Highway_Naromoru Nanyuki Road,Road/ Highway_Rukenya Kimunye Road,Road/ Highway_Sagana Kagio Road,Road/ Highway_Sagana Kenol Road,Road/ Highway_Thika Kitui Highway,Road/ Highway_Thika Road
0,8/8/2023,Sobea,Sobea,Nakuru,Head on Collision,Passengers,4.0,1900-01-01 16:30:00,,,...,False,True,False,False,False,False,False,False,False,False
1,8/7/2023,Maai-Mahiu,Naivasha,Nakuru,vehicle and motorcycle collision,Passengers,1.0,1900-01-01 17:50:00,,,...,False,False,False,False,False,False,False,False,False,False
2,7/25/2023,Ntulele,Ntulele,Narok,Head on Collision,Drivers/Occupants,4.0,1900-01-01 16:30:00,,,...,False,False,False,True,False,False,False,False,False,False
3,12/2/2022,Suswa,Suswa,Narok,Head on Collision,Driver and passengers,3.0,1900-01-01 18:00:00,,,...,False,False,False,True,False,False,False,False,False,False
4,12/1/2022,Mutira,Mutira,Kirinyaga,Run over,Pedestrian,1.0,1900-01-01 16:30:00,,,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Selecting features for classification
X = data_cleaned[['Total people confirmed dead'] + list(data_cleaned.filter(like='Road/ Highway').columns)]
y = data_cleaned['Dangerous Road']

In [None]:
# Splitting data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)

In [None]:
# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

In [None]:
# Evaluating the models
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_rf = accuracy_score(y_test, y_pred_rf)


In [None]:
# Output the accuracies and classification reports
print(f"\nDecision Tree Accuracy: {accuracy_dt * 100:.2f}%")
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 100.00%
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         7

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14



In [None]:
print(f"\nRandom Forest Accuracy: {accuracy_rf * 100:.2f}%")
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 100.00%
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         7

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14



In [None]:
# Displaying cleaned data for reference
print("\nFinal Cleaned Data for Reference:")
display(data_cleaned.head())  # Use display for better visualization


Final Cleaned Data for Reference:


Unnamed: 0,Date,Accident Spot,Area,County,Brief Accident Details/Cause,Victims,Total people confirmed dead,Time of the Accidents,Unnamed: 10,Unnamed: 11,...,Road/ Highway_Nakuru Kericho Highway,Road/ Highway_Nakuru-Eldoret Highway,Road/ Highway_Namanga Road,Road/ Highway_Narok Mai Mahiu road,Road/ Highway_Naromoru Nanyuki Road,Road/ Highway_Rukenya Kimunye Road,Road/ Highway_Sagana Kagio Road,Road/ Highway_Sagana Kenol Road,Road/ Highway_Thika Kitui Highway,Road/ Highway_Thika Road
0,8/8/2023,Sobea,Sobea,Nakuru,Head on Collision,Passengers,4.0,1900-01-01 16:30:00,,,...,False,True,False,False,False,False,False,False,False,False
1,8/7/2023,Maai-Mahiu,Naivasha,Nakuru,vehicle and motorcycle collision,Passengers,1.0,1900-01-01 17:50:00,,,...,False,False,False,False,False,False,False,False,False,False
2,7/25/2023,Ntulele,Ntulele,Narok,Head on Collision,Drivers/Occupants,4.0,1900-01-01 16:30:00,,,...,False,False,False,True,False,False,False,False,False,False
3,12/2/2022,Suswa,Suswa,Narok,Head on Collision,Driver and passengers,3.0,1900-01-01 18:00:00,,,...,False,False,False,True,False,False,False,False,False,False
4,12/1/2022,Mutira,Mutira,Kirinyaga,Run over,Pedestrian,1.0,1900-01-01 16:30:00,,,...,False,False,False,False,False,False,False,False,False,False
