# **PROBLEM STATEMENT:**

# ** To predict the Crash incident reported to the police in Washington County from 2011 to 2015.**

In [16]:
import pandas as pd
import numpy as np

municipal_code_df = pd.read_csv('/content/washingtonmunicipalcode.csv')
police_agency_code_df = pd.read_csv('/content/washingtonpoliceagencycode.csv')
crash_2011_df = pd.read_csv('/content/2011washington.csv')
crash_2012_df = pd.read_csv('/content/2012washington.csv')
crash_2014_df = pd.read_csv('/content/2014washington.csv')
crash_2015_df = pd.read_csv('/content/2015washington.csv')
crash_additional_df = pd.read_csv('/content/aeeb1559-a410-4c30-ab3f-0dc20c58bea1.csv')

municipal_code_df.head(), police_agency_code_df.head(), crash_2011_df.head()

(    Code              Municipality
 0  62216           Morris Township
 1  62217   Mount Pleasant Township
 2  62218  North Bethlehem Township
 3  62219   North Franklin Township
 4  62220   North Strabane Township,
     Code            Policy Agency
 0  62101  East Bethlehem Township
 1  62205         Carroll Township
 2  62206           Cecil Township
 3  62207       Chartiers Township
 4  62209         Donegal Township,
     CRASH_CRN  DISTRICT  CRASH_COUNTY  MUNICIPALITY POLICE_AGCY  CRASH_YEAR  \
 0  2011000084        12            62         62424       62301        2011   
 1  2011001835        12            62         62302       68B01        2011   
 2  2011001997        12            62         62302       62302        2011   
 3  2011002227        12            62         62302       62302        2011   
 4  2011002261        12            62         62403       68B02        2011   
 
    CRASH_MONTH  DAY_OF_WEEK  TIME_OF_DAY  HOUR_OF_DAY  ...  ACCESS_CTRL  \
 0            

In [11]:
#Data Cleaning and Data  Preprocessing

# Merge crash data from different years
crash_data = pd.concat([crash_2011_df, crash_2012_df, crash_2014_df, crash_2015_df, crash_additional_df], ignore_index=True)

# Ensure POLICE_AGCY is of the same type in both dataframes,string
crash_data['POLICE_AGCY'] = crash_data['POLICE_AGCY'].astype(str)
police_agency_code_df['Code'] = police_agency_code_df['Code'].astype(str)

# Enrich the crash data by adding information municipal and police agency codes
crash_data = crash_data.merge(municipal_code_df, left_on='MUNICIPALITY', right_on='Code', how='left')
crash_data = crash_data.merge(police_agency_code_df, left_on='POLICE_AGCY', right_on='Code', how='left')

crash_data.drop(columns=['Code_x', 'Code_y'], inplace=True)

# Convert date-related columns to appropriate types
crash_data['CRASH_DATE'] = pd.to_datetime(crash_data['CRASH_YEAR'].astype(str) + '-' + crash_data['CRASH_MONTH'].astype(str) + '-01')

# Missing values
crash_data.isnull().sum()

Unnamed: 0,0
CRASH_CRN,0
DISTRICT,0
CRASH_COUNTY,0
MUNICIPALITY,0
POLICE_AGCY,0
...,...
STREET_NAME,62
_id,8025
Municipality,0
Policy Agency,5018


In [12]:
#Predictive Modeling

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Subset of columns used to predict the target variable
features = ['WEATHER', 'ROAD_CONDITION', 'HOUR_OF_DAY', 'DAY_OF_WEEK']
X = crash_data[features]
y = crash_data['MAX_SEVERITY_LEVEL'] #want to predict

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model,binary classification
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train) #Train the model on the training data.

# Make predictions,Use the trained model to predict MAX_SEVERITY_LEVEL for the test set.
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy, report

(0.5163224516988674,
 '              precision    recall  f1-score   support\n\n           0       0.52      1.00      0.68      1550\n           1       0.00      0.00      0.00        41\n           2       0.00      0.00      0.00       103\n           3       0.00      0.00      0.00       231\n           4       0.00      0.00      0.00       628\n           8       0.00      0.00      0.00       370\n           9       0.00      0.00      0.00        79\n\n    accuracy                           0.52      3002\n   macro avg       0.07      0.14      0.10      3002\nweighted avg       0.27      0.52      0.35      3002\n')