## Import the libraries


In [None]:
import pandas as pd
import os
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

## Exploratory Data Analysis


In [None]:
def split_time(df):
    """Add integer ``Year``, ``Month``, ``Day`` and ``Hour`` columns to *df*.

    The values are parsed from the ``timestamp`` column, whose entries must
    start with ``<year>-<month>-<day> <hour>`` (for example
    ``2016-03-25 15:13:02``). Anything after the hour is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same *df* object, now carrying the four new columns.

    Raises
    ------
    ValueError
        If any timestamp does not start with the expected layout.
    """
    # One vectorised pass extracts all four fields; values are stringified
    # first so the ``.str`` accessor is always available.
    parts = df['timestamp'].astype(str).str.extract(r'^(\d+)-(\d+)-(\d+) (\d+)')

    unparsed = parts.isna().any(axis=1)
    if unparsed.any():
        example = df.loc[unparsed, 'timestamp'].iloc[0]
        raise ValueError(
            "timestamp values must start with 'YYYY-MM-DD HH'; "
            "got {!r}".format(example)
        )

    parts.columns = ['Year', 'Month', 'Day', 'Hour']
    # Assign one column at a time: handing a flat list to a one-element
    # column list (``df[['Hour']] = [...]``) is rejected by newer pandas
    # releases, and list-of-lists assignment breaks on an empty frame.
    for name in parts.columns:
        df[name] = parts[name].astype('int')

    return df

In [None]:
def encode(df):
    """Return a numerically encoded copy of *df*.

    ``Side`` is recoded as ``'R'`` -> 0 and ``'L'`` -> 1; any other value is
    kept unchanged. The whole frame is then multiplied by 1, which turns
    boolean columns into 0/1 integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Side`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded values.
    """
    side_codes = {'R': 0, 'L': 1}

    # Work on a copy so the caller's frame keeps its original labels.
    encoded = df.copy()

    # Mapping through a function lets pandas infer an integer dtype when every
    # label is recognised, while unknown labels pass through untouched. This
    # also makes the function safe to call on already-encoded data.
    encoded['Side'] = encoded['Side'].map(lambda side: side_codes.get(side, side))

    # Multiplying by 1 casts booleans to integers; text columns survive
    # because a string repeated once is the same string.
    return encoded * 1

In [None]:
def merge(df1, df2):
    """Left-join *df2* onto *df1* on the ``Year``/``Day``/``Month``/``Hour`` columns.

    Every row of *df1* is kept; rows without a matching key in *df2* get
    missing values in the columns that come from *df2*.
    """
    join_keys = ['Year', 'Day', 'Month', 'Hour']
    joined = df1.merge(df2, how='left', on=join_keys)
    return joined

In [None]:
def drop(df):
    """Return a new frame without the columns listed in ``unwanted`` below.

    All listed columns must be present in *df*; the input frame itself is
    left untouched.
    """
    unwanted = ['ID', 'timestamp', 'Bump', 'Lng', 'No_Exit', 'Give_Way']
    return df.drop(unwanted, axis=1)

In [None]:
def weather_preprocess(wr):
    """Clean the raw weather table and one-hot encode its condition column.

    Steps, in order:
      1. drop the ``Selected`` column;
      2. keep the first row for each ``Year``/``Day``/``Month``/``Hour`` key;
      3. fill missing numeric values with the column mean and missing
         ``Weather_Condition`` values with the most frequent one;
      4. expand ``Weather_Condition`` into dummy columns.

    Only the columns named below are carried into the result, and the
    returned frame has a fresh 0..n-1 index.
    """
    key_cols = ['Year', 'Day', 'Month', 'Hour']
    numeric_cols = key_cols + [
        'Wind_Chill(F)',
        'Precipitation(in)',
        'Temperature(F)',
        'Humidity(%)',
        'Wind_Speed(mph)',
        'Visibility(mi)',
    ]
    label_cols = ['Weather_Condition']

    # Drop the unused column, then remove repeated observations of one hour.
    unique_hours = wr.drop(columns=['Selected']).drop_duplicates(subset=key_cols)

    # Fill missing values; the imputers return plain arrays, so the column
    # names are re-attached when the frames are rebuilt.
    numeric_values = SimpleImputer(strategy='mean').fit_transform(unique_hours[numeric_cols])
    label_values = SimpleImputer(strategy='most_frequent').fit_transform(unique_hours[label_cols])

    filled = pd.concat(
        [
            pd.DataFrame(numeric_values, columns=numeric_cols),
            pd.DataFrame(label_values, columns=label_cols),
        ],
        axis=1,
    )

    # Encode categorical data
    return pd.get_dummies(filled, columns=label_cols)

In [None]:
def preprocess(df1, df2):
    """Turn a raw accident frame into the table used for modelling.

    *df1* goes through ``split_time`` and ``encode``, is joined with the
    weather frame *df2* via ``merge``, and finally has its unused columns
    removed by ``drop``.
    """
    with_time_parts = split_time(df1)
    encoded = encode(with_time_parts)
    return drop(merge(encoded, df2))

### Data Shape

In [None]:
dataset_path = '/kaggle/input/car-crashes-severity-prediction/'

# Accident records and weather observations.
df = pd.read_csv(os.path.join(dataset_path, 'train.csv'))
wr = pd.read_csv(os.path.join(dataset_path, 'weather-sfcsv.csv'))

# The holidays file is resolved through dataset_path like the CSVs above, so
# changing the data location only needs one edit.
prstree = ET.parse(os.path.join(dataset_path, 'holidays.xml'))

# Convert xml to df: one [date, description] pair per <row> element.
root = prstree.getroot()
all_items = [
    [row.find('date').text, row.find('description').text]
    for row in root.iter('row')
]
hy = pd.DataFrame(all_items, columns=['date', 'description'])

print("The shape of the dataset is {}.\n\n".format(df.shape))
print("The shape of the weather is {}.\n\n".format(wr.shape))
print("The shape of the holidays is {}.\n\n".format(hy.shape))

### Weather Preprocessing

In [None]:
# Histograms of the raw weather columns. DataFrame.hist builds its own figure
# here; handing it a single Axes for several columns makes pandas clear that
# figure and emit a UserWarning before drawing one subplot per column.
wr.hist(figsize=(25, 45))

# Replace the raw table with its deduplicated, imputed and dummy-encoded form.
wr = weather_preprocess(wr)
wr.info()

In [None]:
# Histogram of the Year column of the cleaned weather table.
fig, ax = plt.subplots(figsize=(15, 5))

wr['Year'].hist(ax=ax, bins=15)

## Training Data Preprocessing

In [None]:
# Build the modelling table from the raw accidents and the cleaned weather.
# NOTE: df is rebound to the processed frame, which no longer contains the
# 'timestamp' column that preprocess reads, so this cell cannot be re-run
# without reloading the data first.
df = preprocess(df, wr)

### EDA

In [None]:
# Summary of the processed training frame. Rows that found no weather match
# in the left join show up here as missing values.
df.info()

In [None]:
# Histograms of the processed training columns. DataFrame.hist builds its own
# figure here; handing it a single Axes for several columns makes pandas clear
# that figure and emit a UserWarning before drawing one subplot per column.
df.hist(figsize=(25, 45))

In [None]:
# How often each value of the Stop column occurs.
stop_counts = df['Stop'].value_counts()
print(stop_counts)

## Data Splitting


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on Severity so both
# splits keep the same class proportions.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Severity'],
)

# Separate the features from the target in each split.
X_train, y_train = train_df.drop(columns=['Severity']), train_df['Severity']
X_val, y_val = val_df.drop(columns=['Severity']), val_df['Severity']

## Model Training



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build a depth-2 random forest with a fixed seed and fit it on the training
# split; fit() returns the estimator itself, so construction and training can
# be chained.
classifier = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out validation split.
val_accuracy = classifier.score(X_val, y_val)
print("The accuracy of the classifier on the validation set is ", val_accuracy)

## Submission File Generation



In [None]:
# Load the test split from the same dataset folder and preview its first rows.
test_path = os.path.join(dataset_path, 'test.csv')
test_df = pd.read_csv(test_path)
test_df.head()

In [None]:
# Run the test rows through the same pipeline as the training data.
# preprocess returns a separate frame, so test_df keeps the ID column that the
# submission step needs.
X_test = preprocess(test_df, wr)

In [None]:
# Predict a Severity value for every preprocessed test row ...
y_test_predicted = classifier.predict(X_test)
# ... and attach the predictions to the raw test frame, which still holds ID.
test_df['Severity'] = y_test_predicted

In [None]:
# Write only the ID and predicted Severity columns, without the index.
submission = test_df[['ID', 'Severity']]
submission.to_csv('/kaggle/working/submission.csv', index=False)