In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import scipy
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from pylab import rcParams
rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
from datetime import datetime
import seaborn as sns

In [26]:
df = pd.read_csv('ml project.csv')

In [27]:
df.head()

Unnamed: 0,Accident_Index,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,...,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,200501BS00001,A,3218.0,,0.0,Serious,,04-01-2005,Tuesday,1.0,...,Metropolitan Police,Wet or damp,Single carriageway,,30,17:42,Urban,Raining no high winds,2005,No
1,200501BS00002,B,450.0,C,0.0,Slight,,05-01-2005,Wednesday,1.0,...,Metropolitan Police,Dry,Dual carriageway,,30,17:36,Urban,Fine no high winds,2005,No
2,200501BS00003,C,0.0,,0.0,Slight,,06-01-2005,Thursday,1.0,...,Metropolitan Police,Dry,Single carriageway,,30,00:15,Urban,Fine no high winds,2005,No
3,200501BS00004,A,3220.0,,0.0,Slight,,07-01-2005,Friday,1.0,...,Metropolitan Police,Dry,Single carriageway,,30,10:35,Urban,Fine no high winds,2005,No
4,200501BS00005,Unclassified,0.0,,0.0,Slight,,10-01-2005,Monday,1.0,...,Metropolitan Police,Wet or damp,Single carriageway,,30,21:13,Urban,Fine no high winds,2005,No


In [28]:
df.dtypes

Accident_Index                                  object
1st_Road_Class                                  object
1st_Road_Number                                float64
2nd_Road_Class                                  object
2nd_Road_Number                                float64
Accident_Severity                               object
Carriageway_Hazards                             object
Date                                            object
Day_of_Week                                     object
Did_Police_Officer_Attend_Scene_of_Accident    float64
Junction_Control                                object
Junction_Detail                                 object
Latitude                                       float64
Light_Conditions                                object
Local_Authority_(District)                      object
Local_Authority_(Highway)                       object
Location_Easting_OSGR                          float64
Location_Northing_OSGR                         float64
Longitude 

# Data Cleaning

In [29]:
df["Junction_Control"].unique()

array(['Data missing or out of range', 'Auto traffic signal',
       'Give way or uncontrolled', 'Stop sign', 'Authorised person',
       'Not at junction or within 20 metres'], dtype=object)

In [30]:
df["Junction_Control"].value_counts()

Give way or uncontrolled               502984
Data missing or out of range           353871
Auto traffic signal                    105838
Not at junction or within 20 metres     76916
Stop sign                                7128
Authorised person                        1838
Name: Junction_Control, dtype: int64

In [31]:
df.drop("Junction_Control", axis=1, inplace = True)

In [32]:
df["2nd_Road_Class"].unique()

array([nan, 'C', 'Unclassified', 'B', 'A', 'Motorway', 'A(M)'],
      dtype=object)

In [33]:
df["2nd_Road_Class"].value_counts()

Unclassified    411064
A               102148
C                47542
B                40221
Motorway          6980
A(M)               796
Name: 2nd_Road_Class, dtype: int64

In [34]:
df.drop("2nd_Road_Class", axis=1, inplace = True)

# Duplicate Features

In [35]:
# LSOA_of_Accident_Location
df.drop("Location_Northing_OSGR", axis=1, inplace = True)
df.drop("Location_Easting_OSGR", axis=1, inplace = True)
df.drop("1st_Road_Class", axis=1, inplace = True)
df.drop("1st_Road_Number", axis=1, inplace = True)
df.drop("2nd_Road_Number", axis=1, inplace = True)
df.drop("Local_Authority_(District)", axis=1, inplace = True)
df.drop("Local_Authority_(Highway)", axis=1, inplace = True)

In [36]:
df.head()

Unnamed: 0,Accident_Index,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,Junction_Detail,Latitude,Light_Conditions,Longitude,...,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,200501BS00001,Serious,,04-01-2005,Tuesday,1.0,Not at junction or within 20 metres,51.489096,Daylight,-0.19117,...,Metropolitan Police,Wet or damp,Single carriageway,,30,17:42,Urban,Raining no high winds,2005,No
1,200501BS00002,Slight,,05-01-2005,Wednesday,1.0,Crossroads,51.520075,Darkness - lights lit,-0.211708,...,Metropolitan Police,Dry,Dual carriageway,,30,17:36,Urban,Fine no high winds,2005,No
2,200501BS00003,Slight,,06-01-2005,Thursday,1.0,Not at junction or within 20 metres,51.525301,Darkness - lights lit,-0.206458,...,Metropolitan Police,Dry,Single carriageway,,30,00:15,Urban,Fine no high winds,2005,No
3,200501BS00004,Slight,,07-01-2005,Friday,1.0,Not at junction or within 20 metres,51.482442,Daylight,-0.173862,...,Metropolitan Police,Dry,Single carriageway,,30,10:35,Urban,Fine no high winds,2005,No
4,200501BS00005,Slight,,10-01-2005,Monday,1.0,Not at junction or within 20 metres,51.495752,Darkness - lighting unknown,-0.156618,...,Metropolitan Police,Wet or damp,Single carriageway,,30,21:13,Urban,Fine no high winds,2005,No


# Labelling

In [37]:
from sklearn.preprocessing import LabelEncoder

In [38]:
label_encoder=LabelEncoder()

In [39]:
df.dtypes

Accident_Index                                  object
Accident_Severity                               object
Carriageway_Hazards                             object
Date                                            object
Day_of_Week                                     object
Did_Police_Officer_Attend_Scene_of_Accident    float64
Junction_Detail                                 object
Latitude                                       float64
Light_Conditions                                object
Longitude                                      float64
LSOA_of_Accident_Location                       object
Number_of_Casualties                             int64
Number_of_Vehicles                               int64
Pedestrian_Crossing-Human_Control              float64
Pedestrian_Crossing-Physical_Facilities        float64
Police_Force                                    object
Road_Surface_Conditions                         object
Road_Type                                       object
Special_Co

In [40]:
df["Accident_Severity"]=label_encoder.fit_transform(df["Accident_Severity"])
df["Carriageway_Hazards"]=label_encoder.fit_transform(df["Carriageway_Hazards"])
df["Day_of_Week"]=label_encoder.fit_transform(df["Day_of_Week"])
df["Junction_Detail"]=label_encoder.fit_transform(df["Junction_Detail"])
df["Light_Conditions"]=label_encoder.fit_transform(df["Light_Conditions"])
df["LSOA_of_Accident_Location"]=label_encoder.fit_transform(df["LSOA_of_Accident_Location"])
df["Police_Force"]=label_encoder.fit_transform(df["Police_Force"])
df["Road_Surface_Conditions"]=label_encoder.fit_transform(df["Road_Surface_Conditions"])
df["Road_Type"]=label_encoder.fit_transform(df["Road_Type"])
df["Special_Conditions_at_Site"]=label_encoder.fit_transform(df["Special_Conditions_at_Site"])
df["Urban_or_Rural_Area"]=label_encoder.fit_transform(df["Urban_or_Rural_Area"])
df["Weather_Conditions"]=label_encoder.fit_transform(df["Weather_Conditions"])
df["InScotland"]=label_encoder.fit_transform(df["InScotland"])

In [41]:
df.head()

Unnamed: 0,Accident_Index,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,Junction_Detail,Latitude,Light_Conditions,Longitude,...,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
0,200501BS00001,1,2,04-01-2005,5,1.0,4,51.489096,4,-0.19117,...,29,5,3,4,30,17:42,2,6,2005,0
1,200501BS00002,2,2,05-01-2005,6,1.0,0,51.520075,1,-0.211708,...,29,1,0,4,30,17:36,2,2,2005,0
2,200501BS00003,2,2,06-01-2005,4,1.0,4,51.525301,1,-0.206458,...,29,1,3,4,30,00:15,2,2,2005,0
3,200501BS00004,2,2,07-01-2005,0,1.0,4,51.482442,4,-0.173862,...,29,1,3,4,30,10:35,2,2,2005,0
4,200501BS00005,2,2,10-01-2005,1,1.0,4,51.495752,0,-0.156618,...,29,5,3,4,30,21:13,2,2,2005,0


In [42]:
df.to_csv('ml_labelled_Data.csv')

In [43]:
db=pd.read_csv('ml_labelled_Data.csv')

In [47]:
#Create independent and Dependent Features
columns = db.columns.tolist()
# Filter the columns to remove data we do not want 
columns = [c for c in columns if c not in ["Accident_Severity"]]
# Store the variable we are predicting 
target = "Accident_Severity"
# Define a random state 
state = np.random.RandomState(42)
X = db[columns]
Y = db[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

(1048575, 25)
(1048575,)


In [50]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

ImportError: cannot import name '_ClassNamePrefixFeaturesOutMixin' from 'sklearn.base' (C:\Users\polis\anaconda3\lib\site-packages\sklearn\base.py)

In [49]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
     ------------------------------------ 199.3/199.3 kB 390.7 kB/s eta 0:00:00
Collecting scikit-learn>=1.1.0
  Downloading scikit_learn-1.1.3-cp39-cp39-win_amd64.whl (7.6 MB)
     ---------------------------------------- 7.6/7.6 MB 6.1 MB/s eta 0:00:00
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed imbalanced-learn-0.9.1 imblearn-0.0 scikit-learn-1.1.3


