In [1]:
import pandas as pd
import numpy as np 
import plotly.express as px 
import sklearn as skl 
import folium as flm 
import matplotlib.pyplot as plt 

In [2]:
from datetime import datetime

In [3]:
df=pd.read_csv('Data-Collisions.csv',low_memory=False)
df.head()

Unnamed: 0,SEVERITYCODE,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,LOCATION,EXCEPTRSNCODE,EXCEPTRSNDESC,SEVERITYCODE.1,SEVERITYDESC,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,INCDATE,INCDTTM,JUNCTIONTYPE,SDOT_COLCODE,SDOT_COLDESC,INATTENTIONIND,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,-122.323148,47.70314,1,1307,1307,3502005,Matched,Intersection,37475.0,5TH AVE NE AND NE 103RD ST,,,2,Injury Collision,Angles,2,0,0,2,2013/03/27 00:00:00+00,3/27/2013 2:54:00 PM,At Intersection (intersection related),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END ...",,N,Overcast,Wet,Daylight,,,,10,Entering at angle,0.0,0.0,N
1,1,-122.347294,47.647172,2,52200,52200,2607959,Matched,Block,,AURORA BR BETWEEN RAYE ST AND BRIDGE WAY N,,,1,Property Damage Only Collision,Sideswipe,2,0,0,2,2006/12/20 00:00:00+00,12/20/2006 6:55:00 PM,Mid-Block (not related to intersection),16.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, LEFT SIDE ...",,0,Raining,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0.0,0.0,N
2,1,-122.33454,47.607871,3,26700,26700,1482393,Matched,Block,,4TH AVE BETWEEN SENECA ST AND UNIVERSITY ST,,,1,Property Damage Only Collision,Parked Car,4,0,0,3,2004/11/18 00:00:00+00,11/18/2004 10:20:00 AM,Mid-Block (not related to intersection),14.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, REAR END",,0,Overcast,Dry,Daylight,,4323031.0,,32,One parked--one moving,0.0,0.0,N
3,1,-122.334803,47.604803,4,1144,1144,3503937,Matched,Block,,2ND AVE BETWEEN MARION ST AND MADISON ST,,,1,Property Damage Only Collision,Other,3,0,0,3,2013/03/29 00:00:00+00,3/29/2013 9:26:00 AM,Mid-Block (not related to intersection),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END ...",,N,Clear,Dry,Daylight,,,,23,From same direction - all others,0.0,0.0,N
4,2,-122.306426,47.545739,5,17700,17700,1807429,Matched,Intersection,34387.0,SWIFT AVE S AND SWIFT AV OFF RP,,,2,Injury Collision,Angles,2,0,0,2,2004/01/28 00:00:00+00,1/28/2004 8:04:00 AM,At Intersection (intersection related),11.0,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END ...",,0,Raining,Wet,Daylight,,4028032.0,,10,Entering at angle,0.0,0.0,N


In [4]:
df.columns

Index(['SEVERITYCODE', 'X', 'Y', 'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO',
       'STATUS', 'ADDRTYPE', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE',
       'EXCEPTRSNDESC', 'SEVERITYCODE.1', 'SEVERITYDESC', 'COLLISIONTYPE',
       'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE',
       'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE', 'SDOT_COLDESC',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SDOTCOLNUM', 'SPEEDING', 'ST_COLCODE', 'ST_COLDESC',
       'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR'],
      dtype='object')

First, let's explore the data. Let's select the repeated features ( like severity decription and severitycode.1 ) or those judged not important before any further analysis. 

Note that for every dropped feature,  verification  of its redundance and meaning has been done. 



In [5]:
df.drop(['INCKEY','OBJECTID','COLLISIONTYPE','COLDETKEY','REPORTNO','STATUS','INTKEY','LOCATION','EXCEPTRSNCODE','EXCEPTRSNDESC','SEVERITYCODE.1','SEVERITYDESC','INCDATE','SDOT_COLDESC','PEDROWNOTGRNT','SDOTCOLNUM','SEGLANEKEY','CROSSWALKKEY','ST_COLDESC'],axis=1,inplace=True)

Here are the columns after deletion 

In [6]:
df.columns 

Index(['SEVERITYCODE', 'X', 'Y', 'ADDRTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'SPEEDING', 'ST_COLCODE', 'HITPARKEDCAR'],
      dtype='object')

In [7]:
#Let's drop the duplicates, in fact there are duplictes but as the OBJECTID feature has been added as an index, we couldn't see them.
df.drop_duplicates( keep='first', inplace=True, ignore_index=False)

In [8]:
df.shape

(27627, 19)

In [9]:
df.isna().sum()

SEVERITYCODE          0
X                   657
Y                   657
ADDRTYPE            119
PERSONCOUNT           0
PEDCOUNT              0
PEDCYLCOUNT           0
VEHCOUNT              0
INCDTTM               1
JUNCTIONTYPE        233
SDOT_COLCODE          1
INATTENTIONIND    23835
UNDERINFL           138
WEATHER             143
ROADCOND            142
LIGHTCOND           145
SPEEDING          26106
ST_COLCODE            1
HITPARKEDCAR          1
dtype: int64

In [10]:
194623-5318

189305

Let's deal with missing values ! 

We drop the to columns INATTETNIONIND and SPEEDING as they contain very few known values. 

We see that if we drop all the rows having missing values, only 7.49% of the dataframe will be deleted. 

We choose to drop all missing values as we assume that it will hinder the accuracy of the dataframe less than replacing it with approximated values.  

In [11]:
df.drop(['INATTENTIONIND','SPEEDING'],axis=1,inplace=True)
print((df.shape[0]-df.dropna(axis=0,inplace=False).shape[0])/df.shape[0]*100)
df.dropna(axis=0,inplace=True)

3.094798566619611


In [12]:
df.shape

(26772, 17)

## Feature engineering 

In [13]:
df.columns

Index(['SEVERITYCODE', 'X', 'Y', 'ADDRTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'ST_COLCODE',
       'HITPARKEDCAR'],
      dtype='object')

In [14]:
#Addrtype collisiontype junctiontype sdotscolcode weather roadcond lightcond : one hot encoder 
#underinflhitparkedcar binary --done
#pdcount + pedcycl -> pedcount, pedcycl  + binary   
#dttm get hour 

## One Hot Encoding 
  

In [15]:
def one_hot_encoder(column_name,data):
    data=pd.concat([data, pd.get_dummies(data[column_name])],axis=1)
    data.drop([column_name,data.columns[-1]],inplace=True,axis=1)
    return data


In [16]:
columns_to_encode=['ADDRTYPE', 'JUNCTIONTYPE' , 'HITPARKEDCAR' , 'UNDERINFL' ,'SDOT_COLCODE','WEATHER', 'ROADCOND' ,'LIGHTCOND']

In [17]:
for i in columns_to_encode: 
    print(df[i].unique())

['Intersection' 'Block']
['At Intersection (intersection related)'
 'Mid-Block (not related to intersection)' 'Driveway Junction'
 'At Intersection (but not related to intersection)'
 'Mid-Block (but intersection related)' 'Ramp Junction']
['N' 'Y']
['N' '0' '1' 'Y']
[11. 16. 14. 51. 13. 28. 18. 34.  0. 24. 69. 26. 12. 21. 23. 29. 48. 15.
 55. 33. 56. 54. 31. 44. 32. 27. 25. 66. 36. 64. 22. 58. 47. 61. 35.]
['Overcast' 'Raining' 'Clear' 'Unknown' 'Other' 'Snowing' 'Fog/Smog/Smoke'
 'Sleet/Hail/Freezing Rain' 'Blowing Sand/Dirt' 'Severe Crosswind']
['Wet' 'Dry' 'Unknown' 'Snow/Slush' 'Ice' 'Other' 'Sand/Mud/Dirt'
 'Standing Water' 'Oil']
['Daylight' 'Dark - Street Lights On' 'Dark - No Street Lights' 'Unknown'
 'Dusk' 'Dawn' 'Dark - Street Lights Off' 'Other'
 'Dark - Unknown Lighting']


We notice that : 
    - we will not have problems with new column names after encoding only with :
           - the 3rd and 4th column as they have same values
           - the values 'other' and 'Unknown' in any feature.
    - the 4th columns is not well formatted.
We will deal encounter this in the cell below. 

In [18]:
#We gather the features havinf 'Unknown' and 'Other' among their values in a list : 
lst=[]
for i in columns_to_encode: 
    if ('Other' in df[i].unique() ) or ( 'Unknown' in df[i].unique()):
        lst.append(i)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [19]:
#We replace the "Unknown" and 'Other' by 'Unknown+feature name' and 'Other+feature name'
df['UNDERINFL'].replace(to_replace=['N','0','Y','1'], value=['NOinfl','NOinfl','YESinfl','YESinfl'],inplace=True)
df['HITPARKEDCAR'].replace(to_replace=['N','Y'],value=['No','Yes'],inplace=True)
for i in lst:
    df[i].replace(to_replace=['Unknown','Other'],value=['Unknown'+str(i),'Other'+str(i)],inplace=True)

Now we add a binary feature saying if any pedestrian ( in a bicycle or not ) was concerned with the accident : 


In [20]:
ped=[]
for i in df.index: 
    if ((df.loc[i]['PEDCOUNT']!=0) or (df.loc[i]['PEDCYLCOUNT']!=0)):
        ped.append(1)
    else:
        ped.append(0)
df['PEDESTRIAN']=ped

In [21]:
df.isna().sum()

SEVERITYCODE    0
X               0
Y               0
ADDRTYPE        0
PERSONCOUNT     0
PEDCOUNT        0
PEDCYLCOUNT     0
VEHCOUNT        0
INCDTTM         0
JUNCTIONTYPE    0
SDOT_COLCODE    0
UNDERINFL       0
WEATHER         0
ROADCOND        0
LIGHTCOND       0
ST_COLCODE      0
HITPARKEDCAR    0
PEDESTRIAN      0
dtype: int64

In [22]:
df['Hour']=pd.to_datetime(df['INCDTTM']).dt.hour

In [23]:
df.columns

Index(['SEVERITYCODE', 'X', 'Y', 'ADDRTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INCDTTM', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'ST_COLCODE',
       'HITPARKEDCAR', 'PEDESTRIAN', 'Hour'],
      dtype='object')

In [24]:
#And finally let's encode the categorical data 
for i in columns_to_encode:
    df=one_hot_encoder(i,data=df)


In [25]:
df.drop(['INCDTTM','ST_COLCODE'],axis=1,inplace=True)

And now we get the final data that we will use to model : 
let's store it in a new variable. 


In [26]:
data_prepared=df


# DATA MODELING 

Now that we have the data validated and cleaned. We will do some machine learning by applying diffferent known models for classification. 

The models we will be using are :  
 - K-Nearest Neighbors 
 - Decision Trees 
 - Logistic Regression 
 - Support Vector Machine 
 
Finally we will combine the 4 models, and for every row we take the most common result of the 4 models. 


Let's get started : 

Preprocessing : 

In [None]:
df.columns

Index([                                     'SEVERITYCODE',
                                                       'X',
                                                       'Y',
                                             'PERSONCOUNT',
                                                'PEDCOUNT',
                                             'PEDCYLCOUNT',
                                                'VEHCOUNT',
                                              'PEDESTRIAN',
                                                    'Hour',
                                                   'Block',
       'At Intersection (but not related to intersection)',
                  'At Intersection (intersection related)',
                                       'Driveway Junction',
                    'Mid-Block (but intersection related)',
                 'Mid-Block (not related to intersection)',
                                           'Ramp Junction',
                                        

In [27]:
#We will replace severity code values ( 1 and 2 ) by (0 and 1). This would help in logistic regression and it's better-presented
df['SEVERITYCODE'].replace(to_replace=[1,2],value=[0,1],inplace=True)
X = df[df.columns[1:]]
y=df['SEVERITYCODE'].values

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2,random_state=4)

In [29]:
from sklearn import preprocessing
X= preprocessing.StandardScaler().fit(X).transform(X)

## Random Forest Classfier

In [30]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [31]:
rfc= RFC(criterion='entropy').fit(X_train,Y_train)
yhatrfc=rfc.predict(X_test)
train_score= skl.metrics.accuracy_score(Y_train,rfc.predict(X_train))
test_score= skl.metrics.accuracy_score(Y_test,yhatrfc)
print('train : ', train_score , ' test : ', test_score)
print('the predicted values are : ' )
print(yhatrfc)

train :  0.9970117196619508  test :  0.719327731092437
the predicted values are : 
[0 1 0 ... 1 0 1]


## KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [None]:

score={}
for k in range(1,277):
    neigh= KNN(n_neighbors=k).fit(X_train,Y_train)
    yhat=neigh.predict(X_test)
    train_score= skl.metrics.accuracy_score(Y_train,neigh.predict(X_train))
    test_score= skl.metrics.accuracy_score(Y_test,yhat)
    score[k]= test_score
    print(k, 'train : ', train_score , ' test : ', test_score)
best_score=max(score.values())
#finding the k : 
for i,j in score.items():
    if j == best_score: 
        best_k=i
        break
print('the best k is', best_k)


1 train :  0.9971517953027969  test :  0.6692810457516339
2 train :  0.8328430685903722  test :  0.7090569561157797
3 train :  0.8336368305551665  test :  0.6907563025210084
4 train :  0.7983377690619601  test :  0.7090569561157797
5 train :  0.800672363076061  test :  0.7092436974789916
6 train :  0.7836298267731242  test :  0.7249299719887955
7 train :  0.7859644207872251  test :  0.7187675070028011
8 train :  0.7764392772096932  test :  0.7303454715219421
9 train :  0.7774664985758977  test :  0.7275443510737628
10 train :  0.770929635336415  test :  0.7368814192343605
11 train :  0.7689218844842882  test :  0.7299719887955182
12 train :  0.7652332259420087  test :  0.7344537815126051
13 train :  0.7678479712378018  test :  0.7295985060690943
14 train :  0.7625717887659336  test :  0.7338935574229691
15 train :  0.7641593126955223  test :  0.7292250233426704
16 train :  0.7593500490264743  test :  0.7333333333333333
17 train :  0.761124340477191  test :  0.7277310924369748
18 train 

In [None]:
df.head()

Unnamed: 0,SEVERITYCODE,X,Y,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,VEHCOUNT,PEDESTRIAN,Hour,Block,At Intersection (but not related to intersection),At Intersection (intersection related),Driveway Junction,Mid-Block (but intersection related),Mid-Block (not related to intersection),Ramp Junction,No,NOinfl,0,11,12,13,14,15,16,18,21,22,23,24,25,26,27,28,29,31,32,33,34,35,...,46,47,48,51,52,53,54,55,56,58,61,64,66,68,Blowing Sand/Dirt,Clear,Fog/Smog/Smoke,OtherWEATHER,Overcast,Partly Cloudy,Raining,Severe Crosswind,Sleet/Hail/Freezing Rain,Snowing,Dry,Ice,Oil,OtherROADCOND,Sand/Mud/Dirt,Snow/Slush,Standing Water,UnknownROADCOND,Dark - No Street Lights,Dark - Street Lights Off,Dark - Street Lights On,Dark - Unknown Lighting,Dawn,Daylight,Dusk,OtherLIGHTCOND
0,1,-122.323148,47.70314,2,0,0,2,0,14,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,-122.347294,47.647172,2,0,0,2,0,18,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,-122.33454,47.607871,4,0,0,3,0,10,1,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,-122.334803,47.604803,3,0,0,3,0,9,1,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,-122.306426,47.545739,2,0,0,2,0,8,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [33]:
neigh= KNN(n_neighbors=40).fit(X_train,Y_train)
yhat=neigh.predict(X_test)
train_score= skl.metrics.accuracy_score(Y_train,neigh.predict(X_train))
test_score= skl.metrics.accuracy_score(Y_test,yhat)
print('train : ', train_score , ' test : ', test_score)


train :  0.746416398188355  test :  0.734827264239029


In [34]:
yhatknn=yhat

In [None]:
'''
#Now the best k is found: let's use it. 
neigh= KNN(n_neighbors=7).fit(X_train,Y_train)
yhatknn=neigh.predict(X_test)
print( 'he predicted result is : ' , yhatknn)
'''

NameError: ignored

## Decision Trees


In [35]:
from sklearn.tree import DecisionTreeClassifier as DTC

In [36]:
score={}
for k in range(1,50):
    X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2,random_state=4)
    tree= DTC(criterion='entropy',max_depth=k).fit(X_train,Y_train)
    yhat=tree.predict(X_test)
    train_score= skl.metrics.accuracy_score(Y_train,tree.predict(X_train))
    test_score= skl.metrics.accuracy_score(Y_test,yhat)
    score[k]= test_score
    print(k, 'train : ', train_score , ' test : ', test_score)
best_score=max(score.values())
#finding the k : 
for i,j in score.items():
    if j == best_score: 
        best_k=i
        break
print('the best k is', best_k)

1 train :  0.7359107251249007  test :  0.7355742296918768
2 train :  0.7359107251249007  test :  0.7355742296918768
3 train :  0.7359107251249007  test :  0.7355742296918768
4 train :  0.7360974926460289  test :  0.7352007469654529
5 train :  0.7398328430685903  test :  0.7337068160597572
6 train :  0.7420273614418452  test :  0.73874883286648
7 train :  0.7457160199841247  test :  0.734267040149393
8 train :  0.748657608441892  test :  0.7323996265172735
9 train :  0.7511322780968389  test :  0.7323996265172735
10 train :  0.7557547742447588  test :  0.7277310924369748
11 train :  0.7623850212448056  test :  0.7260504201680672
12 train :  0.7696222626885185  test :  0.7256769374416433
13 train :  0.7788205631040762  test :  0.7211951447245565
14 train :  0.7878320959985059  test :  0.7197012138188609
15 train :  0.79810430966055  test :  0.7118580765639589
16 train :  0.8096838959704907  test :  0.7112978524743231
17 train :  0.822710930569174  test :  0.7092436974789916
18 train :  0

In [37]:
tree= DTC(criterion='entropy',max_depth=1).fit(X_train,Y_train)
yhatdtc=tree.predict(X_test)
print( 'The predicted result is : ' , yhatdtc)

The predicted result is :  [0 0 0 ... 0 0 1]


## Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression as log
from sklearn.metrics import confusion_matrix

In [39]:
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.2,random_state=4)
LR= log().fit(X_train,Y_train)
yhatlr=LR.predict(X_test)
train_score= skl.metrics.accuracy_score(Y_train,LR.predict(X_train))
test_score= skl.metrics.accuracy_score(Y_test,yhat)
print('train score : ', train_score , ' test score : ', test_score)
print(" the predicted labels are ", yhatlr)



train score :  0.7421674370826913  test score :  0.6728291316526611
 the predicted labels are  [0 0 0 ... 0 0 1]



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [46]:
yhat=[]
for i in range(len(yhatknn)):
    if ((yhatknn[i]+yhatdtc[i]+yhatlr[i]+yhatrfc[i])>2):
        yhat.append(1)
    elif ((yhatknn[i]+yhatdtc[i]+yhatlr[i]+yhatrfc[i])<2):
        yhat.append(0)
    else:
        yhat.append(np.random.randint(2))

    
    


In [50]:
yhat

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [45]:
len(yhatrfc)

5355

let's see how the models performs with f1-score and jaccard similarity 

In [55]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score
jaccard_KNN=jaccard_similarity_score(Y_test, yhat)
f1=f1_score(Y_test, yhat, average='binary')
print('Jaccard:',jaccard_KNN)
print('F1-Score: ' ,f1)


Jaccard: 0.7368814192343605
F1-Score:  0.33880807132801505



jaccard_similarity_score has been deprecated and replaced with jaccard_score. It will be removed in version 0.23. This implementation has surprising behavior for binary and multiclass classification tasks.



In [57]:
test_score= skl.metrics.accuracy_score(Y_test,yhat)
print('score après combinaison: ', test_score)

score après combinaison:  0.7368814192343605
