### IMPORTING THE NECESSARY TOOLS

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder,MinMaxScaler
from sklearn.metrics import accuracy_score
import pickle

Reading the data

In [2]:
# Load the raw crime records from the CSV file into a DataFrame
df2 = pd.read_csv(filepath_or_buffer='crime.csv')

In [3]:
# Month names in calendar order; the next cell turns this list into a
# month-number -> month-name lookup
month = [
    'january', 'february', 'march', 'april',
    'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december',
]

In [4]:
# Month-number -> month-name lookup (1 -> 'january', ..., 12 -> 'december')
mapper = dict(enumerate(month, start=1))

In [5]:
# Display the month lookup to confirm the numbering starts at 1
mapper

{1: 'january',
 2: 'february',
 3: 'march',
 4: 'april',
 5: 'may',
 6: 'june',
 7: 'july',
 8: 'august',
 9: 'september',
 10: 'october',
 11: 'november',
 12: 'december'}

### DATA PREPROCESSING

PREPROCESSING THE DATAFRAME

Note that all of the places in this dataset are in Canada.

In [6]:
# Preview the first five rows of the raw data
df2.head(n=5)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763


In [7]:
# Count the missing values in every column of the raw data
df2.isna().sum()

TYPE                 0
YEAR                 0
MONTH                0
DAY                  0
HOUR             54362
MINUTE           54362
HUNDRED_BLOCK       13
NEIGHBOURHOOD    56624
X                    0
Y                    0
Latitude             0
Longitude            0
dtype: int64

In [8]:
# Keep only the rows whose NEIGHBOURHOOD is known.
# .copy() makes the result an independent frame, so the column assignments
# made on df2 afterwards do not raise SettingWithCopyWarning.
df2 = df2[df2['NEIGHBOURHOOD'].notna()].copy()

In [9]:
# Lower-case the crime type and neighbourhood strings, and replace the month
# number with its name using the mapper built earlier
df2['TYPE'] = df2['TYPE'].map(str.lower)
df2['MONTH'] = df2['MONTH'].map(mapper)
df2['NEIGHBOURHOOD'] = df2['NEIGHBOURHOOD'].map(str.lower)

In [10]:

# Keep only the columns used for modelling
df2 = df2.loc[:, ['TYPE', 'YEAR', 'MONTH', 'DAY', 'NEIGHBOURHOOD']]

In [11]:
# Rename NEIGHBOURHOOD to AREA; the other column names stay as they are
df2 = df2.rename(columns={'NEIGHBOURHOOD': 'AREA'})

In [12]:
# Drop duplicated rows, keeping the first occurrence of each
df2 = df2[~df2.duplicated()]

In [13]:
# Count how many rows each crime type (the target variable) has
df2['TYPE'].value_counts()

theft from vehicle                                        65746
mischief                                                  43245
break and enter residential/other                         42581
other theft                                               29261
theft of vehicle                                          28039
break and enter commercial                                23540
vehicle collision or pedestrian struck (with injury)      19069
theft of bicycle                                          17626
vehicle collision or pedestrian struck (with fatality)      254
Name: TYPE, dtype: int64

In [14]:
# Create an independent copy of the cleaned frame.
# A plain `df = df2` would only bind a second name to the same object, so every
# later change to df would also change df2.
df = df2.copy()

In [15]:
# Confirm that no column of the working frame has missing values
df.isna().sum()

TYPE     0
YEAR     0
MONTH    0
DAY      0
AREA     0
dtype: int64

In [16]:
# Lookup that collapses the nine raw crime types into five coarser classes.
# NOTE(review): 'buglary' looks like a misspelling of 'burglary'; it is kept
# as-is because the label encoder is fitted on these exact strings.
# NOTE(review): 'theft of vehicle' is sent to 'other theft' while
# 'theft from vehicle' is sent to 'vehicle theft' -- confirm this is intended.
type_mapper = {
    # -> vehicle theft
    'theft from vehicle': 'vehicle theft',
    # -> other theft
    'theft of vehicle': 'other theft',
    'theft of bicycle': 'other theft',
    'other theft': 'other theft',
    # -> buglary
    'break and enter residential/other': 'buglary',
    'mischief': 'buglary',
    # -> robbery
    'break and enter commercial': 'robbery',
    # -> vehicle collision
    'vehicle collision or pedestrian struck (with injury)': 'vehicle collision',
    'vehicle collision or pedestrian struck (with fatality)': 'vehicle collision',
}

In [17]:
# Replace each raw crime type with its coarser class from type_mapper
df['TYPE'] = df.TYPE.map(type_mapper)

In [18]:
# List the distinct classes left after the mapping
df.TYPE.unique()

array(['other theft', 'buglary', 'robbery', 'vehicle theft',
       'vehicle collision'], dtype=object)

In [19]:
# Balance the target by keeping at most the first 10,000 rows of every class,
# concatenated in the order listed below.
# The previous version also selected 'violent mischief', a label that
# type_mapper never produces, so that slice was always empty; it is dropped here.
df = pd.concat([
    df[df['TYPE'] == label][:10000]
    for label in ('other theft', 'buglary', 'robbery', 'vehicle theft', 'vehicle collision')
])

In [20]:
# Check the class sizes after balancing
df['TYPE'].value_counts()

other theft          10000
buglary              10000
robbery              10000
vehicle theft        10000
vehicle collision    10000
Name: TYPE, dtype: int64

In [21]:
# (rows, columns) of the balanced frame
df.shape

(50000, 5)

### DATA MODELING

PREPROCESSING

In [23]:
# One-hot encode the categorical features (month and area).
# handle_unknown='ignore' makes categories that were not seen during fitting
# encode as an all-zero row instead of raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(df[['MONTH', 'AREA']])
df_cat = ohe.transform(df[['MONTH', 'AREA']]).toarray()

In [24]:
# Inspect the dense one-hot matrix
df_cat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [25]:
# Rescale the numeric features (year and day) with min-max scaling so both
# are on the same scale
sc = MinMaxScaler()
sc.fit(df[['YEAR', 'DAY']])
df_num = sc.transform(df[['YEAR', 'DAY']])

In [26]:
# Build the feature matrix: one-hot columns first, then the scaled numeric columns
df_joined = np.concatenate((df_cat, df_num), axis=1)

In [27]:
# Encode the crime-type labels as integers for the classifiers
le = LabelEncoder()
le.fit(df['TYPE'])
target = le.transform(df['TYPE'])

In [28]:
# Distinct encoded class ids present in the target
np.unique(target)

array([0, 1, 2, 3, 4])

### MODEL BUILDING

TRYING DIFFERENT MODELS TO SEE WHICH WORKS BEST

In [29]:

# Candidate classifiers to compare, keyed by a short name
model = {
    'logreg': LogisticRegression(),
    'rfc': RandomForestClassifier(),
    'svc': SVC(),
    'gbc': GradientBoostingClassifier(),
}

In [30]:
# Hold out 25% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each run.
Xtrain, Xtest, ytrain, ytest = train_test_split(df_joined, target, test_size=0.25, random_state=42)

In [31]:
# (rows, features) of the training split
Xtrain.shape

(37500, 38)

In [32]:
# (rows, features) of the test split
Xtest.shape

(12500, 38)

In [33]:
# Disabled comparison loop: fits every candidate in the `model` dict and prints
# its test accuracy.
# NOTE(review): the loop variable `model` shadows the `model` dict it iterates
# over; rename it before re-enabling this cell.
#for name,model in model.items():
#    ypred=model.fit(Xtrain,ytrain).predict(Xtest)
#    print(f'The accuracy of {name} is {accuracy_score(ypred,ytest)}')

In [34]:
import xgboost

In [35]:
# Train an XGBoost classifier with default hyper-parameters on the training split
xgb = xgboost.XGBClassifier()
xgb = xgb.fit(Xtrain, ytrain)

In [36]:
# Predict the encoded class for every row of the test split
ypred=xgb.predict(Xtest)

In [37]:
# Fraction of test rows whose predicted class equals the true class
accuracy_score(ytest, ypred)

0.4368

In [38]:
# Accuracy plus a constant offset (see the review note in the docstring)
def binarizer(predicted, actual, offset=0.39):
    """Return the accuracy of `predicted` against `actual`, plus `offset`.

    Parameters
    ----------
    predicted : array-like
        Predicted labels.
    actual : array-like
        True labels. Earlier versions ignored this argument and always read
        the global `ytest`; it is now the value that is actually compared.
    offset : float, default 0.39
        Constant added to the accuracy. The default reproduces the value that
        used to be hard-coded.

    Returns
    -------
    float
        accuracy_score(actual, predicted) + offset.

    NOTE(review): despite its name this function does not binarize anything,
    and with a non-zero offset the result is not an accuracy (it can exceed
    1.0). Use accuracy_score directly, or offset=0, when reporting model
    performance.
    """
    return accuracy_score(actual, predicted) + offset

In [39]:

# NOTE(review): this value is the test accuracy plus the 0.39 added inside
# binarizer, so it is not the model's real accuracy
binarizer(ypred,ytest)

0.8268

#### PREDICTION

In [40]:
def predict(month, area, year, day):
    """Predict the crime type for one month/area/year/day combination.

    The month and area strings are lower-cased and one-hot encoded with the
    fitted `ohe`; the year and day are scaled with the fitted `sc`; the two
    parts are stacked into one feature row and classified by `xgb`.

    Returns the result of `le.inverse_transform` on the prediction, i.e. an
    array holding the predicted crime-type label.
    """
    categorical_row = [[month.lower(), area.lower()]]
    numeric_row = [[year, day]]
    features = np.hstack([
        ohe.transform(categorical_row).toarray(),
        sc.transform(numeric_row),
    ])
    return le.inverse_transform(xgb.predict(features))

In [41]:
# Example prediction; predict() lower-cases the month and area itself
predict('July','Strathcona',2014,23)

array(['vehicle collision'], dtype=object)

SAVING THE MODEL

In [42]:
# Save the trained classifier. The context manager closes the file handle even
# if pickling fails; a bare open() left it open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [43]:
# Save the fitted preprocessing objects needed to prepare inputs and decode
# predictions. Each file is opened in a context manager so its handle is
# always closed.
with open('ohe_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(ohe, encoder_file)
with open('sc.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)
with open('label_encoder.pkl', 'wb') as label_file:
    pickle.dump(le, label_file)