This is our notebook for our preliminary data cleaning and logistic regression model.

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from dateutil import parser
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from datetime import datetime as DAMN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import operator
from sklearn.naive_bayes import GaussianNB

## Read in cleaned data from CSV 

In [33]:
df = pd.read_csv("crime.csv")


In [34]:
df[:100]

Unnamed: 0.1,Unnamed: 0,Block,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Latitude,Longitude,sin_hour,cos_hour,sin_month,cos_month
0,1842704,0000X W ILLINOIS ST,BATTERY,SIMPLE,STREET,0,0,1831,18,42,8,1175688,1903592,41.890828,-87.630235,0.500000,0.866025,0.0,1.0
1,1842705,006XX N DEARBORN ST,THEFT,FROM BUILDING,BAR OR TAVERN,0,0,1832,18,42,8,1175825,1904582,41.893542,-87.629702,0.000000,1.000000,0.0,1.0
2,1842712,052XX W RACE AVE,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,0,0,1523,15,28,25,1141356,1903051,41.890046,-87.756333,0.000000,1.000000,0.0,1.0
3,1842713,053XX W WELLINGTON AVE,BATTERY,SIMPLE,APARTMENT,1,0,2514,25,31,19,1140319,1919430,41.935011,-87.759739,0.000000,1.000000,0.0,1.0
4,1842717,014XX W LEXINGTON ST,BATTERY,SIMPLE,STREET,0,0,1231,12,25,28,1166800,1896845,41.872509,-87.663069,0.000000,1.000000,0.0,1.0
5,1842718,064XX S ROCKWELL ST,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,RESIDENTIAL YARD (FRONT/BACK),1,0,825,8,15,66,1160145,1861909,41.776780,-87.688465,0.000000,1.000000,0.0,1.0
6,1842720,016XX N MENARD AVE,WEAPONS VIOLATION,RECKLESS FIREARM DISCHARGE,RESIDENCE,0,0,2531,25,29,25,1137502,1910357,41.910165,-87.770311,0.000000,1.000000,0.0,1.0
7,1842725,042XX W WILCOX ST,OTHER OFFENSE,OTHER WEAPONS VIOLATION,RESIDENTIAL YARD (FRONT/BACK),1,0,1115,11,28,26,1148126,1898998,41.878797,-87.731575,0.000000,1.000000,0.0,1.0
8,1842727,018XX N KEDVALE AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,1,1,2534,25,30,20,1148430,1911869,41.914110,-87.730126,0.000000,1.000000,0.0,1.0
9,1842728,014XX N LOREL AVE,WEAPONS VIOLATION,RECKLESS FIREARM DISCHARGE,RESIDENTIAL YARD (FRONT/BACK),0,0,2532,25,37,25,1140427,1909002,41.906394,-87.759599,0.000000,1.000000,0.0,1.0


## Drop all  unnecessary columns

In [35]:
# Drop unnecessary columns
df.drop(["Beat", "District", "Block", "Community Area", "X Coordinate", "Y Coordinate", "Latitude", "Longitude", "Unnamed: 0"], inplace=True, axis=1)

In [36]:
df[:100]

Unnamed: 0,Primary Type,Description,Location Description,Arrest,Domestic,Ward,sin_hour,cos_hour,sin_month,cos_month
0,BATTERY,SIMPLE,STREET,0,0,42,0.500000,0.866025,0.0,1.0
1,THEFT,FROM BUILDING,BAR OR TAVERN,0,0,42,0.000000,1.000000,0.0,1.0
2,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,0,0,28,0.000000,1.000000,0.0,1.0
3,BATTERY,SIMPLE,APARTMENT,1,0,31,0.000000,1.000000,0.0,1.0
4,BATTERY,SIMPLE,STREET,0,0,25,0.000000,1.000000,0.0,1.0
5,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,RESIDENTIAL YARD (FRONT/BACK),1,0,15,0.000000,1.000000,0.0,1.0
6,WEAPONS VIOLATION,RECKLESS FIREARM DISCHARGE,RESIDENCE,0,0,29,0.000000,1.000000,0.0,1.0
7,OTHER OFFENSE,OTHER WEAPONS VIOLATION,RESIDENTIAL YARD (FRONT/BACK),1,0,28,0.000000,1.000000,0.0,1.0
8,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,1,1,30,0.000000,1.000000,0.0,1.0
9,WEAPONS VIOLATION,RECKLESS FIREARM DISCHARGE,RESIDENTIAL YARD (FRONT/BACK),0,0,37,0.000000,1.000000,0.0,1.0


## One hot encode all dummy variables

In [37]:
# One Hot Encoded (Categorical Data): Primary Type, Description, Location Description, District
df = pd.get_dummies(df, columns =['Primary Type', 'Description', 'Location Description', 'Ward'])

In [38]:
df[:100]

Unnamed: 0,Arrest,Domestic,sin_hour,cos_hour,sin_month,cos_month,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,...,Ward_41,Ward_42,Ward_43,Ward_44,Ward_45,Ward_46,Ward_47,Ward_48,Ward_49,Ward_50
0,0,0,0.500000,0.866025,0.0,1.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0.000000,1.000000,0.0,1.0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0.000000,1.000000,0.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0.000000,1.000000,0.0,1.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0.000000,1.000000,0.0,1.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0.000000,1.000000,0.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0.000000,1.000000,0.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0.000000,1.000000,0.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,0.000000,1.000000,0.0,1.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0.000000,1.000000,0.0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Separate into training and test data 

In [39]:
y = df["Arrest"].copy(True)
X = df.iloc[:,1:].copy(True)
print(y)
print(X)


0         0
1         0
2         0
3         1
4         0
5         1
6         0
7         1
8         1
9         0
10        1
11        0
12        1
13        0
14        1
15        0
16        1
17        0
18        0
19        0
20        1
21        1
22        1
23        1
24        1
25        1
26        1
27        1
28        1
29        1
         ..
904760    0
904761    0
904762    0
904763    0
904764    1
904765    0
904766    0
904767    0
904768    0
904769    0
904770    0
904771    0
904772    0
904773    0
904774    1
904775    0
904776    0
904777    0
904778    0
904779    0
904780    0
904781    1
904782    0
904783    0
904784    0
904785    0
904786    0
904787    0
904788    0
904789    1
Name: Arrest, Length: 904790, dtype: int64
        Domestic      sin_hour      cos_hour  sin_month  cos_month  \
0              0  5.000000e-01  8.660254e-01   0.000000        1.0   
1              0  0.000000e+00  1.000000e+00   0.000000        1.0   
2              

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = DAMN.now().second)


## Perform Logistic Regression with an L1 Regularizer

In [41]:
classifier_lr_l1 = LogisticRegression(random_state = DAMN.now().second, penalty="l1")
classifier_lr_l1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=31, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
y_pred_lr_l1 = classifier_lr_l1.predict(X_test)
confusion_matrix_result = confusion_matrix(y_test, y_pred_lr_l1)
print(confusion_matrix_result)

[[145854   3278]
 [ 18684  13142]]


In [43]:
classifier_lr_l1.score(X_test, y_test)

0.87863482133975834

In [44]:
print(classification_report(y_test, y_pred_lr_l1))

             precision    recall  f1-score   support

          0       0.89      0.98      0.93    149132
          1       0.80      0.41      0.54     31826

avg / total       0.87      0.88      0.86    180958



## Perform Logistic Regression with an L2 Regularizer

In [45]:
classifier_lr_l2 = LogisticRegression(random_state = DAMN.now().second, penalty="l1")
classifier_lr_l2.fit(X_train, y_train)
y_pred_lr_l2 = classifier_lr_l2.predict(X_test)
confusion_matrix_result = confusion_matrix(y_test, y_pred_lr_l2)
print(confusion_matrix_result)

[[145853   3279]
 [ 18686  13140]]


In [62]:
print(classification_report(y_test, y_pred_lr_l2))

             precision    recall  f1-score   support

          0       0.89      0.98      0.93    149132
          1       0.80      0.41      0.54     31826

avg / total       0.87      0.88      0.86    180958



In [46]:
classifier_lr_l1.coef_

array([[  1.31656839e-01,   8.10378405e-03,   1.09789779e-01,
          6.31728848e-02,   6.37574725e-02,  -8.70448397e-01,
          6.81859038e-02,  -3.54948773e-03,  -1.50402100e+00,
          2.84261923e+00,  -4.15009824e-01,  -1.35678795e+00,
          1.33252679e+00,  -2.04158959e+00,   0.00000000e+00,
         -7.81853964e-01,   1.81862697e+00,  -7.02777310e-01,
         -1.62355021e+00,  -1.66897778e+00,  -4.38783389e-01,
         -4.32631246e-01,   0.00000000e+00,   2.15500320e-01,
         -6.66407161e-01,   8.67001745e-01,   3.76194731e-01,
          4.24593028e-01,  -1.67989485e+00,  -8.12448043e-01,
         -5.12041846e-01,  -2.42886232e+00,   1.75792318e+00,
          2.22339788e-02,  -9.01492130e-01,  -6.06119344e-01,
         -9.61244139e-02,   2.27206576e+00,   2.76734833e+00,
          8.93949537e-01,   0.00000000e+00,   0.00000000e+00,
          2.13048754e-01,   1.28918101e-01,   1.27853678e-02,
          0.00000000e+00,  -5.14673430e-02,  -9.42152475e-01,
        

In [47]:
classifier_lr_l1.coef_.shape

(1, 492)

## Analyze the Coefficients Produced by Using the L1 Regularizer

In [65]:
desc = {}
ward ={}
loc ={}
primary = {}
time={}
other={}
d={}
hour={}
month={}
non_zero=0
for i in range(len(X.columns)):
    field = X.columns[i]
    d[field] = classifier_lr_l1.coef_[0][i]
    print(field)
    if(classifier_lr_l1.coef_[0][i] > .0001):
        non_zero+=1
    if (field.startswith("Primary Type")):
        primary[field] = classifier_lr_l1.coef_[0][i]
    elif (field.startswith("Location Description")):
        loc[field] = classifier_lr_l1.coef_[0][i]
    elif (field.startswith("Ward")):
        ward[field] = classifier_lr_l1.coef_[0][i]
    elif (field.startswith("Description_")):
        desc[field] = classifier_lr_l1.coef_[0][i]
    elif (field == "cos_hour" or field == "sin_hour"):
        hour[field] = classifier_lr_l1.coef_[0][i]
    elif (field == "cos_month" or field == "sin_month"):
        month[field] = classifier_lr_l1.coef_[0][i]
    else :
        other[field] = classifier_lr_l1.coef_[0][i]

Domestic
sin_hour
cos_hour
sin_month
cos_month
Primary Type_ARSON
Primary Type_ASSAULT
Primary Type_BATTERY
Primary Type_BURGLARY
Primary Type_CONCEALED CARRY LICENSE VIOLATION
Primary Type_CRIM SEXUAL ASSAULT
Primary Type_CRIMINAL DAMAGE
Primary Type_CRIMINAL TRESPASS
Primary Type_DECEPTIVE PRACTICE
Primary Type_HOMICIDE
Primary Type_HUMAN TRAFFICKING
Primary Type_INTERFERENCE WITH PUBLIC OFFICER
Primary Type_INTIMIDATION
Primary Type_KIDNAPPING
Primary Type_MOTOR VEHICLE THEFT
Primary Type_NON - CRIMINAL
Primary Type_NON-CRIMINAL
Primary Type_NON-CRIMINAL (SUBJECT SPECIFIED)
Primary Type_OBSCENITY
Primary Type_OFFENSE INVOLVING CHILDREN
Primary Type_OTHER NARCOTIC VIOLATION
Primary Type_OTHER OFFENSE
Primary Type_PUBLIC PEACE VIOLATION
Primary Type_ROBBERY
Primary Type_SEX OFFENSE
Primary Type_STALKING
Primary Type_THEFT
Primary Type_WEAPONS VIOLATION
Description_$500 AND UNDER
Description_ABUSE/NEGLECT: CARE FACILITY
Description_AGG CRIM SEX ABUSE FAM MEMBER
Description_AGG CRIMINAL

In [66]:
non_zero

177

In [49]:
sorted_d = sorted(d.items(), key=operator.itemgetter(1))
sorted_d

[('Description_OBSCENE TELEPHONE CALLS', -4.1155791379054705),
 ('Description_HARASSMENT BY TELEPHONE', -3.7151279794762075),
 ('Description_TELEPHONE THREAT', -3.652650625473576),
 ('Description_HARASSMENT BY ELECTRONIC MEANS', -3.448428128815038),
 ('Description_OTHER OFFENSE', -3.1016217604730656),
 ('Description_BOMB THREAT', -2.8080509764211068),
 ('Description_ILLEGAL USE CASH CARD', -2.5351184030759724),
 ('Primary Type_THEFT', -2.4288623214715317),
 ('Description_UNLAWFUL USE OTHER FIREARM', -2.3563281770453064),
 ('Location Description_AIRCRAFT', -2.2974973391164712),
 ('Description_EAVESDROPPING', -2.2849261449306786),
 ('Description_UNLAWFUL INTERFERE/VISITATION', -2.2238227308997645),
 ('Description_RECKLESS FIREARM DISCHARGE', -2.1865492362358876),
 ('Description_LOST PASSPORT', -2.1514897675659328),
 ('Description_ARSON THREAT', -2.1411287807043391),
 ('Description_ATTEMPT - FINANCIAL IDENTITY THEFT', -2.1382918155663817),
 ('Primary Type_DECEPTIVE PRACTICE', -2.041589591

In [50]:
def get_average(weights):
    sorted_weights = sorted(weights.items(), key=operator.itemgetter(1))
    sum = 0.0
    for w in weights:
        sum += abs(weights[w])
    avg_w = sum/float(len(weights))
    return(avg_w, sorted_weights)
        

In [51]:
get_average(ward)

(0.086685785482705502,
 [('Ward_32', -0.36155757960427826),
  ('Ward_47', -0.30273341127696735),
  ('Ward_43', -0.29756085198493537),
  ('Ward_4', -0.24764106375553277),
  ('Ward_39', -0.22383097225604984),
  ('Ward_33', -0.18731337539718121),
  ('Ward_50', -0.1681010766740155),
  ('Ward_27', -0.16285284328353591),
  ('Ward_41', -0.1432503385208416),
  ('Ward_35', -0.12746243312604752),
  ('Ward_23', -0.1267605468032256),
  ('Ward_24', -0.11489913804539661),
  ('Ward_36', -0.1022750604617414),
  ('Ward_25', -0.098649533823982782),
  ('Ward_1', -0.090879188798122471),
  ('Ward_45', -0.083823823817360241),
  ('Ward_19', -0.078189528269199518),
  ('Ward_29', -0.072683031756833466),
  ('Ward_18', -0.072673875701567692),
  ('Ward_40', -0.057114540242065436),
  ('Ward_49', -0.03469883521872557),
  ('Ward_38', -0.026900880659596376),
  ('Ward_13', -0.025931780870427714),
  ('Ward_28', -0.025324826005908643),
  ('Ward_26', -0.025185118009123323),
  ('Ward_5', -0.024779163626249132),
  ('Ward_1

In [52]:
get_average(desc)

(0.87791498222841369,
 [('Description_OBSCENE TELEPHONE CALLS', -4.1155791379054705),
  ('Description_HARASSMENT BY TELEPHONE', -3.7151279794762075),
  ('Description_TELEPHONE THREAT', -3.652650625473576),
  ('Description_HARASSMENT BY ELECTRONIC MEANS', -3.448428128815038),
  ('Description_OTHER OFFENSE', -3.1016217604730656),
  ('Description_BOMB THREAT', -2.8080509764211068),
  ('Description_ILLEGAL USE CASH CARD', -2.5351184030759724),
  ('Description_UNLAWFUL USE OTHER FIREARM', -2.3563281770453064),
  ('Description_EAVESDROPPING', -2.2849261449306786),
  ('Description_UNLAWFUL INTERFERE/VISITATION', -2.2238227308997645),
  ('Description_RECKLESS FIREARM DISCHARGE', -2.1865492362358876),
  ('Description_LOST PASSPORT', -2.1514897675659328),
  ('Description_ARSON THREAT', -2.1411287807043391),
  ('Description_ATTEMPT - FINANCIAL IDENTITY THEFT', -2.1382918155663817),
  ('Description_FINANCIAL IDENTITY THEFT $300 AND UNDER', -2.007173842353172),
  ('Description_OTHER CRIME AGAINST P

In [53]:
 get_average(primary)

(0.9872430812898374,
 [('Primary Type_THEFT', -2.4288623214715317),
  ('Primary Type_DECEPTIVE PRACTICE', -2.0415895911394371),
  ('Primary Type_ROBBERY', -1.6798948483114549),
  ('Primary Type_MOTOR VEHICLE THEFT', -1.6689777772042944),
  ('Primary Type_KIDNAPPING', -1.6235502141622045),
  ('Primary Type_BURGLARY', -1.5040210042195257),
  ('Primary Type_CRIMINAL DAMAGE', -1.3567879495869444),
  ('Primary Type_ARSON', -0.87044839735125223),
  ('Primary Type_SEX OFFENSE', -0.8124480432460941),
  ('Primary Type_HUMAN TRAFFICKING', -0.78185396449326949),
  ('Primary Type_INTIMIDATION', -0.70277730950563777),
  ('Primary Type_OFFENSE INVOLVING CHILDREN', -0.66640716130655109),
  ('Primary Type_STALKING', -0.51204184585410795),
  ('Primary Type_NON - CRIMINAL', -0.43878338867758043),
  ('Primary Type_NON-CRIMINAL', -0.43263124613697879),
  ('Primary Type_CRIM SEXUAL ASSAULT', -0.41500982409628057),
  ('Primary Type_BATTERY', -0.003549487734903128),
  ('Primary Type_HOMICIDE', 0.0),
  ('Prim

In [54]:
get_average(loc)

(0.3400649258498849,
 [('Location Description_AIRCRAFT', -2.2974973391164712),
  ('Location Description_AIRPORT TERMINAL UPPER LEVEL - SECURE AREA',
   -1.5145044657146653),
  ('Location Description_NURSING HOME/RETIREMENT HOME', -1.5048182791998799),
  ('Location Description_RESIDENCE-GARAGE', -1.0510209434746505),
  ('Location Description_ANIMAL HOSPITAL', -0.99188072690938478),
  ('Location Description_DAY CARE CENTER', -0.861176273538935),
  ('Location Description_POOL ROOM', -0.74733315723151772),
  ('Location Description_AUTO', -0.74182875320186903),
  ('Location Description_FIRE STATION', -0.68073488899204504),
  ('Location Description_VACANT LOT/LAND', -0.68041136214133313),
  ('Location Description_DRIVEWAY - RESIDENTIAL', -0.63631916646668418),
  ('Location Description_VEHICLE - OTHER RIDE SERVICE', -0.61811157613596202),
  ('Location Description_MEDICAL/DENTAL OFFICE', -0.60476685897981852),
  ('Location Description_BARBERSHOP', -0.56344634825865558),
  ('Location Descriptio

In [55]:
get_average(hour)

(0.058946781282007948,
 [('sin_hour', 0.0081037840504079132), ('cos_hour', 0.10978977851360798)])

In [56]:
get_average(month)

(0.063465178646043202,
 [('sin_month', 0.063172884767115273), ('cos_month', 0.063757472524971145)])

## Perform Naive Bayes

In [57]:
classifier_nb = GaussianNB()
classifier_nb.fit(X_train, y_train)


GaussianNB(priors=None)

In [58]:
y_pred_nb = classifier_nb.predict(X_test)
confusion_matrix_result2 = confusion_matrix(y_test, y_pred_nb)
print(confusion_matrix_result2)

[[137349  11783]
 [ 15363  16463]]


In [59]:
print(classification_report(y_test, y_pred_nb))

             precision    recall  f1-score   support

          0       0.90      0.92      0.91    149132
          1       0.58      0.52      0.55     31826

avg / total       0.84      0.85      0.85    180958



## Predict Using Decision Tree

In [60]:
from sklearn import tree

In [61]:
classifier_dt = tree.DecisionTreeClassifier()
classifier_dt.fit(X_train, y_train)

y_pred_dt = classifier_dt.predict(X_test)
confusion_matrix_result3 = confusion_matrix(y_test, y_pred_dt)
print(confusion_matrix_result3)
print(classification_report(y_test, y_pred_dt))

[[135409  13723]
 [ 16581  15245]]
             precision    recall  f1-score   support

          0       0.89      0.91      0.90    149132
          1       0.53      0.48      0.50     31826

avg / total       0.83      0.83      0.83    180958



In [67]:
non_zero=0
for i in range(len(X.columns)):
    if(classifier_lr_l2.coef_[0][i] > .0001):
        non_zero+=1
non_zero

183