This is our notebook for our preliminary data cleaning and logistic regression model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from dateutil import parser
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from datetime import datetime as DAMN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import operator
from sklearn.naive_bayes import GaussianNB

## Read in cleaned data from CSV 

In [2]:
df = pd.read_csv("crime.csv")


In [3]:
df[:100]

Unnamed: 0.1,Unnamed: 0,Block,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Latitude,Longitude,sin_hour,cos_hour,sin_month,cos_month
0,23,067XX S PULASKI RD,ASSAULT,SIMPLE,PARKING LOT/GARAGE(NON.RESID.),0,0,833,8,13,65,1150857,1859788,41.771146,-87.722570,7.847643e-15,1.0,1.0,6.123234e-17
1,66,057XX S RACINE AVE,ASSAULT,SIMPLE,APARTMENT,0,1,713,7,16,67,1169298,1866815,41.790050,-87.654769,7.847643e-15,1.0,1.0,6.123234e-17
2,304,061XX S THROOP ST,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,0,1,713,7,16,67,1168680,1863944,41.782185,-87.657118,7.847643e-15,1.0,1.0,6.123234e-17
3,674,015XX N KOSTNER AVE,OTHER OFFENSE,TELEPHONE THREAT,APARTMENT,0,0,2534,25,37,23,1146760,1909896,41.908728,-87.736312,7.847643e-15,1.0,1.0,6.123234e-17
4,696,027XX W 59TH ST,THEFT,$500 AND UNDER,TAXICAB,0,0,824,8,16,66,1158917,1865413,41.786421,-87.692872,7.847643e-15,1.0,1.0,6.123234e-17
5,826,037XX W 59TH ST,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,0,0,822,8,14,62,1152408,1865224,41.786033,-87.716742,7.847643e-15,1.0,1.0,6.123234e-17
6,1172,0000X W TERMINAL ST,CRIMINAL TRESPASS,TO STATE SUP LAND,AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,1,0,1652,16,41,76,1100726,1934289,41.976421,-87.904976,7.847643e-15,1.0,1.0,6.123234e-17
7,1300,012XX S SAWYER AVE,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,0,1,1022,10,24,29,1154889,1894311,41.865802,-87.706868,7.847643e-15,1.0,1.0,6.123234e-17
8,1344,102XX S COTTAGE GROVE AVE,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,1,0,511,5,8,50,1182767,1837177,41.708418,-87.606301,7.847643e-15,1.0,1.0,6.123234e-17
9,1386,0000X W TERMINAL ST,CRIMINAL TRESPASS,TO STATE SUP LAND,AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,1,0,1655,16,41,76,1104700,1933741,41.974862,-87.890372,7.847643e-15,1.0,1.0,6.123234e-17


## Drop all  unnecessary columns

In [4]:
# Drop unnecessary columns
df.drop(["Beat", "District", "Block", "Community Area", "X Coordinate", "Y Coordinate", "Latitude", "Longitude", "Unnamed: 0"], inplace=True, axis=1)

In [5]:
df[:100]

Unnamed: 0,Primary Type,Description,Location Description,Arrest,Domestic,Ward,sin_hour,cos_hour,sin_month,cos_month
0,ASSAULT,SIMPLE,PARKING LOT/GARAGE(NON.RESID.),0,0,13,7.847643e-15,1.0,1.0,6.123234e-17
1,ASSAULT,SIMPLE,APARTMENT,0,1,16,7.847643e-15,1.0,1.0,6.123234e-17
2,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,0,1,16,7.847643e-15,1.0,1.0,6.123234e-17
3,OTHER OFFENSE,TELEPHONE THREAT,APARTMENT,0,0,37,7.847643e-15,1.0,1.0,6.123234e-17
4,THEFT,$500 AND UNDER,TAXICAB,0,0,16,7.847643e-15,1.0,1.0,6.123234e-17
5,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,0,0,14,7.847643e-15,1.0,1.0,6.123234e-17
6,CRIMINAL TRESPASS,TO STATE SUP LAND,AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,1,0,41,7.847643e-15,1.0,1.0,6.123234e-17
7,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,0,1,24,7.847643e-15,1.0,1.0,6.123234e-17
8,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,1,0,8,7.847643e-15,1.0,1.0,6.123234e-17
9,CRIMINAL TRESPASS,TO STATE SUP LAND,AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,1,0,41,7.847643e-15,1.0,1.0,6.123234e-17


## One hot encode all dummy variables

In [6]:
# One Hot Encoded (Categorical Data): Primary Type, Description, Location Description, District
df = pd.get_dummies(df, columns =['Primary Type', 'Description', 'Location Description', 'Ward'])

In [7]:
df[:100]

Unnamed: 0,Arrest,Domestic,sin_hour,cos_hour,sin_month,cos_month,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,...,Ward_41,Ward_42,Ward_43,Ward_44,Ward_45,Ward_46,Ward_47,Ward_48,Ward_49,Ward_50
0,0,0,7.847643e-15,1.0,1.0,6.123234e-17,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,7.847643e-15,1.0,1.0,6.123234e-17,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,7.847643e-15,1.0,1.0,6.123234e-17,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,7.847643e-15,1.0,1.0,6.123234e-17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,7.847643e-15,1.0,1.0,6.123234e-17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,7.847643e-15,1.0,1.0,6.123234e-17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,7.847643e-15,1.0,1.0,6.123234e-17,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,1,7.847643e-15,1.0,1.0,6.123234e-17,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,7.847643e-15,1.0,1.0,6.123234e-17,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,7.847643e-15,1.0,1.0,6.123234e-17,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


## Separate into training and test data 

In [8]:
y = df["Arrest"].copy(True)
X = df.iloc[:,1:].copy(True)
print(y)
print(X)


0         0
1         0
2         0
3         0
4         0
5         0
6         1
7         0
8         1
9         1
10        0
11        0
12        0
13        0
14        1
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
902268    1
902269    0
902270    1
902271    0
902272    0
902273    1
902274    1
902275    0
902276    0
902277    1
902278    0
902279    0
902280    0
902281    0
902282    0
902283    0
902284    0
902285    0
902286    0
902287    0
902288    1
902289    0
902290    0
902291    0
902292    0
902293    0
902294    0
902295    1
902296    0
902297    0
Name: Arrest, Length: 902298, dtype: int64
        Domestic      sin_hour  cos_hour  sin_month     cos_month  \
0              0  7.847643e-15       1.0        1.0  6.123234e-17   
1              1  7.847643e-15       1.0        1.0  6.123234e-17   
2              1  

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = DAMN.now().second)


## Perform Logistic Regression with an L1 Regularizer

In [10]:
classifier_lr_l1 = LogisticRegression(random_state = DAMN.now().second, penalty="l1")
classifier_lr_l1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=47, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
y_pred_lr_l1 = classifier_lr_l1.predict(X_test)
confusion_matrix_result = confusion_matrix(y_test, y_pred_lr_l1)
print(confusion_matrix_result)

[[145446   3247]
 [ 18649  13118]]


In [13]:
classifier_lr_l1.score(X_test, y_test)

0.87866563227307992

In [14]:
print(classification_report(y_test, y_pred_lr_l1))

             precision    recall  f1-score   support

          0       0.89      0.98      0.93    148693
          1       0.80      0.41      0.55     31767

avg / total       0.87      0.88      0.86    180460



## Perform Logistic Regression with an L2 Regularizer

In [15]:
classifier_lr_l2 = LogisticRegression(random_state = DAMN.now().second, penalty="l1")
classifier_lr_l2.fit(X_train, y_train)
y_pred_lr_l2 = classifier_lr_l2.predict(X_test)
confusion_matrix_result = confusion_matrix(y_test, y_pred_lr_l2)
print(confusion_matrix_result)

[[145448   3245]
 [ 18651  13116]]


In [16]:
classifier_lr_l1.coef_

array([[  1.29203859e-01,  -5.59822977e-01,  -1.61175322e-01,
          8.30849577e-02,   5.15616099e-02,  -4.94485392e-01,
          1.63646971e-01,   1.03432959e-01,  -1.33402309e+00,
          2.81491558e+00,  -5.15433240e-01,  -7.87079517e-01,
          1.90528164e+00,  -1.84881412e+00,   0.00000000e+00,
         -5.39328153e-01,   2.12097982e+00,  -6.34785353e-01,
         -1.09890109e+00,  -1.58939123e+00,  -5.96512310e-02,
         -4.28094365e-01,   0.00000000e+00,   6.21050339e-01,
         -5.03979219e-01,   4.96308141e-01,   7.05758339e-01,
          4.16842514e-01,  -1.36046372e+00,  -5.89476212e-01,
         -2.50053213e-01,  -1.93842539e+00,   1.96079938e+00,
         -3.11850598e-01,  -9.49218475e-01,  -3.87558048e-01,
          0.00000000e+00,   2.10575309e+00,   2.85444301e+00,
          9.24840025e-01,   6.58897212e-02,   0.00000000e+00,
          5.86298188e-01,   4.09737507e-01,   1.36542433e-03,
          0.00000000e+00,  -2.10166511e-02,  -9.32738467e-01,
        

In [17]:
classifier_lr_l1.coef_.shape

(1, 492)

## Analyze the Coefficients Produced by Using the L1 Regularizer

In [18]:
desc = {}
ward ={}
loc ={}
primary = {}
time={}
other={}
d={}
hour={}
month={}
for i in range(len(X.columns)):
    field = X.columns[i]
    d[field] = classifier_lr_l1.coef_[0][i]
    print(field)
    if (field.startswith("Primary Type")):
        primary[field] = classifier_lr_l1.coef_[0][i]
    elif (field.startswith("Location Description")):
        loc[field] = classifier_lr_l1.coef_[0][i]
    elif (field.startswith("Ward")):
        ward[field] = classifier_lr_l1.coef_[0][i]
    elif (field.startswith("Description_")):
        desc[field] = classifier_lr_l1.coef_[0][i]
    elif (field == "cos_hour" or field == "sin_hour"):
        hour[field] = classifier_lr_l1.coef_[0][i]
    elif (field == "cos_month" or field == "sin_month"):
        month[field] = classifier_lr_l1.coef_[0][i]
    else :
        other[field] = classifier_lr_l1.coef_[0][i]

Domestic
sin_hour
cos_hour
sin_month
cos_month
Primary Type_ARSON
Primary Type_ASSAULT
Primary Type_BATTERY
Primary Type_BURGLARY
Primary Type_CONCEALED CARRY LICENSE VIOLATION
Primary Type_CRIM SEXUAL ASSAULT
Primary Type_CRIMINAL DAMAGE
Primary Type_CRIMINAL TRESPASS
Primary Type_DECEPTIVE PRACTICE
Primary Type_HOMICIDE
Primary Type_HUMAN TRAFFICKING
Primary Type_INTERFERENCE WITH PUBLIC OFFICER
Primary Type_INTIMIDATION
Primary Type_KIDNAPPING
Primary Type_MOTOR VEHICLE THEFT
Primary Type_NON - CRIMINAL
Primary Type_NON-CRIMINAL
Primary Type_NON-CRIMINAL (SUBJECT SPECIFIED)
Primary Type_OBSCENITY
Primary Type_OFFENSE INVOLVING CHILDREN
Primary Type_OTHER NARCOTIC VIOLATION
Primary Type_OTHER OFFENSE
Primary Type_PUBLIC PEACE VIOLATION
Primary Type_ROBBERY
Primary Type_SEX OFFENSE
Primary Type_STALKING
Primary Type_THEFT
Primary Type_WEAPONS VIOLATION
Description_$500 AND UNDER
Description_ABUSE/NEGLECT: CARE FACILITY
Description_AGG CRIM SEX ABUSE FAM MEMBER
Description_AGG CRIMINAL

In [19]:
sorted_d = sorted(d.items(), key=operator.itemgetter(1))
sorted_d

[('Description_OBSCENE TELEPHONE CALLS', -4.3028539654943065),
 ('Description_HARASSMENT BY TELEPHONE', -3.8806803634883869),
 ('Description_TELEPHONE THREAT', -3.7779761194849044),
 ('Description_OTHER OFFENSE', -3.6662465169823562),
 ('Description_HARASSMENT BY ELECTRONIC MEANS', -3.496918297695772),
 ('Description_UNLAWFUL INTERFERE/VISITATION', -3.3177915103739561),
 ('Description_ILLEGAL USE CASH CARD', -2.6943936882238901),
 ('Description_BOMB THREAT', -2.625984783934197),
 ('Location Description_AIRCRAFT', -2.3281835773387649),
 ('Description_ATTEMPT - FINANCIAL IDENTITY THEFT', -2.2559993448895379),
 ('Description_OTHER CRIME AGAINST PERSON', -2.2257825987842499),
 ('Description_UNLAWFUL USE OTHER FIREARM', -2.1893788657537088),
 ('Description_EAVESDROPPING', -2.187403056768765),
 ('Description_RECKLESS FIREARM DISCHARGE', -2.1416587087709167),
 ('Description_POCKET-PICKING', -2.0139555137371121),
 ('Description_FINANCIAL IDENTITY THEFT $300 AND UNDER', -1.9612059217353959),
 (

In [20]:
def get_average(weights):
    sorted_weights = sorted(weights.items(), key=operator.itemgetter(1))
    sum = 0.0
    for w in weights:
        sum += abs(weights[w])
    avg_w = sum/float(len(weights))
    return(avg_w, sorted_weights)
        

In [21]:
get_average(ward)

(0.085643525271931434,
 [('Ward_32', -0.33521653363436205),
  ('Ward_43', -0.25747605569525633),
  ('Ward_47', -0.24036566912231339),
  ('Ward_4', -0.22642364679556382),
  ('Ward_39', -0.2185061701957445),
  ('Ward_33', -0.19227810854830968),
  ('Ward_50', -0.17252322024740782),
  ('Ward_41', -0.15313921335622088),
  ('Ward_23', -0.14335327348384325),
  ('Ward_25', -0.13105937897450146),
  ('Ward_27', -0.12328373210005375),
  ('Ward_1', -0.10965545527347483),
  ('Ward_45', -0.10087280124019588),
  ('Ward_19', -0.099849556119609537),
  ('Ward_40', -0.095885767206704289),
  ('Ward_24', -0.095733925737489262),
  ('Ward_36', -0.087699178320209584),
  ('Ward_29', -0.084962387720053226),
  ('Ward_35', -0.060143002001164771),
  ('Ward_18', -0.050275193268869978),
  ('Ward_26', -0.040022317589150126),
  ('Ward_30', -0.02949279249469557),
  ('Ward_28', -0.021521694445763791),
  ('Ward_5', -0.018232382762834721),
  ('Ward_13', -0.012456320933767969),
  ('Ward_8', -0.0034404670259762043),
  ('War

In [22]:
get_average(desc)

(0.8759213485834636,
 [('Description_OBSCENE TELEPHONE CALLS', -4.3028539654943065),
  ('Description_HARASSMENT BY TELEPHONE', -3.8806803634883869),
  ('Description_TELEPHONE THREAT', -3.7779761194849044),
  ('Description_OTHER OFFENSE', -3.6662465169823562),
  ('Description_HARASSMENT BY ELECTRONIC MEANS', -3.496918297695772),
  ('Description_UNLAWFUL INTERFERE/VISITATION', -3.3177915103739561),
  ('Description_ILLEGAL USE CASH CARD', -2.6943936882238901),
  ('Description_BOMB THREAT', -2.625984783934197),
  ('Description_ATTEMPT - FINANCIAL IDENTITY THEFT', -2.2559993448895379),
  ('Description_OTHER CRIME AGAINST PERSON', -2.2257825987842499),
  ('Description_UNLAWFUL USE OTHER FIREARM', -2.1893788657537088),
  ('Description_EAVESDROPPING', -2.187403056768765),
  ('Description_RECKLESS FIREARM DISCHARGE', -2.1416587087709167),
  ('Description_POCKET-PICKING', -2.0139555137371121),
  ('Description_FINANCIAL IDENTITY THEFT $300 AND UNDER', -1.9612059217353959),
  ('Description_LOST PA

In [23]:
 get_average(primary)

(0.90290715057303039,
 [('Primary Type_THEFT', -1.9384253885462348),
  ('Primary Type_DECEPTIVE PRACTICE', -1.8488141224053012),
  ('Primary Type_MOTOR VEHICLE THEFT', -1.5893912310023679),
  ('Primary Type_ROBBERY', -1.3604637194938336),
  ('Primary Type_BURGLARY', -1.3340230928205425),
  ('Primary Type_KIDNAPPING', -1.0989010865990403),
  ('Primary Type_CRIMINAL DAMAGE', -0.78707951723686465),
  ('Primary Type_INTIMIDATION', -0.63478535277183745),
  ('Primary Type_SEX OFFENSE', -0.5894762124346874),
  ('Primary Type_HUMAN TRAFFICKING', -0.53932815253497302),
  ('Primary Type_CRIM SEXUAL ASSAULT', -0.51543324035749005),
  ('Primary Type_OFFENSE INVOLVING CHILDREN', -0.50397921854463501),
  ('Primary Type_ARSON', -0.49448539222930826),
  ('Primary Type_NON-CRIMINAL', -0.42809436536440243),
  ('Primary Type_STALKING', -0.25005321337739156),
  ('Primary Type_NON - CRIMINAL', -0.059651230997531612),
  ('Primary Type_HOMICIDE', 0.0),
  ('Primary Type_NON-CRIMINAL (SUBJECT SPECIFIED)', 0.0)

In [24]:
get_average(loc)

(0.33018295727785535,
 [('Location Description_AIRCRAFT', -2.3281835773387649),
  ('Location Description_AIRPORT TERMINAL UPPER LEVEL - SECURE AREA',
   -1.5801777331566187),
  ('Location Description_NURSING HOME/RETIREMENT HOME', -1.3580964203982178),
  ('Location Description_RESIDENCE-GARAGE', -1.0319846998248485),
  ('Location Description_POOL ROOM', -0.86898069733949801),
  ('Location Description_DAY CARE CENTER', -0.86528913784588035),
  ('Location Description_MEDICAL/DENTAL OFFICE', -0.73111630324710697),
  ('Location Description_AUTO', -0.68671968768994351),
  ('Location Description_VEHICLE - OTHER RIDE SERVICE', -0.66589471862197047),
  ('Location Description_VACANT LOT/LAND', -0.66216799525095371),
  ('Location Description_ANIMAL HOSPITAL', -0.58066676993656507),
  ('Location Description_DRIVEWAY - RESIDENTIAL', -0.55728300564413213),
  ('Location Description_ATM (AUTOMATIC TELLER MACHINE)',
   -0.53690272357561886),
  ('Location Description_BARBERSHOP', -0.53431620031906235),

In [25]:
get_average(hour)

(0.36049914937437078,
 [('sin_hour', -0.5598229769720422), ('cos_hour', -0.1611753217766993)])

In [26]:
get_average(month)

(0.067323283801407077,
 [('cos_month', 0.051561609857360163), ('sin_month', 0.083084957745453991)])

## Perform Naive Bayes

In [27]:
classifier_nb = GaussianNB()
classifier_nb.fit(X_train, y_train)


GaussianNB(priors=None)

In [28]:
y_pred_nb = classifier_nb.predict(X_test)
confusion_matrix_result2 = confusion_matrix(y_test, y_pred_nb)
print(confusion_matrix_result2)

[[133600  15093]
 [ 13987  17780]]


In [29]:
print(classification_report(y_test, y_pred_nb))

             precision    recall  f1-score   support

          0       0.91      0.90      0.90    148693
          1       0.54      0.56      0.55     31767

avg / total       0.84      0.84      0.84    180460



In [None]:
from sklearn import tree

In [None]:
classifier_dt = tree.DecisionTreeClassifier()
classifier_dt.fit(X_train, y_train)

y_pred_dt = classifier_dt.predict(X_test)
confusion_matrix_result3 = confusion_matrix(y_test, y_pred_dt)
print(confusion_matrix_result3)
print(classification_report(y_test, y_pred_dt))