In [38]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing

In [39]:
# Load the raw crime table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='crime_data.csv')

In [40]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,State,City,Population,Violent crime,Murder and manslaughter,Rape,Rape (legacy definition),Robbery,Aggravated assault,Property crime,Burglary,Larceny,Motor vehicle theft,Arson
0,ALABAMA,Abbeville,2645,11,1,1,,2,7,63,21,39,3,
1,,Adamsville,4481,19,1,0,,7,11,321,58,252,11,
2,,Addison,744,1,0,1,,0,0,25,6,17,2,
3,,Alabaster,31170,44,0,2,,11,31,640,70,544,26,
4,,Alexander City,14692,119,2,16,,12,89,661,121,510,30,


In [41]:
# List the column labels of the loaded frame.
df.columns

Index(['State', 'City', 'Population', 'Violent crime',
       'Murder and manslaughter', 'Rape', 'Rape (legacy definition)',
       'Robbery', 'Aggravated assault', 'Property crime', 'Burglary',
       'Larceny', 'Motor vehicle theft', 'Arson'],
      dtype='object')

In [42]:
# Remove the two identifier columns (State, City) together with the
# 'Rape (legacy definition)' and 'Arson' columns, leaving only the
# population and crime-count columns used below.
df = df.drop(
    ['State', 'City', 'Rape (legacy definition)', 'Arson'],
    axis='columns',
)

In [43]:
# Inspect dtypes and non-null counts before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9292 entries, 0 to 9291
Data columns (total 10 columns):
Population                 9289 non-null object
Violent crime              9288 non-null object
Murder and manslaughter    9292 non-null int64
Rape                       5431 non-null object
Robbery                    9292 non-null object
Aggravated assault         9289 non-null object
Property crime             9288 non-null object
Burglary                   9290 non-null object
Larceny                    9290 non-null object
Motor vehicle theft        9292 non-null object
dtypes: int64(1), object(9)
memory usage: 726.0+ KB


In [44]:
# Discard every row that has a missing value in any remaining column.
# NOTE(review): per the recorded info() outputs this cuts the frame from
# 9292 to 5425 rows, mostly because 'Rape' is populated for only 5431 rows.
df = df.dropna(axis=0, how='any')

In [45]:
# Columns that df.info() reports as object dtype; presumably the counts were
# read as text because of ',' thousands separators, which are stripped here.
comma_formatted_columns = [
    'Violent crime', 'Rape', 'Aggravated assault', 'Property crime',
    'Burglary', 'Larceny', 'Motor vehicle theft', 'Robbery', 'Population',
]

for item in comma_formatted_columns:
    column = df[item]
    # Only text columns need the separator stripped.  The .str accessor raises
    # AttributeError on numeric data, so without this guard the cell fails when
    # it is re-run (the columns are already integers by then) or when a column
    # was parsed as numeric by read_csv in the first place.
    if column.dtype == object:
        column = column.str.replace(',', '')
    df[item] = column.astype(int)

In [46]:
# Re-check dtypes and non-null counts after dropping rows and converting the
# text columns to integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5425 entries, 0 to 9255
Data columns (total 10 columns):
Population                 5425 non-null int32
Violent crime              5425 non-null int32
Murder and manslaughter    5425 non-null int64
Rape                       5425 non-null int32
Robbery                    5425 non-null int32
Aggravated assault         5425 non-null int32
Property crime             5425 non-null int32
Burglary                   5425 non-null int32
Larceny                    5425 non-null int32
Motor vehicle theft        5425 non-null int32
dtypes: int32(9), int64(1)
memory usage: 275.5 KB


In [47]:
# Standardize every column with sklearn's preprocessing.scale and wrap the
# result back into a DataFrame under the original column labels.
# Note that the whole frame is scaled, so the later target column
# ('Violent crime') is standardized along with the features.
df = pd.DataFrame(
    data=preprocessing.scale(df),
    columns=df.columns,
)

In [48]:
# Add squared and cubed population terms.  They are computed from the
# already-standardized 'Population' column.
df['Population2'] = df['Population'].pow(2)
df['Population3'] = df['Population'].pow(3)

In [49]:
# Preview the scaled frame, including the two new population power columns.
df.head(n=5)

Unnamed: 0,Population,Violent crime,Murder and manslaughter,Rape,Robbery,Aggravated assault,Property crime,Burglary,Larceny,Motor vehicle theft,Population2,Population3
0,-0.272875,-0.123591,0.027145,-0.173987,-0.099642,-0.132037,-0.213101,-0.168071,-0.235997,-0.125142,0.074461,-0.020318
1,-0.234326,-0.107066,0.027145,-0.204052,-0.072167,-0.117411,-0.090246,-0.094137,-0.083777,-0.09497,0.054909,-0.012866
2,-0.312788,-0.144248,-0.108205,-0.173987,-0.110632,-0.157632,-0.231196,-0.198043,-0.251719,-0.128913,0.097837,-0.030602
3,0.326041,-0.055424,-0.108205,-0.143923,-0.050187,-0.044284,0.061656,-0.070159,0.1249,-0.038398,0.106303,0.034659
4,-0.019934,0.099501,0.162495,0.276983,-0.044692,0.167785,0.071656,0.031749,0.100602,-0.023312,0.000397,-8e-06


### Model Setup (features, target, train/test split)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

In [12]:
# Target: the (standardized) violent-crime column.
y = df['Violent crime']
# Features: every other column of the frame.
X = df.drop(labels='Violent crime', axis='columns')

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.3,
)

### Ridge Model

In [15]:
# Ridge regression with regularization strength alpha=10 and no intercept term.
ridgemodel = linear_model.Ridge(
    fit_intercept=False,
    alpha=10,
)

In [16]:
# Fit the ridge model on the training split; the fitted estimator is the
# cell's displayed value.
ridgemodel.fit(X=X_train, y=y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
# Ridge predictions for the held-out rows.
ridge_pred = ridgemodel.predict(X=X_test)

In [22]:
# Score of the ridge model on the training split.
# NOTE(review): only the training data is scored here; X_test / y_test are
# never scored in the visible cells -- consider reporting the held-out score.
ridgemodel.score(X=X_train, y=y_train)

0.99997217278889361

In [29]:
# Display the fitted ridge coefficients (one per feature column of X).
# NOTE(review): the recorded output lists 9 coefficients, while X built from
# the frame that includes Population2/Population3 would have 11 columns --
# looks like this output predates those columns; re-run top to bottom to confirm.
ridgemodel.coef_

array([ -1.78847733e-05,   2.88488926e-02,   7.12885219e-02,
         3.70224162e-01,   5.44033711e-01,   2.13678520e-03,
         7.46333324e-03,   5.45917954e-04,  -4.37260062e-05])

### Lasso Model

In [24]:
# Lasso regression with regularization strength alpha=0.35; all other
# settings are left at the estimator's defaults.
lassomodel = linear_model.Lasso(alpha=0.35)

In [50]:
# Fit the lasso model on the training split and keep the value returned by
# fit() under the name `lassofit`.
lassofit = lassomodel.fit(X=X_train, y=y_train)

In [51]:
# Lasso predictions for the held-out rows.
lasso_pred = lassomodel.predict(X=X_test)

In [26]:
# Print the lasso model's score on the training split.
# NOTE(review): as with the ridge model, only the training data is scored.
print(lassomodel.score(X=X_train, y=y_train))

0.91305200776


In [27]:
# Display the fitted lasso coefficients; the recorded output shows most of
# them shrunk to exactly zero.
# NOTE(review): the recorded output has 9 entries, but X as built above would
# have 11 columns -- the output appears stale; re-run top to bottom to confirm.
lassomodel.coef_

array([ 0.        ,  0.0031626 ,  0.        ,  0.33252325,  0.38343746,
        0.        ,  0.        ,  0.        ,  0.        ])

In [28]:
# Display the intercept fitted by the lasso model.
lassomodel.intercept_

0.00276737325236435