In [1]:
import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd")

In [8]:
#Read the data in
nyc_crime_raw = pd.read_csv('NYC_Crime_2014.csv', header=4)

In [9]:
#Take a quick glance at the data
nyc_crime_raw.head(10)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Unnamed: 13
0,Adams Village,1851,0,0.0,,0.0,0,0,11,1,10,0,0.0,
1,Addison Town and Village,2568,2,0.0,,0.0,1,1,49,1,47,1,0.0,
2,Afton Village4,820,0,0.0,0.0,,0,0,1,0,1,0,0.0,
3,Akron Village,2842,1,0.0,,0.0,0,1,17,0,17,0,0.0,
4,Albany4,98595,802,8.0,54.0,,237,503,3888,683,3083,122,12.0,
5,Albion Village4,5872,26,0.0,3.0,,2,21,204,41,159,4,0.0,
6,Alexandria Bay Village4,1107,0,0.0,0.0,,0,0,7,2,5,0,0.0,
7,Alfred Village4,4032,11,1.0,1.0,,0,9,30,6,24,0,0.0,
8,Altamont Village4,1723,1,0.0,0.0,,0,1,2,2,0,0,0.0,
9,Amherst Town4,118860,128,1.0,16.0,,43,68,2066,176,1846,44,2.0,


In [10]:
#Remove superfluous column
nyc_crime_raw.drop(columns=['Unnamed: 13'], inplace=True)

In [11]:
nyc_crime_raw.shape

(376, 13)

In [13]:
nyc_crime_raw.columns = [
            'city', 'population', 'murder', 
            'violent_crime', 'rape1', 
            'rape2', 'robbery', 'aggravated_assault',
            'property_crime', 'burglary', 'larceny', 'auto_theft', 'arson'
        ]

# set raw
nyc_crime_raw.describe()

Unnamed: 0,violent_crime,rape2,arson
count,369.0,142.0,365.0
mean,1.453,0.007,1.425
std,17.694,0.084,7.995
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,1.0
max,333.0,1.0,135.0


In [14]:
#Now that we know we have 376 observations across 12 columns, let's Find NaN's by column

missing_values_count = nyc_crime_raw.isnull().sum()
print(missing_values_count)

city                    1
population              7
murder                  7
violent_crime           7
rape1                 149
rape2                 234
robbery                 7
aggravated_assault      7
property_crime          8
burglary                7
larceny                 8
auto_theft              7
arson                  11
dtype: int64


In [16]:
nyc_crime_raw = nyc_crime_raw.dropna(subset=['population', 'robbery', 'burglary', 'property_crime'])

In [17]:
nyc_crime_raw['population'] = nyc_crime_raw['population'].apply(lambda x: str(x).replace(',', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
nyc_crime_raw['robbery'] = nyc_crime_raw['robbery'].apply(lambda x: str(x).replace(',', ''))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
nyc_crime_raw['burglary'] = nyc_crime_raw['burglary'].apply(lambda x: str(x).replace(',', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
nyc_crime_raw['property_crime'] = nyc_crime_raw['property_crime'].apply(lambda x: str(x).replace(',', ''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
nyc_crime_raw['larceny'] = nyc_crime_raw['larceny'].apply(lambda x: str(x).replace(',', ''))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
nyc_crime_raw['population'] = nyc_crime_raw['population'].astype(int)
nyc_crime_raw['robbery'] = nyc_crime_raw['robbery'].astype(int)
nyc_crime_raw['burglary'] = nyc_crime_raw['burglary'].astype(int)
nyc_crime_raw['property_crime'] = nyc_crime_raw['property_crime'].astype(int)
nyc_crime_raw['larceny'] = nyc_crime_raw['larceny'].astype(int)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [23]:
print(nyc_crime_raw['property_crime'].describe())
print(nyc_crime_raw['population'].describe())
print(nyc_crime_raw['robbery'].describe())
print(nyc_crime_raw['burglary'].describe())
print(nyc_crime_raw['larceny'].describe())

count      368.000
mean       698.361
std       7123.614
min          0.000
25%         25.000
50%         76.000
75%        271.500
max     135747.000
Name: property_crime, dtype: float64
count       368.000
mean      37888.399
std      441757.416
min          79.000
25%        2628.250
50%        6564.500
75%       15534.750
max     8473938.000
Name: population, dtype: float64
count     368.000
mean       60.823
std       867.655
min         0.000
25%         0.000
50%         1.000
75%         4.000
max     16581.000
Name: robbery, dtype: float64
count     368.000
mean      101.160
std       856.253
min         0.000
25%         4.000
50%        12.500
75%        39.000
max     15916.000
Name: burglary, dtype: float64
count      368.000
mean       562.791
std       5869.850
min          0.000
25%         20.000
50%         60.500
75%        228.500
max     112107.000
Name: larceny, dtype: float64


In [24]:
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['property_crime'] > 25]
nyc_crime_raw['property_crime'].quantile(0.95)
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['property_crime'] < 2009]
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['population'] > 2628]
nyc_crime_raw['population'].quantile(0.95)


44738.79999999999

In [25]:
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['population'] < 66486]


In [26]:
#Robbery is different. The 25th percentile is 0. Let's instead look at a lower quantile
nyc_crime_raw['robbery'].quantile(0.85)


12.0

In [28]:
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['robbery'] < 7]
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['burglary'] > 4]
nyc_crime_raw['burglary'].quantile(0.95)


61.19999999999999

In [30]:
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['burglary'] < 61]
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['larceny'] > 20]
nyc_crime_raw['larceny'].quantile(0.95)


349.25

In [31]:
nyc_crime_raw = nyc_crime_raw[nyc_crime_raw['larceny'] < 349]

nyc_crime_raw['PopulationSquared'] = nyc_crime_raw['population']**2


In [32]:
# Instantiate and fit our model.
regression = linear_model.LinearRegression()
Y = nyc_crime_raw['property_crime']
X = nyc_crime_raw[['population', 'PopulationSquared', 'burglary', 'robbery']]
regression.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regression.coef_)
print('\nIntercept: \n#Perform Cross-Validation

from sklearn.model_selection import cross_val_score
cross_val_score(regression, X, Y, cv=10)', regression.intercept_)
print('\nR-squared:')
print(regression.score(X, Y))



Coefficients: 
 [ 3.71191288e-03 -6.14920267e-08  4.31643508e+00  7.53440627e+00]

Intercept: 
 2.5397914914702966

R-squared:
0.6490968785254018


  linalg.lstsq(X, y)


### Validate the model  


In [33]:
#Perform Cross-Validation

from sklearn.model_selection import cross_val_score
cross_val_score(regression, X, Y, cv=10)

array([0.81977676, 0.75301842, 0.13816034, 0.26001248, 0.68295977,
       0.82813815, 0.81833987, 0.34091492, 0.32259765, 0.65306851])

In [34]:
# Use a ~ to represent an '=' from the functional form
linear_formula = 'property_crime ~ population+PopulationSquared+burglary+robbery'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=nyc_crime_raw).fit()
lm.params
lm.pvalues

Intercept           0.842
population          0.086
PopulationSquared   0.355
burglary            0.000
robbery             0.012
dtype: float64

In [35]:
lm.rsquared

0.6490968785254017

In [36]:
lm.conf_int()


Unnamed: 0,0,1
Intercept,-22.611,27.69
population,-0.001,0.008
PopulationSquared,-0.0,0.0
burglary,3.551,5.082
robbery,1.678,13.39


### Create revised model
### Removed Population and Population Squared, Added LarcenyTheft

In [38]:
# Instantiate and fit our model.
regression = linear_model.LinearRegression()
Y = nyc_crime_raw['property_crime']
X = nyc_crime_raw[['burglary', 'robbery', 'larceny']]
regression.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regression.coef_)
print('\nIntercept: \n', regression.intercept_)
print('\nR-squared:')
print(regression.score(X, Y))


Coefficients: 
 [1.01978963 0.35822054 1.01386312]

Intercept: 
 0.7318621733487589

R-squared:
0.9991601607971416


In [39]:
# Use a ~ to represent an '=' from the functional form
linear_formula = 'property_crime ~ burglary+robbery+larceny'

# Fit the model to our data using the formula.
lm = smf.ols(formula=linear_formula, data=nyc_crime_raw).fit()

In [40]:
lm.pvalues


Intercept   0.070
burglary    0.000
robbery     0.015
larceny     0.000
dtype: float64

In [42]:
#Perform Cross-Validation

from sklearn.model_selection import cross_val_score
cross_val_score(regression, X, Y, cv=10)


array([0.99907655, 0.99951516, 0.99905488, 0.99951741, 0.99910037,
       0.99954456, 0.99848302, 0.99490303, 0.9994934 , 0.99917967])