In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics
import seaborn as sns
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy.special import stdtr
from scipy import stats
from scipy.stats import ttest_ind
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

In [2]:
# Read the 2013 and 2014 crime CSV files (paths are relative to the working directory)
df2013, df2014 = (
    pd.read_csv(csv_path)
    for csv_path in ("ny_crime_2013.csv", "ny_crime_2014.csv")
)

In [3]:
#remove the revised-definition rape column from the 2013 frame
#(the author notes that it contains no data)
df2013 = df2013.drop(columns=['Rape\n(revised\ndefinition)1'])

In [4]:
#replace the raw CSV headers with short names
#both years share every label except the rape column(s), so build each list from common pieces
leading_cols = ['city', 'population', 'violent crime', 'murder']
trailing_cols = ['robbery', 'assault', 'property crime', 'burglary',
                 'larceny/theft', 'motor vehicle theft', 'arson']

df2013.columns = leading_cols + ['rape'] + trailing_cols
df2014.columns = leading_cols + ['rape v1', 'rape v2'] + trailing_cols

In [5]:
# For each year: discard the columns that are not used further, then strip the
# commas out of every string cell (regex replace of ',' with '').
# Note that 'arson' is removed from the 2013 frame only; the 2014 frame keeps it.
df2013 = (
    df2013
    .drop(['city', 'rape', 'arson'], axis=1)
    .replace({',': ''}, regex=True)
)
df2014 = (
    df2014
    .drop(['city', 'rape v1', 'rape v2'], axis=1)
    .replace({',': ''}, regex=True)
)

In [6]:
#cast the numeric columns of both frames to float
float_cols_2013 = ['population', 'violent crime', 'murder', 'robbery', 'assault',
                   'property crime', 'burglary', 'larceny/theft', 'motor vehicle theft']
# the 2014 frame still has the 'arson' column, so it is cast as well
float_cols_2014 = float_cols_2013 + ['arson']

df2013[float_cols_2013] = df2013[float_cols_2013].astype(float)
df2014[float_cols_2014] = df2014[float_cols_2014].astype(float)

In [7]:
#remove outliers (population more than 3 standard deviations from the mean)
def drop_population_outliers(frame, n_std=3):
    """Return a copy of ``frame`` without rows whose population is an outlier.

    A row is an outlier when the absolute distance of its 'population' value
    from the column mean is greater than ``n_std`` times the column's standard
    deviation.

    The mask is written as ``~(deviation > limit)`` instead of
    ``deviation <= limit`` so that rows with a NaN population are kept
    (comparisons against NaN evaluate to False).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a numeric 'population' column.
    n_std : float, default 3
        Number of standard deviations beyond which a row is dropped.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, so that later column
        assignments do not operate on a slice of the input frame.
    """
    population = frame['population']
    deviation = (population - population.mean()).abs()
    return frame[~(deviation > n_std * population.std())].copy()


df2013 = drop_population_outliers(df2013)
df2014 = drop_population_outliers(df2014)

In [8]:
#disable warning
# turns off pandas' chained-assignment warning for the rest of the session
pd.set_option('mode.chained_assignment', None)

#create population squared column
for crime_frame in (df2013, df2014):
    crime_frame['population squared'] = np.square(crime_frame['population'])

In [9]:
#turn the murder and robbery counts into 0/1 indicator columns:
#1 when the count is greater than zero, 0 otherwise
for crime_frame in (df2013, df2014):
    for indicator_col in ('murder', 'robbery'):
        crime_frame[indicator_col] = np.where(crime_frame[indicator_col] > 0, 1, 0)

In [10]:
#add the natural log of population as a new column in each frame
for crime_frame in (df2013, df2014):
    crime_frame['population log'] = np.log(crime_frame['population'])

In [11]:
#drop every row that has a NaN in any column
# NOTE(review): df2014 still carries 'arson' while df2013 does not, so any NaN in
# 'arson' removes 2014 rows here - confirm that is intended.
df2013, df2014 = (
    crime_frame.dropna(axis=0, how='any')
    for crime_frame in (df2013, df2014)
)

In [12]:
# Fit a linear regression of 2013 property crime on the population features
# (raw, squared, log) and the murder / robbery indicator columns.
X = df2013[['population', 'population squared', 'population log', 'murder', 'robbery']]
y = df2013['property crime']
regr = linear_model.LinearRegression()
regr.fit(X, y)

print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
# score() is evaluated on the same rows the model was fitted on (in-sample)
print('\nR-squared:')
print(regr.score(X, y))


Coefficients: 
 [5.65282537e-03 1.40183952e-07 6.07703663e+01 1.68486011e+02
 8.99482841e+01]

Intercept: 
 -474.6952485891337

R-squared:
0.8279720763932436


In [13]:
from sklearn.model_selection import cross_val_score

# cross-validate the regression on the 2013 features/target with cv=10 and
# print the per-fold scores (the estimator's default score)
cv_scores_2013 = cross_val_score(regr, X, y, cv=10)
print('Cross Validation Scores:\n', cv_scores_2013)

Cross Validation Scores:
 [-0.02621307  0.93727081  0.30749063  0.84185228  0.76152037  0.79262981
  0.52999227  0.89926571  0.78755618 -1.45071752]


In [14]:
# Same feature set and target as the 2013 model, taken from the 2014 frame.
X2 = df2014[['population', 'population squared', 'population log', 'murder', 'robbery']]
y2 = df2014['property crime']

# NOTE(review): this refits `regr` on the 2014 data, replacing the 2013 fit.
# If the goal is to validate the 2013 model against 2014 data, score without
# refitting - confirm the intent.
regr.fit(X2, y2)

print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
# score() is evaluated on the same rows the model was fitted on (in-sample)
print('\nR-squared:')
print(regr.score(X2, y2))


Coefficients: 
 [ 2.06000004e-02  3.96421071e-08 -5.82633953e+01  2.45264765e+02
  8.34775179e+01]

Intercept: 
 410.34687453880144

R-squared:
0.7550625074652999


In [15]:
# cross-validate the regression on the 2014 features/target with cv=10 and
# print the per-fold scores
cv_scores_2014 = cross_val_score(regr, X2, y2, cv=10)
print('Cross Validation Scores:\n', cv_scores_2014)

Cross Validation Scores:
 [ 0.80351479  0.8712423   0.67495926  0.768288    0.73621998  0.46295017
  0.53347672  0.56122402  0.81697679 -2.75639745]
