In [175]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

# Linear Regression Model challenge

First, I set up the working environment. Next, I imported my data, making sure to capture the correct column names and leave out extranious rows. Then I took my first look at the data.

In [176]:
data = ('/Users/Beba/Documents/JupyterNotebooks/NY2013Crimes.csv')
raw_crimes = pd.read_csv(data, skiprows=4)
raw_crimes.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,


## The next step is to do some data cleaning, rearranging and scaling.

In [5]:
Property_Crime = pd.DataFrame()

In [6]:
Property_Crime['City'] = raw_crimes['City']
Property_Crime['Population'] = raw_crimes['Population']
Property_Crime['Robbery'] = raw_crimes['Robbery']
Property_Crime['Murder'] = raw_crimes['Murder and\nnonnegligent\nmanslaughter']

In [7]:
Property_Crime['Population'] = Property_Crime['Population'].str.replace(',','').astype(float)

In [8]:
Property_Crime['PopulationSQ'] = Property_Crime['Population'].apply(lambda x: x*x)

In [13]:
Property_Crime.head()

Unnamed: 0,City,Population,Robbery,Murder,PopulationSQ
0,Adams Village,1861.0,0,0.0,3463321.0
1,Addison Town and Village,2577.0,0,0.0,6640929.0
2,Akron Village,2846.0,0,0.0,8099716.0
3,Albany,97956.0,227,8.0,9595378000.0
4,Albion Village,6388.0,4,0.0,40806540.0


In [261]:
Property_Crime['Robbery_Category'] = np.where((Property_Crime['Robbery'] == 0.0), 0, 1)

In [68]:
Property_Crime['Murder'] = Property_Crime['Murder'].astype(str)
Property_Crime['Murder_Category'] = np.where(Property_Crime['Murder'].str.contains("0.0"), 0, 1)

In [72]:
Property_Crime.drop(['Robbery'], axis=1, inplace=True)
Property_Crime.drop(['Murder'], axis=1, inplace=True)

In [76]:
Property_Crime.describe()

Unnamed: 0,Population,PopulationSQ,Robbery_Category,Murder_Category
count,348.0,348.0,351.0,351.0
mean,40037.63,203554700000.0,0.575499,0.145299
std,450037.4,3778876000000.0,0.494973,0.352905
min,526.0,276676.0,0.0,0.0
25%,3003.0,9018117.0,0.0,0.0
50%,7233.5,52325680.0,1.0,0.0
75%,18427.5,339753600.0,1.0,0.0
max,8396126.0,70494930000000.0,1.0,1.0


In [114]:
#Property_Crime['Population'] = raw_crimes['Population']
#Property_Crime['Population'] = Property_Crime['Population'].astype(float)
#Property_Crime['PopulationSQ'] = Property_Crime['Population'].apply(lambda x: x*x)
raw_crimes.isnull().sum()

City                                        0
Population                                  0
Violent\ncrime                              0
Murder and\nnonnegligent\nmanslaughter      0
Rape\n(revised\ndefinition)1              348
Rape\n(legacy\ndefinition)2                 0
Robbery                                     0
Aggravated\nassault                         0
Property\ncrime                             0
Burglary                                    0
Larceny-\ntheft                             0
Motor\nvehicle\ntheft                       0
Arson3                                    161
dtype: int64

In [109]:
Property_Crime.dropna(subset = ['Population', 'PopulationSQ'], inplace=True)

In [179]:
raw_crimes.dropna(subset = ['Property\ncrime',
                            'Population',
                            'Violent\ncrime',
                            'Robbery',
                            'Aggravated\nassault'], inplace=True)

In [199]:
regr = linear_model.LinearRegression()

In [294]:
Y = raw_crimes['Property\ncrime'].values.reshape(-1, 1)
X = Property_Crime['Population'].values.reshape(-1,1)
min_max_scaler = preprocessing.MinMaxScaler()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.35)


In [292]:
raw_crimes['Property\ncrime'] = raw_crimes['Property\ncrime'].astype(float)

In [386]:
raw_crimes['Population'] = raw_crimes['Population'].str.replace(',','').astype(float)

In [197]:
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)
Y_train_minmax = min_max_scaler.fit_transform(Y_train)
Y_test_minmax = min_max_scaler.transform(Y_test)

## Next up is the actual regression!

In [203]:
# the actual regression

regr.fit(X_train_minmax, Y_train_minmax)


print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax, Y_test_minmax))

Coefficients: 
 [[ 0.67780331]]
Intercept: 
 [-0.00938326]
0.113177244624


In [122]:
Property_Crime['Robbery'] = raw_crimes['Robbery'].str.replace(',','').astype(float)

In [123]:
Property_Crime['Violent_Crime'] = raw_crimes['Violent\ncrime'].str.replace(',','').astype(float)

Population alone only seems to account for 11% of the variance in our model. Let's try some different features.

In [204]:
X3 = Property_Crime[['Population', 'Murder', 'Rape', 'Assault']]
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y, test_size=.35)

X_train_minmax3 = min_max_scaler.fit_transform(X_train3)
X_test_minmax3 = min_max_scaler.transform(X_test3)
Y_train_minmax3 = min_max_scaler.fit_transform(Y_train3)
Y_test_minmax3 = min_max_scaler.transform(Y_test3)

regr.fit(X_train_minmax3, Y_train_minmax3)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax3, Y_test_minmax3))


Coefficients: 
 [[ 0.54413717  0.13946844  0.53073963 -0.21450447]]
Intercept: 
 [ 0.00049404]
0.950128885772


This set of features seems to explain a huge amount of variance in our data. 

In [147]:
Property_Crime['Murder'] = raw_crimes['Murder and\nnonnegligent\nmanslaughter']
Property_Crime['Rape'] = raw_crimes['Rape\n(legacy\ndefinition)2'].str.replace(',','').astype(float)
Property_Crime['Assault'] = raw_crimes['Aggravated\nassault'].str.replace(',','').astype(float)
Property_Crime['Burglary'] = raw_crimes['Burglary'].str.replace(',','').astype(float)
Property_Crime['Larceny'] = raw_crimes['Larceny-\ntheft'].str.replace(',','').astype(float)
Property_Crime['Motor_vehicle_theft'] = raw_crimes['Motor\nvehicle\ntheft'].str.replace(',','').astype(float)

Add new columns for new features, to test in the model. And maybe split it up into different sized cities and see if that affects it.

In [389]:
Property_Crime['Small_City'] = np.where((Property_Crime['Population'] < 100000), 1, 0)
Property_Crime['Medium_City'] = np.where(((Property_Crime['Population'] >= 100000) & (Property_Crime['Population'] < 1000000)), 1, 0)
Property_Crime['Big_City'] = np.where((Property_Crime['Population'] >= 1000000), 1, 0)

In [390]:
Property_Crime.Small_City.sum()

342

In [231]:
X4 = Property_Crime[['Population', 'Small_City', 'Murder', 'Rape']]
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4, Y, test_size=.35)

X_train_minmax4 = min_max_scaler.fit_transform(X_train4)
X_test_minmax4 = min_max_scaler.transform(X_test4)
Y_train_minmax4 = min_max_scaler.fit_transform(Y_train4)
Y_test_minmax4 = min_max_scaler.transform(Y_test4)

regr.fit(X_train_minmax4, Y_train_minmax4)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax4, Y_test_minmax4))


Coefficients: 
 [[ 0.45925495  0.00119685  0.04907988  0.49060003]]
Intercept: 
 [ 0.00015373]
0.938932421558


In [242]:
Y_pred = regr.fit(X_train_minmax4, Y_train_minmax4).predict(X_train_minmax4)


In [278]:
Small_City_Crime = Property_Crime[Property_Crime['Small_City'] == 1]

In [452]:
Property_Crime.sort_values('Population', ascending=False)

Unnamed: 0,City,Population,PopulationSQ,Robbery_Category,Murder_Category,Robbery,Violent_Crime,Murder,Rape,Assault,Burglary,Larceny,Motor_vehicle_theft,Small_City,Medium_City,Big_City
216,New York,8396126.0,7.049493e+13,1,1,19170.0,52384.0,335.0,1112.0,31767.0,16606.0,117931.0,7434.0,0,0,1
35,Buffalo,258789.0,6.697175e+10,1,1,1322.0,3249.0,47.0,145.0,1735.0,3458.0,8076.0,957.0,0,1,0
272,Rochester,210562.0,4.433636e+10,1,1,918.0,2107.0,42.0,92.0,1055.0,2587.0,6855.0,609.0,0,1,0
346,Yonkers,199134.0,3.965435e+10,1,1,390.0,1036.0,6.0,25.0,615.0,470.0,1662.0,236.0,0,1,0
310,Syracuse,143834.0,2.068822e+10,1,1,400.0,1192.0,21.0,75.0,696.0,1781.0,4298.0,394.0,0,1,0
7,Amherst Town,118296.0,1.399394e+10,1,1,31.0,107.0,1.0,7.0,68.0,204.0,1882.0,32.0,0,1,0
3,Albany,97956.0,9.595378e+09,1,1,227.0,791.0,8.0,30.0,526.0,705.0,3243.0,142.0,1,0,0
126,Greece Town,96667.0,9.344509e+09,1,0,60.0,151.0,0.0,9.0,82.0,332.0,1925.0,46.0,1,0,0
267,Ramapo Town,87204.0,7.604538e+09,1,1,9.0,57.0,2.0,5.0,41.0,88.0,466.0,13.0,1,0,0
57,Clarkstown Town,80705.0,6.513297e+09,1,0,17.0,65.0,0.0,8.0,40.0,99.0,1388.0,28.0,1,0,0


In [329]:
X5 = Property_Crime[['Robbery', 'Murder', 'Rape', 'Assault']]
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X5, Y, test_size=.35)

X_train_minmax5 = min_max_scaler.fit_transform(X_train5)
X_test_minmax5 = min_max_scaler.transform(X_test5)
Y_train_minmax5 = min_max_scaler.fit_transform(Y_train5)
Y_test_minmax5 = min_max_scaler.transform(Y_test5)

regr.fit(X_train_minmax5, Y_train_minmax5)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax5, Y_test_minmax5))


Coefficients: 
 [[-0.44113872  0.01812845  0.43159857  0.99043029]]
Intercept: 
 [ 0.00113606]
0.943844210761


Gonna give cross validation a whirl. woo. o.O

In [463]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regr.fit(X_train_minmax5, Y_train_minmax5), X5, Y, cv=3)
scores  

array([ 0.90119075,  0.99949331,  0.9130164 ])

In [462]:
X_scaled = preprocessing.scale(X_train5)
Y_scaled = preprocessing.scale(Y_train5) #?

cross_val_score(regr.fit(X_scaled, Y_scaled), X_test5, Y_test5, cv=3)

array([ 0.91559441,  0.78017271,  0.91972089])

In [337]:
regr.fit(X_scaled, Y_scaled).score(X_test5, Y_test5)

0.076638318690665286

In [338]:
regr.fit(X_train5, Y_train5).score(X_test5, Y_test5)

0.9438442107607844

In [363]:
from sklearn.feature_selection import f_regression

In [None]:
raw_crimes['Population']

In [387]:
Small_City_Y = raw_crimes['Property\ncrime'].where((raw_crimes['Population'] < 100000), inplace=False)

In [394]:
Small_City_Y.dropna(inplace=True)

In [461]:
X6 = Small_City_Crime[['Population', 'Murder', 'Rape', 'Assault']]
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(X6, Small_City_Y, test_size=.3)

X_train_minmax6 = min_max_scaler.fit_transform(X_train6)
X_test_minmax6 = min_max_scaler.transform(X_test6)
Y_train_minmax6 = min_max_scaler.fit_transform(Y_train6)
Y_test_minmax6 = min_max_scaler.transform(Y_test6)

X_train_minmax6.reshape(-1, 1)
X_test_minmax6.reshape(-1, 1)
Y_train_minmax6.reshape(-1, 1)
Y_test_minmax6.reshape(-1, 1)

regr.fit(X_train_minmax6, Y_train_minmax6)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax6, Y_test_minmax6))

scores = cross_val_score(regr.fit(X_train_minmax6, Y_train_minmax6), X6, Small_City_Y, cv=3)
print('Cross Validation Scores', scores) 

Coefficients: 
 [ 0.37881618 -0.08825847  0.28962952  0.34841175]
Intercept: 
 -0.00403964951775
0.86132743872
Cross Validation Scores [ 0.90055842  0.791319    0.85652342]




This model seems to explain a good amount of variance as well. That's a satisfactory R-squared value. Time to check our assumptions.

In [426]:
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std


In [443]:
f_regression(X6, Small_City_Y)

(array([ 877.70192747,  184.38798482,  555.01269269,  661.74610575]),
 array([  3.28045971e-96,   7.42987890e-34,   1.90249174e-73,
          8.86905997e-82]))

In [459]:
f_regression(X5, Y.ravel())

(array([ 97866.48226054,  37370.82225754,  79939.59289857,  72483.91476369]),
 array([ 0.,  0.,  0.,  0.]))

In [460]:
LinearRegression.get_params()

NameError: name 'LinearRegression' is not defined

## Linear Relationship

## Next lets see what the residuals look like

In [266]:
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [453]:
Small_City_Crime.head()

Unnamed: 0,City,Population,PopulationSQ,Robbery_Category,Murder_Category,Robbery,Murder,Rape,Assault
0,Adams Village,1861.0,3463321.0,0,0,0.0,0.0,0.0,0.0
1,Addison Town and Village,2577.0,6640929.0,0,0,0.0,0.0,0.0,3.0
2,Akron Village,2846.0,8099716.0,0,0,0.0,0.0,0.0,3.0
3,Albany,97956.0,9595378000.0,1,1,227.0,8.0,30.0,526.0
4,Albion Village,6388.0,40806540.0,1,0,4.0,0.0,3.0,16.0


In [467]:
lm = smf.ols(formula='Y ~ Murder+Rape+Assault', data=Property_Crime)

In [469]:
lm.params

AttributeError: 'OLS' object has no attribute 'params'

In [470]:
lm.pvalues

AttributeError: 'OLS' object has no attribute 'pvalues'