In [175]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

# Linear Regression Model challenge

First, I set up the working environment. Next, I imported my data, making sure to capture the correct column names and leave out extranious rows. Then I took my first look at the data.

In [176]:
data = ('/Users/Beba/Documents/JupyterNotebooks/NY2013Crimes.csv')
raw_crimes = pd.read_csv(data, skiprows=4)
raw_crimes.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,


## The next step is to do some data cleaning, rearranging and scaling.

In [5]:
Property_Crime = pd.DataFrame()

In [6]:
Property_Crime['City'] = raw_crimes['City']
Property_Crime['Population'] = raw_crimes['Population']
Property_Crime['Robbery'] = raw_crimes['Robbery']
Property_Crime['Murder'] = raw_crimes['Murder and\nnonnegligent\nmanslaughter']

In [7]:
Property_Crime['Population'] = Property_Crime['Population'].str.replace(',','').astype(float)

In [8]:
Property_Crime['PopulationSQ'] = Property_Crime['Population'].apply(lambda x: x*x)

In [13]:
Property_Crime.head()

Unnamed: 0,City,Population,Robbery,Murder,PopulationSQ
0,Adams Village,1861.0,0,0.0,3463321.0
1,Addison Town and Village,2577.0,0,0.0,6640929.0
2,Akron Village,2846.0,0,0.0,8099716.0
3,Albany,97956.0,227,8.0,9595378000.0
4,Albion Village,6388.0,4,0.0,40806540.0


In [261]:
Property_Crime['Robbery_Category'] = np.where((Property_Crime['Robbery'] == 0.0), 0, 1)

In [68]:
Property_Crime['Murder'] = Property_Crime['Murder'].astype(str)
Property_Crime['Murder_Category'] = np.where(Property_Crime['Murder'].str.contains("0.0"), 0, 1)

In [72]:
Property_Crime.drop(['Robbery'], axis=1, inplace=True)
Property_Crime.drop(['Murder'], axis=1, inplace=True)

In [76]:
Property_Crime.describe()

Unnamed: 0,Population,PopulationSQ,Robbery_Category,Murder_Category
count,348.0,348.0,351.0,351.0
mean,40037.63,203554700000.0,0.575499,0.145299
std,450037.4,3778876000000.0,0.494973,0.352905
min,526.0,276676.0,0.0,0.0
25%,3003.0,9018117.0,0.0,0.0
50%,7233.5,52325680.0,1.0,0.0
75%,18427.5,339753600.0,1.0,0.0
max,8396126.0,70494930000000.0,1.0,1.0


In [114]:
#Property_Crime['Population'] = raw_crimes['Population']
#Property_Crime['Population'] = Property_Crime['Population'].astype(float)
#Property_Crime['PopulationSQ'] = Property_Crime['Population'].apply(lambda x: x*x)
raw_crimes.isnull().sum()

City                                        0
Population                                  0
Violent\ncrime                              0
Murder and\nnonnegligent\nmanslaughter      0
Rape\n(revised\ndefinition)1              348
Rape\n(legacy\ndefinition)2                 0
Robbery                                     0
Aggravated\nassault                         0
Property\ncrime                             0
Burglary                                    0
Larceny-\ntheft                             0
Motor\nvehicle\ntheft                       0
Arson3                                    161
dtype: int64

In [109]:
Property_Crime.dropna(subset = ['Population', 'PopulationSQ'], inplace=True)

In [179]:
raw_crimes.dropna(subset = ['Property\ncrime',
                            'Population',
                            'Violent\ncrime',
                            'Robbery',
                            'Aggravated\nassault'], inplace=True)

In [199]:
regr = linear_model.LinearRegression()

In [294]:
Y = raw_crimes['Property\ncrime'].values.reshape(-1, 1)
X = Property_Crime['Population'].values.reshape(-1,1)
min_max_scaler = preprocessing.MinMaxScaler()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.35)


In [292]:
raw_crimes['Property\ncrime'] = raw_crimes['Property\ncrime'].astype(float)

In [386]:
raw_crimes['Population'] = raw_crimes['Population'].str.replace(',','').astype(float)

In [197]:
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)
Y_train_minmax = min_max_scaler.fit_transform(Y_train)
Y_test_minmax = min_max_scaler.transform(Y_test)

## Next up is the actual regression!

In [203]:
regr.fit(X_train_minmax, Y_train_minmax) # Scaled data


print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax, Y_test_minmax))

Coefficients: 
 [[ 0.67780331]]
Intercept: 
 [-0.00938326]
0.113177244624


In [626]:
regr.fit(X_train, Y_train)


print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test, Y_test))

Coefficients: 
 [[ 0.01693097]]
Intercept: 
 [ 98.90840404]
0.589009577266


In [122]:
Property_Crime['Robbery'] = raw_crimes['Robbery'].str.replace(',','').astype(float)

In [123]:
Property_Crime['Violent_Crime'] = raw_crimes['Violent\ncrime'].str.replace(',','').astype(float)

## Testing with different features

In [204]:
X3 = Property_Crime[['Population', 'Murder', 'Rape', 'Assault']]
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y, test_size=.35)

X_train_minmax3 = min_max_scaler.fit_transform(X_train3)
X_test_minmax3 = min_max_scaler.transform(X_test3)
Y_train_minmax3 = min_max_scaler.fit_transform(Y_train3)
Y_test_minmax3 = min_max_scaler.transform(Y_test3)

regr.fit(X_train_minmax3, Y_train_minmax3)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax3, Y_test_minmax3))


Coefficients: 
 [[ 0.54413717  0.13946844  0.53073963 -0.21450447]]
Intercept: 
 [ 0.00049404]
0.950128885772


This set of features seems to explain a huge amount of variance in the data. 

In [147]:
Property_Crime['Murder'] = raw_crimes['Murder and\nnonnegligent\nmanslaughter']
Property_Crime['Rape'] = raw_crimes['Rape\n(legacy\ndefinition)2'].str.replace(',','').astype(float)
Property_Crime['Assault'] = raw_crimes['Aggravated\nassault'].str.replace(',','').astype(float)
Property_Crime['Burglary'] = raw_crimes['Burglary'].str.replace(',','').astype(float)
Property_Crime['Larceny'] = raw_crimes['Larceny-\ntheft'].str.replace(',','').astype(float)
Property_Crime['Motor_vehicle_theft'] = raw_crimes['Motor\nvehicle\ntheft'].str.replace(',','').astype(float)

## Adjust for population differences

Split the dataset into Big, Medium, and Small cities and test again. Big are more than 1million people, medium are between 100,000 and 1,000,000 and small are less than 100,000. 

In [389]:
Property_Crime['Small_City'] = np.where((Property_Crime['Population'] < 100000), 1, 0)
Property_Crime['Medium_City'] = np.where(((Property_Crime['Population'] >= 100000) & (Property_Crime['Population'] < 1000000)), 1, 0)
Property_Crime['Big_City'] = np.where((Property_Crime['Population'] >= 1000000), 1, 0)

In [390]:
Property_Crime.Small_City.sum()

342

In [231]:
X4 = Property_Crime[['Population', 'Small_City', 'Murder', 'Rape']]
X_train4, X_test4, Y_train4, Y_test4 = train_test_split(X4, Y, test_size=.35)

X_train_minmax4 = min_max_scaler.fit_transform(X_train4)
X_test_minmax4 = min_max_scaler.transform(X_test4)
Y_train_minmax4 = min_max_scaler.fit_transform(Y_train4)
Y_test_minmax4 = min_max_scaler.transform(Y_test4)

regr.fit(X_train_minmax4, Y_train_minmax4)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax4, Y_test_minmax4))


Coefficients: 
 [[ 0.45925495  0.00119685  0.04907988  0.49060003]]
Intercept: 
 [ 0.00015373]
0.938932421558


In [329]:
X5 = Property_Crime[['Robbery', 'Murder', 'Rape', 'Assault']]
X_train5, X_test5, Y_train5, Y_test5 = train_test_split(X5, Y, test_size=.35)

X_train_minmax5 = min_max_scaler.fit_transform(X_train5)
X_test_minmax5 = min_max_scaler.transform(X_test5)
Y_train_minmax5 = min_max_scaler.fit_transform(Y_train5)
Y_test_minmax5 = min_max_scaler.transform(Y_test5)

regr.fit(X_train_minmax5, Y_train_minmax5)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax5, Y_test_minmax5))


Coefficients: 
 [[-0.44113872  0.01812845  0.43159857  0.99043029]]
Intercept: 
 [ 0.00113606]
0.943844210761


## Here comes cross validation

In [463]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regr.fit(X_train_minmax5, Y_train_minmax5), X5, Y, cv=3)
scores  

array([ 0.90119075,  0.99949331,  0.9130164 ])

In [462]:
X_scaled = preprocessing.scale(X_train5)
Y_scaled = preprocessing.scale(Y_train5) 

cross_val_score(regr.fit(X_scaled, Y_scaled), X_test5, Y_test5, cv=3)

array([ 0.91559441,  0.78017271,  0.91972089])

In [363]:
from sklearn.feature_selection import f_regression

In [630]:
Property_Crime['Population'].sort_values(ascending=False);

## Double checking populations, only 6 cities in New York State have more than 100,000 residents. 

In [629]:
Small_City_Crime = Property_Crime[Property_Crime['Small_City'] == 1]

In [387]:
Small_City_Y = raw_crimes['Property\ncrime'].where((raw_crimes['Population'] < 100000), inplace=False)

In [394]:
Small_City_Y.dropna(inplace=True)

In [555]:
X6 = Small_City_Crime[['Population', 'Murder', 'Rape', 'Assault']]
X_train6, X_test6, Y_train6, Y_test6 = train_test_split(X6, Small_City_Y, test_size=.3)

X_train_minmax6 = min_max_scaler.fit_transform(X_train6)
X_test_minmax6 = min_max_scaler.transform(X_test6)
Y_train_minmax6 = min_max_scaler.fit_transform(Y_train6)
Y_test_minmax6 = min_max_scaler.transform(Y_test6)

X_train_minmax6.reshape(-1, 1)
X_test_minmax6.reshape(-1, 1)
Y_train_minmax6.reshape(-1, 1)
Y_test_minmax6.reshape(-1, 1)

regr.fit(X_train_minmax6, Y_train_minmax6)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print(regr.score(X_test_minmax6, Y_test_minmax6))

scores = cross_val_score(regr.fit(X_train_minmax6, Y_train_minmax6), X6, Small_City_Y, cv=3)
print('Cross Validation Scores', scores) 

Coefficients: 
 [ 0.42739251 -0.22605051  0.3595223   0.50651963]
Intercept: 
 0.000122341884906
0.91282960652
Cross Validation Scores [ 0.90055842  0.791319    0.85652342]




In [443]:
f_regression(X6, Small_City_Y)

(array([ 877.70192747,  184.38798482,  555.01269269,  661.74610575]),
 array([  3.28045971e-96,   7.42987890e-34,   1.90249174e-73,
          8.86905997e-82]))

## Time to test on new data

In [549]:
data2 = ('/Users/Beba/Documents/JupyterNotebooks/NJ_2013Crime.csv')
raw_crimes2 = pd.read_csv(data2, skiprows=4)
raw_crimes2.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Aberdeen Township,18150,19,0.0,,0.0,13,6.0,237,42,185,10,0.0
1,Absecon,8380,21,0.0,,2.0,4,15.0,266,91,169,6,0.0
2,Allendale,6712,0,0.0,,0.0,0,0.0,37,6,29,2,0.0
3,Allenhurst,493,0,0.0,,0.0,0,0.0,39,20,18,1,0.0
4,Allentown,1812,3,0.0,,0.0,0,3.0,18,12,6,0,0.0


In [560]:
raw_crimes2['Population'] = raw_crimes2['Population'].str.replace(',','').astype(float)

In [633]:
raw_crimes2['Population'].sort_values(ascending=False);

In [591]:
Small_City_Jersey = raw_crimes2.where((raw_crimes2['Population'] < 100000), inplace=False)
# Jersey also only has 6 cities with more than 100,000 people

In [592]:
Small_City_Jersey.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Aberdeen Township,18150.0,19,0.0,,0.0,13,6.0,237,42,185,10,0.0
1,Absecon,8380.0,21,0.0,,2.0,4,15.0,266,91,169,6,0.0
2,Allendale,6712.0,0,0.0,,0.0,0,0.0,37,6,29,2,0.0
3,Allenhurst,493.0,0,0.0,,0.0,0,0.0,39,20,18,1,0.0
4,Allentown,1812.0,3,0.0,,0.0,0,3.0,18,12,6,0,0.0


In [594]:
Small_City_Jersey_Crime = pd.DataFrame()

Small_City_Jersey_Crime['City'] = Small_City_Jersey['City']
Small_City_Jersey_Crime['Population'] = Small_City_Jersey['Population']
Small_City_Jersey_Crime['Murder'] = Small_City_Jersey['Murder and\nnonnegligent\nmanslaughter'].astype(float)
Small_City_Jersey_Crime['Rape'] = Small_City_Jersey['Rape\n(legacy\ndefinition)2'].astype(float)
Small_City_Jersey_Crime['Assault'] = Small_City_Jersey['Aggravated\nassault'].astype(float)
Small_City_Jersey_Crime['Robbery'] = Small_City_Jersey['Robbery'].str.replace(',','').astype(float)


In [596]:
Small_City_Jersey_Crime.isnull().sum()

City          8
Population    8
Murder        8
Rape          8
Assault       8
Robbery       8
dtype: int64

In [598]:
Small_City_Jersey_Crime.dropna(axis=0, inplace=True)

In [599]:
Small_City_Jersey_Crime.head()

Unnamed: 0,City,Population,Murder,Rape,Assault,Robbery
0,Aberdeen Township,18150.0,0.0,0.0,6.0,13.0
1,Absecon,8380.0,0.0,2.0,15.0,4.0
2,Allendale,6712.0,0.0,0.0,0.0,0.0
3,Allenhurst,493.0,0.0,0.0,0.0,0.0
4,Allentown,1812.0,0.0,0.0,3.0,0.0


In [617]:
New_Y = Small_City_Jersey['Property\ncrime'].str.replace(',','').astype(float)

In [605]:
New_Y.dropna(axis=0, inplace=True)

In [657]:
X7 = Small_City_Jersey_Crime[['Population', 'Murder', 'Rape', 'Assault']]

scores = cross_val_score(regr.fit(X_train_minmax6, Y_train_minmax6), X7, New_Y, cv=3)
print('Cross Validation Scores', scores) 

Cross Validation Scores [ 0.82296781  0.72865359  0.65782118]


## create revised model and test again

In [658]:
X8 = Small_City_Jersey_Crime[['Population', 'Robbery', 'Murder', 'Rape', 'Assault']]

scores = cross_val_score(regr.fit(X_train_minmax6, Y_train_minmax6), X8, New_Y, cv=3)
print('Cross Validation Scores', scores) 

Cross Validation Scores [ 0.83960425  0.64086009  0.63974305]


# Brief Write Up
    
I chose to use cross_val_score because it easily allows me to choose how many cross validations to do, and gives me a score for one. 

## Revised model

I chose to add 'Robbery' back in to the features to see how it affected the score. 