In [360]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

from scipy import stats, optimize
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, Ridge, LassoLars, BayesianRidge, ARDRegression, Lars
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression

from sklearn.svm import LinearSVR
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error, mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

import seaborn as sns

print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The numpy version is {}.'.format(np.__version__))

The scikit-learn version is 0.18.1.
The pandas version is 0.19.2.
The numpy version is 1.12.0.


In [2]:
goal_features = ['murders', 'murdPerPop', 'rapes', 'rapesPerPop', 'robberies','robbbPerPop',
                 'assaults', 'assaultPerPop', 'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop',
                 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop', 'violentPerPop', 'nonViolPerPop']

non_predictive_features = ['communityname', 'state', 'countyCode', 'communityCode', 'fold']

In [3]:
df = pd.read_csv('../datasets/UnnormalizedCrimeData.csv');
df = df.replace('?',np.NAN)
features = [x for x in df.columns if x not in goal_features and x not in non_predictive_features]
len(features)

124

In [4]:
def drop_rows_with_null_goal_feature(old_df, feature):
    new_df = old_df.dropna(subset=[feature])
    return new_df

In [356]:
clf = Pipeline([
  ('feature_selection', SelectKBest(f_regression, k=100)),
  ('regression', GradientBoostingRegressor())
])

goal_feature = 'murders'


goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(goal_df[features])
imputed_data = imr.transform(goal_df[features]);

df_X_train, df_X_test, df_y_train, df_y_test = \
        train_test_split(imputed_data, goal_df[goal_feature], test_size=0.3)
    
clf.fit(df_X_train, df_y_train)
        
mse = mean_squared_error(df_y_test, clf.predict(df_X_test))
r2_sc = r2_score(df_y_test, clf.predict(df_X_test))
print mse, r2_sc

2270.04065665 0.643525337149


In [358]:
clf = Pipeline([
  ('feature_selection', SelectFromModel(Lasso())),
  ('regression', GradientBoostingRegressor())
])

goal_feature = 'rapes'

goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df[[goal_feature]] = goal_df[[goal_feature]].apply(pd.to_numeric)
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(goal_df[features])
imputed_data = imr.transform(goal_df[features]);

df_X_train, df_X_test, df_y_train, df_y_test = \
        train_test_split(imputed_data, goal_df[goal_feature], test_size=0.2)
    
clf.fit(df_X_train, df_y_train)
        
mse = mean_squared_error(df_y_test, clf.predict(df_X_test))
r2_sc = r2_score(df_y_test, clf.predict(df_X_test))
print mse, r2_sc

6895.77199625 0.722186308942


In [422]:
clf = Pipeline([
  ('feature_selection', SelectFromModel(Lasso())),
  ('regression', RandomForestRegressor())
])

goal_feature = 'robberies'


goal_df = drop_rows_with_null_goal_feature(df, goal_feature)
goal_df[[goal_feature]] = goal_df[[goal_feature]].apply(pd.to_numeric)
goal_df = goal_df[goal_df.robberies < goal_df.robberies.quantile(.95)]

imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr = imr.fit(goal_df[features])
imputed_data = imr.transform(goal_df[features]);

df_X_train, df_X_test, df_y_train, df_y_test = \
        train_test_split(imputed_data, goal_df[goal_feature], test_size=0.1)
    
clf.fit(df_X_train, df_y_train)
        
mse = mean_squared_error(df_y_test, clf.predict(df_X_test))
r2_sc = r2_score(df_y_test, clf.predict(df_X_test))
print mse, r2_sc

1287.28014218 0.771254582163


count    2103.000000
mean       55.777936
std        97.983414
min         0.000000
25%         5.000000
50%        16.000000
75%        56.000000
max       628.000000
Name: robberies, dtype: float64

In [227]:
clf.predict(df_X_test)

array([  8.49854082e+02,   8.45285204e+01,   1.26882824e+01,
         1.00485558e+02,  -3.25917366e-02,  -9.66562355e+01,
        -4.73182949e+01,   1.91892554e+01,  -3.20305167e+00,
         1.30965754e+02,   2.85115786e+01,   8.30610045e+02,
         1.53591703e+01,  -3.64983050e+00,  -9.41794406e+00,
         1.49562775e+01,  -1.29270589e+01,  -6.18773727e+00,
         1.32413083e+02,   4.14333184e+01,   2.16524345e+01,
         4.06827778e+01,   7.99853150e+01,   1.54784515e+01,
        -2.40857509e+01,   1.40905284e+01,  -1.00225658e+01,
         7.81143558e+01,   2.80540349e+01,  -1.32576269e+01,
         5.31412484e+01,   1.23455921e+03,   7.79918975e+00,
        -2.94725848e+01,  -3.31264200e+01,   1.17213226e+02,
        -1.63410816e+02,   5.45785720e+01,   2.14796451e+01,
         3.00109380e+02,  -1.36617902e+01,   2.43888565e+01,
         5.26940542e+01,   1.58114780e+03,   5.68340128e+00,
         2.25855956e+02,  -2.12694948e+01,   2.62830107e+01,
         3.62474441e+00,