In [1]:
# Populates the interactive namespace from numpy and matplotlib and selects the
# inline plotting backend.
# NOTE(review): %pylab is a star-import in disguise; the next cell already imports
# numpy and pyplot explicitly, so `%matplotlib inline` may be enough. Confirm that
# no later cell relies on a bare pylab name before switching.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Communities & Crime Un-normalized dataset

Communities in the US. The data combines socio-economic data from the 1990 Census, law enforcement data from the 1990 Law Enforcement Management and Administrative Statistics (LEMAS) survey, and crime data from the 1995 FBI Uniform Crime Reports (UCR).

This dataset consists of 2215 instances, one per community, with crime data that has been reported from across all the states in the United States of America.

Total number of features  = 147

<br/>
5 - Non-predictive features

-- communityname: Community name - not predictive - for information only (string) 
<br/>
-- state: US state (by 2 letter postal abbreviation)(nominal) 
<br/>
-- countyCode: numeric code for county - not predictive, and many missing values (numeric) 
<br/>
-- communityCode: numeric code for community - not predictive and many missing values (numeric) 
<br/>
-- fold: fold number for non-random 10 fold cross validation, potentially useful for debugging, paired tests - not predictive (numeric - integer) 
<br/>

124 - Predictive features : More details on these can be found [here](https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized)
<br/>

18 potential goal features which are listed below

## Potential Goal features : To be predicted

-- murders: number of murders in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted)
<br />
-- murdPerPop: number of murders per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
<br />
-- rapes: number of rapes in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- rapesPerPop: number of rapes per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- robberies: number of robberies in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- robbbPerPop: number of robberies per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- assaults: number of assaults in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- assaultPerPop: number of assaults per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- burglaries: number of burglaries in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- burglPerPop: number of burglaries per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- larcenies: number of larcenies in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- larcPerPop: number of larcenies per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- autoTheft: number of auto thefts in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- autoTheftPerPop: number of auto thefts per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- arsons: number of arsons in 1995 (numeric - expected to be integer) potential GOAL attribute (to be predicted) 
<br />
-- arsonsPerPop: number of arsons per 100K population (numeric - decimal) potential GOAL attribute (to be predicted) 
<br />
-- violentPerPop (listed as ViolentCrimesPerPop in the UCI documentation): total number of violent crimes per 100K population (numeric - decimal) GOAL attribute (to be predicted)
<br />
-- nonViolPerPop: total number of non-violent crimes per 100K population (numeric - decimal) potential GOAL attribute (to be predicted)
<br />

In [2]:
# Standard library
from itertools import combinations

# Numerical / plotting stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats, optimize

# scikit-learn
import sklearn
from sklearn.base import clone
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import explained_variance_score, r2_score, median_absolute_error
from sklearn.pipeline import Pipeline
# NOTE(review): `Imputer` was removed in scikit-learn 0.22. Its replacement,
# `sklearn.impute.SimpleImputer`, has a different signature (no `axis` argument),
# so it is deliberately not aliased here; migrate the call sites together with
# this import when upgrading.
from sklearn.preprocessing import Imputer

# `sklearn.cross_validation` was deprecated in 0.18 and removed in 0.20.
# `sklearn.model_selection` provides the same `train_test_split`; the fallback
# keeps the cell importable on releases older than 0.18.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Record the library versions this notebook was executed with.
print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('The pandas version is {}.'.format(pd.__version__))
print('The numpy version is {}.'.format(np.__version__))

The scikit-learn version is 0.18.1.
The pandas version is 0.19.2.
The numpy version is 1.12.0.




In [15]:
# Candidate prediction targets. Each crime type is listed as a raw count followed
# by its rate per 100K population; the final row holds the two aggregate rates.
goal_features = [
    'murders', 'murdPerPop',
    'rapes', 'rapesPerPop',
    'robberies', 'robbbPerPop',  # three b's: this is the dataset's own column name
    'assaults', 'assaultPerPop',
    'burglaries', 'burglPerPop',
    'larcenies', 'larcPerPop',
    'autoTheft', 'autoTheftPerPop',
    'arsons', 'arsonsPerPop',
    'violentPerPop', 'nonViolPerPop',
]

# Identifier and bookkeeping columns that are kept out of the predictor set.
non_predictive_features = [
    'communityname',
    'state',
    'countyCode',
    'communityCode',
    'fold',
]

In [19]:
# Load the un-normalized Communities & Crime table.
df = pd.read_csv('../datasets/UCIrvineUnnormalizeddata.csv')

# Treat the literal '?' as the missing-value marker and turn it into a real NaN so
# that the isnull() checks below can count it. `np.nan` is the canonical spelling;
# the `np.NAN` alias was removed in NumPy 2.0.
# NOTE(review): read_csv(..., na_values='?') would also parse the affected columns
# as numeric, but it changes their dtypes; confirm later cells before switching.
df = df.replace('?', np.nan)

# Predictor columns: everything that is neither a candidate target nor a
# non-predictive identifier column. Column order of the file is preserved.
features = [column for column in df.columns
            if column not in goal_features and column not in non_predictive_features]
len(features)

124

In [17]:
# Number of missing entries in each candidate target column.
pd.isnull(df[goal_features]).sum(axis=0)

murders              0
murdPerPop           0
rapes              208
rapesPerPop        208
robberies            1
robbbPerPop          1
assaults            13
assaultPerPop       13
burglaries           3
burglPerPop          3
larcenies            3
larcPerPop           3
autoTheft            3
autoTheftPerPop      3
arsons              91
arsonsPerPop        91
violentPerPop      221
nonViolPerPop       97
dtype: int64

In [18]:
# Number of missing entries in each predictor column.
pd.isnull(df[features]).sum(axis=0)

pop                     0
perHoush                0
pctBlack                0
pctWhite                0
pctAsian                0
pctHisp                 0
pct12-21                0
pct12-29                0
pct16-24                0
pct65up                 0
persUrban               0
pctUrban                0
medIncome               0
pctWwage                0
pctWfarm                0
pctWdiv                 0
pctWsocsec              0
pctPubAsst              0
pctRetire               0
medFamIncome            0
perCapInc               0
whitePerCap             0
blackPerCap             0
NAperCap                0
asianPerCap             0
otherPerCap             1
hispPerCap              0
persPoverty             0
pctPoverty              0
pctLowEdu               0
                     ... 
pctBornStateResid       0
pctSameHouse-5          0
pctSameCounty-5         0
pctSameState-5          0
numPolice            1872
policePerPop         1872
policeField          1872
policeFieldP