In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import pickle

In [2]:
data_filename = "crimes_2016.csv"
dataset = pd.read_csv(data_filename)

In [3]:
## Turn date column into datetime and just get the month
#dates = pd.to_datetime(dataset['Date'], format="%m/%d/%Y %H:%M:%S PM")
def convert_time(dates):
    time = []
    for i in range(dates.shape[0]):
        time.append(dates.iloc[i].split(" ")[0])
        
    return time


def get_hours(dates):
    hours = []
    for i in range(dates.shape[0]):
        h = dates.iloc[i].split(" ")
        
        hour = int(h[1].split(":")[0])
          
        hours.append(hour)
    return hours

In [4]:
dataset['Date'][4]

'2016-01-28 00:53:00'

In [5]:
dates = convert_time(dataset['Date'])
hours = get_hours(dataset['Date'])
dataset['Month'] = pd.DatetimeIndex(dataset['Date']).month
dataset['Hour'] = pd.to_numeric(hours)

In [6]:
## Drop noisy variables and redundant variables: description, updated on, beat, year 
dataset = dataset.drop('Description', 1)
dataset = dataset.drop('Beat', 1)
dataset = dataset.drop('Year', 1) ## all in 2016
dataset = dataset.drop('Date', 1) ## only using month 
dataset = dataset.drop('Location', 1) ## already have lat and long as separate fields
dataset = dataset.drop('Block', 1) 

## Drop UICR because it's not independent of what we are trying to predict
dataset = dataset.drop('IUCR', 1)
dataset = dataset.drop('FBI Code', 1) ## not independent variable

# Drop na values. We will use missing rows in other analysis
dataset = dataset.dropna(axis=0, how='any')

## Uncomment if you'd like to filter to keep only theft and assault
dataset = dataset[(dataset['Primary Type'] == 'THEFT') | (dataset['Primary Type'] == 'ASSAULT')]


In [7]:
dataset.head()

Unnamed: 0,Primary Type,Location Description,Arrest,Domestic,District,Ward,Community Area,Latitude,Longitude,Month,Hour
10,THEFT,RESIDENCE,False,True,6.0,6.0,69.0,41.755939,-87.608986,1,0
11,THEFT,GAS STATION,False,False,22.0,21.0,71.0,41.735931,-87.653642,1,1
12,THEFT,GAS STATION,False,False,3.0,20.0,68.0,41.779998,-87.629295,1,1
13,THEFT,GAS STATION,True,False,8.0,15.0,66.0,41.772201,-87.702981,1,0
14,THEFT,RESTAURANT,True,False,1.0,2.0,28.0,41.868034,-87.639215,1,2


In [8]:
X1 = dataset.iloc[:, 0:2]
X2 = dataset.iloc[:, 3:]
X = pd.concat([X1, X2], axis=1, join='inner')
X = X.values
y = dataset.iloc[:, 2].values

In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode Categorical variables 
primary_enc = LabelEncoder()
location_enc = LabelEncoder()
domestic_enc = LabelEncoder()

X[:, 0] = primary_enc.fit_transform(X[:, 0]) 
X[:, 1] = location_enc.fit_transform(X[:, 1]) 
X[:, 2] = domestic_enc.fit_transform(X[:, 2])

#Create dummy variables
onehotencoder = OneHotEncoder(categorical_features = [0, 1, 2, 3, 4, 5])
X = onehotencoder.fit_transform(X).toarray()


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

In [13]:
# Get a sense of how much variance is explained by the number of variables
def variance_explained(variances, start_printing_at):   
    result = 0
    for i in range(len(variances)):
        result += variances[i]
        if (result > start_printing_at):
            print("{} variables explain {:.2f} variance".format(i, result))


variance_explained(explained_variance, 0.9)

171 variables explain 0.90 variance
172 variables explain 0.90 variance
173 variables explain 0.91 variance
174 variables explain 0.91 variance
175 variables explain 0.91 variance
176 variables explain 0.92 variance
177 variables explain 0.92 variance
178 variables explain 0.92 variance
179 variables explain 0.93 variance
180 variables explain 0.93 variance
181 variables explain 0.93 variance
182 variables explain 0.94 variance
183 variables explain 0.94 variance
184 variables explain 0.94 variance
185 variables explain 0.95 variance
186 variables explain 0.95 variance
187 variables explain 0.95 variance
188 variables explain 0.95 variance
189 variables explain 0.96 variance
190 variables explain 0.96 variance
191 variables explain 0.96 variance
192 variables explain 0.96 variance
193 variables explain 0.96 variance
194 variables explain 0.97 variance
195 variables explain 0.97 variance
196 variables explain 0.97 variance
197 variables explain 0.97 variance
198 variables explain 0.97 v

In [14]:
# We decide to use 194 variables that explain 97% of the variance
pca = PCA(n_components = 194)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
#predict test results
#y_pred = classifier.predict(X_test)

In [None]:
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)

In [None]:
#print(cm)

In [None]:
#accuracy = (cm[0][0] + cm[1][1])/ np.sum(cm)
#print(accuracy)

In [None]:
#Create ensemble with AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

dt = DecisionTreeClassifier() 
logit = LogisticRegression()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc= SVC(kernel = 'rbf')


#Using Decision Tree
clf = AdaBoostClassifier(n_estimators=50, base_estimator=dt, learning_rate=1)
fit=clf.fit(X_train,y_train)





In [None]:
clf.score(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
#Using Logistic Regresssion
clf = AdaBoostClassifier(n_estimators=50, base_estimator=logit, learning_rate=1)
fit=clf.fit(X_train,y_train)





In [None]:
clf.score(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
#Using Naive Bayes
clf = AdaBoostClassifier(n_estimators=50, base_estimator=nb, learning_rate=1)
fit=clf.fit(X_train,y_train)





In [None]:
clf.score(X_train,y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
#Using Support Vector Machine
clf = AdaBoostClassifier(n_estimators=50, base_estimator=svc, algorithm='SAMME', learning_rate=1)
fit=clf.fit(X_train,y_train)




In [None]:
clf.score(X_train,y_train)

In [None]:
clf.score(X_test,y_test)