In [22]:
import json as js
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import datetime

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.learning_curve import validation_curve
from sklearn import grid_search
from sklearn import preprocessing
from sklearn import metrics

In [41]:
# Load the training and submission CSV files into DataFrames and report
# the (rows, columns) shape of each one.
print("Reading data...")
train_file, test_file = './train.csv', './test.csv'
train_df, test_df = pd.read_csv(train_file), pd.read_csv(test_file)
print(train_df.shape)
print(test_df.shape)

Reading data...
(878049, 9)
(884262, 7)


In [42]:
# Discard training rows whose category is considered too unlikely to be
# worth modelling; a single membership mask replaces two chained filters.
unlikely_categories = ['TREA', 'PORNOGRAPHY/OBSCENE MAT']
train_df = train_df[~train_df['Category'].isin(unlikely_categories)]

In [43]:
# Pull out desired number of training instances.
#
# Changes from the original cell:
#   * The row positions are drawn from a dedicated, seeded RandomState so the
#     same subsample is selected on every run (the later train/test split is
#     already seeded with random_state=0; an unseeded subsample made the whole
#     notebook non-reproducible anyway).
#   * The request is capped at the number of available rows, because drawing
#     without replacement raises when more rows are requested than exist.
print(len(train_df))
num_inputs = 100000
sample_size = min(num_inputs, len(train_df.index))
rand_input_vec = np.random.RandomState(0).choice(len(train_df.index), sample_size, replace=False)
train_df = train_df.iloc[rand_input_vec, :]

878021


In [44]:
# Create Date/Time Base Features
#
# Adds Year / Month / Day / YearDay / YearWeek columns to both train_df and
# test_df, then extracts the raw feature matrices:
#   x        - training features (mixed numeric / string columns)
#   y        - training target, the 'Category' column kept as a single column
#   x_submit - submission features, same column order as x
#
# The 'Dates' column is parsed once per frame; the original re-parsed the
# same strings for every derived feature (five times per frame).
print("Creating features")
date_str = '%Y-%m-%d %H:%M:%S'

# (new column name, Timestamp attribute it is read from)
date_parts = [
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('YearDay', 'dayofyear'),
    ('YearWeek', 'week'),
]
# Single definition of the feature layout so x and x_submit cannot drift apart.
feature_cols = ['Year', 'Month', 'Day', 'YearDay', 'YearWeek', 'DayOfWeek', 'PdDistrict', 'X', 'Y']

for frame in (train_df, test_df):
    parsed_dates = pd.to_datetime(frame['Dates'], format=date_str)
    for col_name, ts_attr in date_parts:
        # ts_attr is bound as a default argument so each lambda reads its own attribute.
        frame[col_name] = parsed_dates.apply(lambda ts, ts_attr=ts_attr: getattr(ts, ts_attr))

x = train_df[feature_cols].values
y = train_df[['Category']].values

x_submit = test_df[feature_cols].values

Creating features


In [45]:
# Integer-encode the two string-valued feature columns (position 5 is
# DayOfWeek, position 6 is PdDistrict). Each encoder is fitted on the training
# matrix and then applied unchanged to the submission matrix.
print("Encoding Categorical Data")
dow_col, pdd_col = 5, 6

LE_DoW = preprocessing.LabelEncoder().fit(x[:, dow_col])
x_DoW = np.array(LE_DoW.transform(x[:, dow_col]))
x_submit_DoW = np.array(LE_DoW.transform(x_submit[:, dow_col]))

LE_PdD = preprocessing.LabelEncoder().fit(x[:, pdd_col])
x_PdD = np.array(LE_PdD.transform(x[:, pdd_col]))
x_submit_PdD = np.array(LE_PdD.transform(x_submit[:, pdd_col]))

Encoding Categorical Data


In [46]:
# Rebuild both feature matrices with the string columns (positions 5 and 6)
# swapped for their integer codes; every other column is carried over as-is.
leading_cols = range(5)  # Year, Month, Day, YearDay, YearWeek
x_LE = np.column_stack(
    [x[:, col] for col in leading_cols] + [x_DoW, x_PdD, x[:, 7], x[:, 8]]
)
x_submit_LE = np.column_stack(
    [x_submit[:, col] for col in leading_cols]
    + [x_submit_DoW, x_submit_PdD, x_submit[:, 7], x_submit[:, 8]]
)

In [47]:
# One-hot encode the categorical columns (0: Year, 1: Month, 2: Day,
# 5: DayOfWeek, 6: PdDistrict) of both label-encoded matrices.
#
# The encoder is fitted on the training matrix only and the SAME fitted
# encoder transforms the submission matrix. The original fitted a second,
# independent encoder on the submission data; with n_values='auto' each
# encoder derives its output columns from the values it saw during fit, so the
# two matrices could end up with different widths or with identically
# positioned columns meaning different categories. The scaler and classifier
# downstream are fitted on the training layout, so the submission matrix must
# share it.
x_OHE = preprocessing.OneHotEncoder(n_values='auto', categorical_features=[0, 1, 2, 5, 6], sparse=False)
x_OHE.fit(x_LE)
# Name kept for any later cell that refers to it; it is deliberately the same
# fitted object as x_OHE rather than a separately fitted encoder.
x_submit_OHE = x_OHE
x_feat = x_OHE.transform(x_LE)
x_submit_feat = x_submit_OHE.transform(x_submit_LE)

In [48]:
# Turn the crime-category strings into integer class ids. feature_set keeps
# the class names in encoder order, so feature_set[i] is the name of class i.
y_LE = preprocessing.LabelEncoder().fit(y.ravel())
y_feat = y_LE.transform(y)
feature_set = list(y_LE.classes_)
print(feature_set)
print(len(feature_set))

['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']
37


In [49]:
# Hold out 20% of the encoded samples for evaluation; the fixed random_state
# makes the partition repeatable.
split_parts = train_test_split(
    x_feat,
    y_feat,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [50]:
# Normalize Data
# ------------------------------
# The scaler is fitted on the training split only; that same fitted
# transformation is then applied to the training, held-out and submission
# matrices.
print("Scaling Data")
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
X_submit = scaler.transform(x_submit_feat)
for scaled_matrix in (X_train, X_test, X_submit):
    print(scaled_matrix.shape)

Scaling Data
(80000, 77)
(20000, 77)
(884262, 77)


In [51]:
# Progress marker emitted before the model-fitting cells run.
print("Training Classifier")

Training Classifier


In [53]:
# Boosted classifier
# ------------------------------
# Create boosted decision tree
# dt = DecisionTreeClassifier()
# dt = KNeighborsClassifier()
# clf = AdaBoostClassifier(n_estimators=10, base_estimator=dt, learning_rate=1)
# clf = AdaBoostClassifier(n_estimators=25)
# clf.fit(X_train, y_train.ravel())
# y_pred = clf.predict(X_test)
# print(metrics.accuracy_score( y_test, y_pred))

In [58]:
# SGDClassifier
# ------------------------------
# Logistic-loss linear model trained with SGD, evaluated by multiclass log
# loss on the held-out split.
#
# Changes from the original cell:
#   * fit() is called exactly once. The original chained .fit() onto the
#     constructor and then called clf.fit() again with identical arguments;
#     warm_start is not enabled, so the second call simply trained from
#     scratch again and doubled the training time for no benefit.
#   * random_state is fixed (matching the seeded train/test split) because
#     shuffle=True otherwise makes the reported loss differ from run to run.
clf = SGDClassifier(loss="log", shuffle=True, class_weight="balanced", random_state=0)
clf.fit(X_train, y_train.ravel())
y_pred = clf.predict_proba(X_test)
print(metrics.log_loss(y_test, y_pred))

3.68728140794


In [59]:
# Grid Search on SGDClassifier
# ------------------------------
# svr = SGDClassifier(loss="log", class_weight="balanced")
# svr = SGDClassifier(loss="hinge")
# params = {'alpha': 10.0**-np.arange(1,7)} 
# clf = grid_search.GridSearchCV(svr, params, refit=True).fit(X_train, y_train.ravel())

# y_pred = clf.predict_proba(X_test)
# print (metrics.log_loss(y_test, y_pred))

In [60]:
# Plot Predicted Categories
# ------------------------------
# print "Plotting results"
# results = sns.countplot(y_test.flatten())
# results = sns.countplot(y_pred)
# plt.show()

In [61]:
# Plot learning curve
# ------------------------------
# X_train, y_train, X_test, y_test = split(X, y)
# n_samples = X_train.shape[0]
# range_array = [40, 400, 4000]
# train_scores, test_scores = [], []
# for n in range_array:
#     print "for flag" + str(n)
#     clf.fit(X_train[:n], y_train[:n])
#     y_train_pred = clf.predict(X_train[:n])
#     y_test_pred = clf.predict(X_test)
#     train_scores.append(metrics.accuracy_score(y_train_pred, y_train[:n]))
#     test_scores.append(metrics.accuracy_score(y_test_pred, y_test))
# plt.plot(range_array, train_scores)
# plt.plot(range_array, test_scores)
# plt.show()

In [65]:
# Create output data
# ------------------------------
# Writes one row per submission sample: an 'Id' column followed by one
# probability column per category, in the order given by header_sub.
#
# The original built an unlabeled probability frame, appended zero columns for
# set(header_sub) - set(feature_set) (which also contains 'Id', and has no
# defined order) at the END of the frame, and then renamed all columns
# positionally with header=header_sub. As a result the ARSON probabilities
# were written under 'Id', ASSAULT under 'ARSON', and so on, while the zero
# padding landed under the last three category names. Here every column is
# labeled by name first and only then arranged in header order.
header_sub = ['Id','ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT','DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION','FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT','LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES','PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY','SECONDARY CODES','SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY','SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM','VEHICLE THEFT','WARRANTS','WEAPON LAWS']
id_label, category_columns = header_sub[0], header_sub[1:]

current_time = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d_%H%M%S')
file_name = "submissions/submission_" + str(current_time)
y_submit = clf.predict_proba(X_submit)

# clf.classes_ lists the encoded labels the model was trained on, in the same
# order as the predict_proba columns; feature_set[i] is the name of label i.
# Going through clf.classes_ keeps the labels right even if some category was
# absent from the training split.
trained_categories = [feature_set[int(label)] for label in clf.classes_]
submit_df = pd.DataFrame(y_submit, columns=trained_categories)

# Categories the model cannot predict get an all-zero probability column.
# 'Id' is excluded on purpose: it is the row index, not a category.
missing_categories = [name for name in category_columns if name not in trained_categories]
for feat in missing_categories:
    submit_df[feat] = pd.Series(np.zeros(len(y_submit)), index=submit_df.index)

# Arrange the columns exactly as the header lists them.
submit_df = submit_df[category_columns]

# The row index is written as the first column under the 'Id' header.
submit_df.to_csv(file_name, index=True, index_label=id_label, sep=',', float_format='%.6f')