# Year Split
This block serves as splitting the original data by year and save resulting sub datasets to .csv files. Note here data in 2015 are discarded because the sample space is too small comparing to other years.

In [4]:
import pandas as pd
import numpy as np
import datetime

def parse_time(x):
    DD = datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S")
    return DD.year

df = pd.read_csv('train.csv')
df['Year'] = df['Dates'].apply(parse_time)

years = list()
for i in range(2003, 2016):
    years.append(df[df.Year == i])
    
for i in range(2003, 2015):
    years[i-2003].to_csv(str(i) + '.csv')

# Dependencies
This block include in all dependencies, including
* External libraries like numpy and sklearn;
* Custom functions served for parsing timestamp, and calculating counting-based log odds;
* Merge orignal categories using new label set.

In [5]:
import pandas as pd
import numpy as np
import datetime
import time
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


def parse_time(x):
    DD = datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S")
    time = DD.hour
    day = DD.day
    month = DD.month
    return time,day,month

def odds(x):
    if x == 1.0:
        return 1.0
    else:
        return np.log(x)-np.log(1-x)
    
def update_odds(x, default_logodds, oddUpdate):
    defaut = default_logodds
    val = oddUpdate[x]
    if len(val)!=1:
        defaut[val.keys()] = val
    return pd.Series(defaut)

rough_category = dict()
rough_category['VEHICLE THEFT'] = 0
rough_category['VANDALISM'] = 0
rough_category['DRIVING UNDER THE INFLUENCE'] = 0
rough_category['ARSON'] = 0
rough_category['BRIBERY'] = 0
rough_category['SUICIDE'] = 0
rough_category['SEX OFFENSES NON FORCIBLE'] = 0
rough_category['EXTORTION'] = 0
rough_category['GAMBLING'] = 0
rough_category['BAD CHECKS'] = 0
rough_category['TREA'] = 0
rough_category['RECOVERED VEHICLE'] = 0
rough_category['PORNOGRAPHY/OBSCENE MAT'] = 0
rough_category['WARRANTS'] = 1
rough_category['OTHER OFFENSES'] = 1
rough_category['LARCENY/THEFT'] = 1
rough_category['NON-CRIMINAL'] = 1
rough_category['ROBBERY'] = 1
rough_category['ASSAULT'] = 1
rough_category['WEAPON LAWS'] = 1
rough_category['DRUNKENNESS'] = 1
rough_category['TRESPASS'] = 1
rough_category['LOITERING'] = 1
rough_category['BURGLARY'] = 2
rough_category['SECONDARY CODES'] = 2
rough_category['MISSING PERSON'] = 2
rough_category['RUNAWAY'] = 2
rough_category['FAMILY OFFENSES'] = 2
rough_category['LIQUOR LAWS'] = 2
rough_category['DISORDERLY CONDUCT'] = 2
rough_category['SUSPICIOUS OCC'] = 2
rough_category['KIDNAPPING'] = 2
rough_category['SEX OFFENSES FORCIBLE'] = 2
rough_category['EMBEZZLEMENT'] = 2
rough_category['DRUG/NARCOTIC'] = 3
rough_category['PROSTITUTION'] = 4
rough_category['FORGERY/COUNTERFEITING'] = 5
rough_category['FRAUD'] = 5
rough_category['STOLEN PROPERTY'] = 5

# Main Experiment Function
This block is the main function for feature extraction and machine learning. It is roughly composed of four parts:
* Extract temporal and geographical features;
* Extract count-based log odd ratio features;
* Conduct PCA on data sets for dimension reudction;
* Machine learning using random forest.

In [6]:

def experiment(file_name, csv_num):
    df = pd.read_csv(file_name) 
    to_learn = pd.DataFrame()
    to_learn['Category'] = df.Category.apply(lambda item: rough_category[item])
    df['NewCategory'] = to_learn['Category']
    to_learn['Hour'], to_learn['Day'], to_learn['Month'] = zip(*df.Dates.apply(parse_time))
    to_learn['X'], to_learn['Y'] = df.X, df.Y
    SFPD = df.PdDistrict.unique()
    PD_map, label = dict(), 1
    for name in SFPD:
        PD_map[name] = label
        label += 1
    to_learn['PD'] = df.PdDistrict.apply(lambda item: PD_map[item])
    
    addresses = sorted(df["Address"].unique())
    categories = sorted(df["NewCategory"].unique())
    C_counts = df.groupby(["NewCategory"]).size()
    logoddsPA = dict((df.groupby('Address').size()/len(df)).apply(odds))
    default_logodds = np.log(C_counts/len(df))-np.log(1.0-C_counts/float(len(df)))
    oddUpdate = pd.Series(((df.groupby(['Address','NewCategory']).size()/df.groupby(['Address']).size()).apply(odds)))
    logodds = {k:update_odds(k, default_logodds, oddUpdate) for k in addresses}
    address_features=df["Address"].apply(lambda x: logodds[x])
    address_features.columns=["logodds"+str(x) for x in range(len(address_features.columns))]

    to_learn = pd.concat([to_learn, address_features], axis=1)
    
    col = [col for col in to_learn.columns if col not in ['Category']]
    category, features = to_learn.Category.as_matrix(), to_learn[col].as_matrix()

    new_PCA=PCA(n_components=7)
    features = new_PCA.fit_transform(features)
    np.savetxt('to_learn_' + str(csv_num) + '.csv', np.hstack((np.reshape(category, (category.size, 1)), features)))
    
    arr = []
    print(file_name)
    for i in range(10):
        features_train, features_test, category_train, category_test = train_test_split(features, category, test_size=0.2)
        model = RandomForestClassifier().fit(features_train, category_train)
        prediction = model.predict(features_test)
        acc = accuracy_score(category_test, prediction)
        arr.append(acc)
        print acc
    return arr

# Experiment
The experiment itself which iterates through each year and calculate 5-fold cross validation accuracy for 10 times. After the experiment terminates, we would have a .txt file recording the experiment results, as well as year-separated .csv files for machine learing with spark.

In [3]:
arr = []
for i in range(2003, 2015):
    arr.append(experiment(str(i) + '.csv', i))
np.savetxt('sklearn_results.txt', np.array(arr), delimiter=',')

2003.csv
0.580745551722
0.585143089101
0.581692713619
0.58676679521
0.582030985725
0.578783573507
0.580948514985
0.574250727285
0.585143089101
0.578039374873
2004.csv
0.591828396323
0.58100102145
0.5873340143
0.592645556691
0.589036431733
0.583520599251
0.592168879809
0.586993530814
0.592986040177
0.586040177051
2005.csv
0.59148064425
0.587100875954
0.586041254592
0.59352924555
0.596708109635
0.589855891495
0.592328341339
0.591551285674
0.588443063012
0.59063294716
2006.csv
0.535331140037
0.532684880561
0.53726219425
0.537476755829
0.530896867401
0.529466456873
0.541839507939
0.539264768989
0.535259619511
0.529824059505
2007.csv
0.554951113725
0.550393295597
0.553407336617
0.55620083805
0.546276556642
0.559214879071
0.547452767772
0.541057119753
0.545541424686
0.555759758877
2008.csv
0.560313501959
0.557748485928
0.560313501959
0.553259707873
0.55133594585
0.560384752405
0.544068400428
0.561738510866
0.551763448522
0.549697185607
2009.csv
0.559637681159
0.554927536232
0.55115942029
0.5