# GBDT

In [59]:
import pandas as pd
import numpy as np
import datetime
import time
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV  
from sklearn.ensemble import GradientBoostingClassifier

def parse_time(x):
    DD = datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S")
#     year = DD.year-2002
    time = DD.hour
    day = DD.day
    month = DD.month
    return time,day,month

def odds(x):
    if x == 1.0:
        return 1.0
    else:
        return np.log(x)-np.log(1-x)
def update_odds(x, default_logodds, oddUpdate):
    defaut = default_logodds
    val = oddUpdate[x]
    if len(val)!=1:
        defaut[val.keys()] = val
    return pd.Series(defaut)

rough_category = dict()
rough_category['VEHICLE THEFT'] = 15
rough_category['VANDALISM'] = 0
rough_category['DRIVING UNDER THE INFLUENCE'] = 15
rough_category['ARSON'] = 19
rough_category['BRIBERY'] = 8
rough_category['SUICIDE'] = 9
rough_category['SEX OFFENSES NON FORCIBLE'] = 0
rough_category['EXTORTION'] = 20
rough_category['GAMBLING'] = 0
rough_category['BAD CHECKS'] = 23
rough_category['TREA'] = 0
rough_category['RECOVERED VEHICLE'] = 19
rough_category['PORNOGRAPHY/OBSCENE MAT'] = 0
rough_category['WARRANTS'] = 1
rough_category['OTHER OFFENSES'] = 1
rough_category['LARCENY/THEFT'] = 1
rough_category['NON-CRIMINAL'] = 1
rough_category['ROBBERY'] = 1
rough_category['ASSAULT'] = 1
rough_category['WEAPON LAWS'] = 11
rough_category['DRUNKENNESS'] = 14
rough_category['TRESPASS'] = 1
rough_category['LOITERING'] = 10
rough_category['BURGLARY'] = 2
rough_category['SECONDARY CODES'] = 16
rough_category['MISSING PERSON'] = 12
rough_category['RUNAWAY'] = 13
rough_category['FAMILY OFFENSES'] = 22
rough_category['LIQUOR LAWS'] = 14
rough_category['DISORDERLY CONDUCT'] = 21
rough_category['SUSPICIOUS OCC'] = 17
rough_category['KIDNAPPING'] = 16
rough_category['SEX OFFENSES FORCIBLE'] = 17
rough_category['EMBEZZLEMENT'] = 18
rough_category['DRUG/NARCOTIC'] = 3
rough_category['PROSTITUTION'] = 4
rough_category['FORGERY/COUNTERFEITING'] = 5
rough_category['FRAUD'] = 6
rough_category['STOLEN PROPERTY'] = 7

def experiment(file_name):
    df = pd.read_csv(file_name) 
    to_learn = pd.DataFrame()
    to_learn['Category'] = df.Category.apply(lambda item: rough_category[item])
    df['NewCategory'] = to_learn['Category']
    to_learn['Hour'], to_learn['Day'], to_learn['Month'] = zip(*df.Dates.apply(parse_time))
    to_learn['X'], to_learn['Y'] = df.X, df.Y
    SFPD = df.PdDistrict.unique()
    PD_map, label = dict(), 1
    for name in SFPD:
        PD_map[name] = label
        label += 1
    to_learn['PD'] = df.PdDistrict.apply(lambda item: PD_map[item])

    #to_learn.to_csv('cat_feature_' + str(file_no) + '.csv')
    #to_learn = pd.read_csv('cat_feature.csv')
    to_learn.X, to_learn.Y = 10 * scale(to_learn.X), 10 * scale(to_learn.Y)
    
    
    addresses = sorted(df["Address"].unique())
    categories = sorted(df["NewCategory"].unique())
    C_counts = df.groupby(["NewCategory"]).size()
    logoddsPA2 = dict((df.groupby('Address').size()/len(df)).apply(odds))
    default_logodds = np.log(C_counts/len(df))-np.log(1.0-C_counts/float(len(df)))
    oddUpdate = pd.Series(((df.groupby(['Address','NewCategory']).size()/df.groupby(['Address']).size()).apply(odds)))
    logodds2 = {k:update_odds(k, default_logodds, oddUpdate) for k in addresses}
    address_features=df["Address"].apply(lambda x: logodds2[x])
    address_features.columns=["logodds"+str(x) for x in range(len(address_features.columns))]

    to_learn = pd.concat([to_learn, address_features], axis=1)
    
    col = [col for col in to_learn.columns if col not in ['Category']]
    category, features = to_learn.Category.as_matrix(), to_learn[col].as_matrix()

    new_PCA=PCA(n_components=15)
    new_PCA.fit(features)
    return new_PCA.explained_variance_ratio_
#     new_PCA=PCA(n_components=8)
#     features = new_PCA.fit_transform(features)
#     print(file_name)
#     average = []
#     features_train, features_test, category_train, category_test = train_test_split(features, category, test_size=0.2)
#     model      = GradientBoostingClassifier(max_features='log2',n_estimators = 100)
#     grid_model = model.fit(features_train, category_train)
#     prediction = grid_model.predict(features_test)
#     print accuracy_score(category_test, prediction)
#     average.append(accuracy_score(category_test, prediction))
#     return average

In [10]:
All_average = []
for i in range(2003, 2015):
    All_average.append(experiment(str(i) + '.csv'))
print 'Average for 2003-2014:' + str(np.average(All_average))

2003.csv
0.603003856302
0.603815709357
0.607401393681
0.602124348826
0.606048305257
0.608280901157
0.610445842636
0.600568297138
0.60455990799
0.606927812733
Average:      0.605317637508
2004.csv
0.588968335036
0.586789240722
0.584678243105
0.584473953013
0.586176370446
0.58658495063
0.588764044944
0.58937691522
0.587538304392
0.591828396323
Average:      0.587517875383
2005.csv
0.576080813789
0.570570782707
0.576787228031
0.574526702458
0.580319299237
0.578765187906
0.580319299237
0.585264198926
0.581661486296
0.573396439672
Average:      0.577769143826
2006.csv
0.623444428551
0.609712487484
0.616149334859
0.619153196968
0.617508224861
0.615791732227
0.617436704334
0.612859390645
0.617150622229
0.620583607495
Average:      0.616978972965
2007.csv
0.621921634933
0.618098948761
0.616555171653
0.623171359259
0.618245975153
0.616628684849
0.617510843196
0.619201646696
0.618245975153
0.619642725869
Average:      0.618922296552
2008.csv
0.629711435696
0.626433915212
0.626790167439
0.6252226

In [33]:
All_average = []
for i in range(2003, 2015):
    All_average.append(experiment(str(i) + '.csv'))
print 'Average for 2003-2014:'.ljust(20) + str(np.average(All_average))

2003.csv
0.599959407347
Average:            0.599959407347
2004.csv
0.574736125298
Average:            0.574736125298
2005.csv
0.559056230574
Average:            0.559056230574
2006.csv
0.607495351166
Average:            0.607495351166
2007.csv
0.614643828567
Average:            0.614643828567
2008.csv
0.612896330602
Average:            0.612896330602
2009.csv
0.613333333333
Average:            0.613333333333
2010.csv
0.618829363589
Average:            0.618829363589
2011.csv
0.643350345242
Average:            0.643350345242
2012.csv
0.648010036942
Average:            0.648010036942
2013.csv
0.679076841688
Average:            0.679076841688
2014.csv
0.684164771967
Average:            0.684164771967
Average for 2003-2014:0.621295997193


In [37]:
All_average = []
for i in range(2003, 2015):
    All_average.append(experiment(str(i) + '.csv'))
print 'Average for 2003-2014:'.ljust(20) + str(np.average(All_average))
# 15

2003.csv
0.585278397943
2004.csv
0.570241743275
2005.csv
0.557714043515
2006.csv
0.602560434845
2007.csv
0.598617951922
2008.csv
0.600356252227
2009.csv
0.610217391304
2010.csv
0.61642497558
2011.csv
0.639072350645
2012.csv
0.635603262006
2013.csv
0.667901071287
2014.csv
0.679884980607
Average for 2003-2014:0.613656071263


In [39]:
All_average = []
for i in range(2003, 2015):
    All_average.append(experiment(str(i) + '.csv'))
print 'Average for 2003-2014:'.ljust(20) + str(np.average(All_average))
# 16

2003.csv
0.593261619647
2004.csv
0.573987061628
2005.csv
0.553616840916
2006.csv
0.59612358747
2007.csv
0.596706608836
2008.csv
0.607837548985
2009.csv
0.606666666667
2010.csv
0.614546547449
2011.csv
0.635544881417
2012.csv
0.637624590507
2013.csv
0.662809152229
2014.csv
0.681222415407
Average for 2003-2014:0.613328960096


In [53]:
#24
All_average = []
for i in range(2003, 2015):
    All_average.append(experiment(str(i) + '.csv'))
print 'Average for 2003-2014:'.ljust(20) + str(np.average(All_average))

2003.csv
0.585481361207
2004.csv
0.56758597208
2005.csv
0.547753602713
2006.csv
0.594264053783
2007.csv
0.60486657355
2008.csv
0.606056287852
2009.csv
0.599492753623
2010.csv
0.606206326546
2011.csv
0.630066046232
2012.csv
0.634139541368
2013.csv
0.664859145616
2014.csv
0.673532165307
Average for 2003-2014:0.609525319156


In [None]:
from matplotlib import pyplot as plt
plt.figure(1)
for i in range(2003, 2015):
    plt.subplot(3, 4, i - 2002)
    plt.plot(experiment(str(i) + '.csv'))
plt.show()

# Try All Category

In [22]:
import pandas as pd
import numpy as np
import datetime
import time
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV  
from sklearn.ensemble import GradientBoostingClassifier

def parse_time(x):
    DD = datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S")
#     year = DD.year-2002
    time = DD.hour
    day = DD.day
    month = DD.month
    return time,day,month

def odds(x):
    if x == 1.0:
        return 1.0
    else:
        return np.log(x)-np.log(1-x)
def update_odds(x, default_logodds, oddUpdate):
    defaut = default_logodds
    val = oddUpdate[x]
    if len(val)!=1:
        defaut[val.keys()] = val
    return pd.Series(defaut)

rough_category = dict()
rough_category['VEHICLE THEFT'] = 0
rough_category['VANDALISM'] = 1
rough_category['DRIVING UNDER THE INFLUENCE'] = 0
rough_category['ARSON'] = 2
rough_category['BRIBERY'] = 3
rough_category['SUICIDE'] = 4
rough_category['SEX OFFENSES NON FORCIBLE'] = 5
rough_category['EXTORTION'] = 6
rough_category['GAMBLING'] = 7
rough_category['BAD CHECKS'] = 8
rough_category['TREA'] = 9
rough_category['RECOVERED VEHICLE'] = 10
rough_category['PORNOGRAPHY/OBSCENE MAT'] = 11
rough_category['WARRANTS'] = 12
rough_category['OTHER OFFENSES'] = 13
rough_category['LARCENY/THEFT'] = 14
rough_category['NON-CRIMINAL'] = 15
rough_category['ROBBERY'] = 16
rough_category['ASSAULT'] = 17
rough_category['WEAPON LAWS'] = 18
rough_category['DRUNKENNESS'] = 19
rough_category['TRESPASS'] = 9
rough_category['LOITERING'] = 20
rough_category['BURGLARY'] = 21
rough_category['SECONDARY CODES'] = 22
rough_category['MISSING PERSON'] = 23
rough_category['RUNAWAY'] = 24
rough_category['FAMILY OFFENSES'] = 25
rough_category['LIQUOR LAWS'] = 19
rough_category['DISORDERLY CONDUCT'] = 26
rough_category['SUSPICIOUS OCC'] = 27
rough_category['KIDNAPPING'] = 28
rough_category['SEX OFFENSES FORCIBLE'] = 29
rough_category['EMBEZZLEMENT'] = 30
rough_category['DRUG/NARCOTIC'] = 31
rough_category['PROSTITUTION'] = 32
rough_category['FORGERY/COUNTERFEITING'] = 33
rough_category['FRAUD'] = 34
rough_category['STOLEN PROPERTY'] = 35

def experiment(file_name):
    df = pd.read_csv(file_name) 
    to_learn = pd.DataFrame()
    to_learn['Category'] = df.Category.apply(lambda item: rough_category[item])
    df['NewCategory'] = to_learn['Category']
    to_learn['Hour'], to_learn['Day'], to_learn['Month'] = zip(*df.Dates.apply(parse_time))
    to_learn['X'], to_learn['Y'] = df.X, df.Y
    SFPD = df.PdDistrict.unique()
    PD_map, label = dict(), 1
    for name in SFPD:
        PD_map[name] = label
        label += 1
    to_learn['PD'] = df.PdDistrict.apply(lambda item: PD_map[item])

    #to_learn.to_csv('cat_feature_' + str(file_no) + '.csv')
    #to_learn = pd.read_csv('cat_feature.csv')
    to_learn.X, to_learn.Y = 10 * scale(to_learn.X), 10 * scale(to_learn.Y)
    
    
    addresses = sorted(df["Address"].unique())
    categories = sorted(df["NewCategory"].unique())
    C_counts = df.groupby(["NewCategory"]).size()
    logoddsPA2 = dict((df.groupby('Address').size()/len(df)).apply(odds))
    default_logodds = np.log(C_counts/len(df))-np.log(1.0-C_counts/float(len(df)))
    oddUpdate = pd.Series(((df.groupby(['Address','NewCategory']).size()/df.groupby(['Address']).size()).apply(odds)))
    logodds2 = {k:update_odds(k, default_logodds, oddUpdate) for k in addresses}
    address_features=df["Address"].apply(lambda x: logodds2[x])
    address_features.columns=["logodds"+str(x) for x in range(len(address_features.columns))]

    to_learn = pd.concat([to_learn, address_features], axis=1)
    
    col = [col for col in to_learn.columns if col not in ['Category']]
    category, features = to_learn.Category.as_matrix(), to_learn[col].as_matrix()

#     new_PCA=PCA(n_components=15)
#     new_PCA.fit(features)
#     return new_PCA.explained_variance_ratio_
    new_PCA=PCA(n_components=7)
    features = new_PCA.fit_transform(features)
    print(file_name)
    average = []
#     for i in range(0,10):
#         features_train, features_test, category_train, category_test = train_test_split(features, category, test_size=0.2)
#         model      = GradientBoostingClassifier(max_features='log2',n_estimators = 40)
#         grid_model = model.fit(features_train, category_train)
#         prediction = grid_model.predict(features_test)
#         print accuracy_score(category_test, prediction)
#         average.append(accuracy_score(category_test, prediction))
    features_train, features_test, category_train, category_test = train_test_split(features, category, test_size=0.1)
    model      = GradientBoostingClassifier(max_features='log2',n_estimators = 20)
    grid_model = model.fit(features_train, category_train)
    prediction = grid_model.predict(features_test)
    print accuracy_score(category_test, prediction)
    average.append(accuracy_score(category_test, prediction))
    return average