# Sanfran RandomForest

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
weekdays = {'Monday':0., 'Tuesday':1., 'Wednesday':2., 'Thursday': 3., 'Friday':4., 'Saturday':5., 'Sunday':6.}
categories = {c:i for i,c in enumerate(train['Category'].unique())}
cat_rev = {i:c for i,c in enumerate(train['Category'].unique())}
districts = {c:i for i,c in enumerate(train['PdDistrict'].unique())}
dis_rev = {i:c for i,c in enumerate(train['PdDistrict'].unique())}
print("get datasets")

get datasets


## Extract features from given information

In [4]:
# Extract features from given information
train['Hour'] = list(map(lambda x: float(int(x.split(' ')[1].split(':')[0])),train.Dates))
test['Hour'] = list(map(lambda x: float(int(x.split(' ')[1].split(':')[0])),test.Dates))

train['Minute'] = list(map(lambda x: float(int(x.split(' ')[1].split(':')[1])),train.Dates))
test['Minute'] = list(map(lambda x: float(int(x.split(' ')[1].split(':')[1])),test.Dates))

train['Month'] = list(map(lambda x: float(x.split(' ')[0].split('-')[1]), train.Dates))
test['Month'] = list(map(lambda x: float(x.split(' ')[0].split('-')[1]), test.Dates))

train['Year'] = list(map(lambda x: float(x.split(' ')[0].split('-')[0])-2003., train.Dates))
test['Year'] = list(map(lambda x: float(x.split(' ')[0].split('-')[0])-2003., test.Dates))

train['Day'] = list(map(lambda x: float(x.split(' ')[0].split('-')[2]), train.Dates))
test['Day'] = list(map(lambda x: float(x.split(' ')[0].split('-')[2]), test.Dates))

train['Day_Num'] = [float(weekdays[w]) for w in train.DayOfWeek]
test['Day_Num'] = [float(weekdays[w]) for w in test.DayOfWeek]

train['District_Num'] = [float(districts[t]) for t in train.PdDistrict]
test['District_Num'] = [float(districts[t]) for t in test.PdDistrict]

train['Category_Num'] = [float(categories[t]) for t in train.Category]

train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Hour,Minute,Month,Year,Day,Day_Num,District_Num,Category_Num
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23.0,53.0,5.0,12.0,13.0,2.0,0.0,0.0
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,23.0,53.0,5.0,12.0,13.0,2.0,0.0,1.0
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,23.0,33.0,5.0,12.0,13.0,2.0,0.0,1.0
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,23.0,30.0,5.0,12.0,13.0,2.0,0.0,2.0
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,23.0,30.0,5.0,12.0,13.0,2.0,1.0,2.0


In [5]:
train['X'] = preprocessing.scale(list(map(lambda x: x+122.4194, train.X)))
train['Y'] = preprocessing.scale(list(map(lambda x: x-37.7749, train.Y)))

test['X'] = preprocessing.scale(list(map(lambda x: x+122.4194, test.X)))
test['Y'] = preprocessing.scale(list(map(lambda x: x-37.7749, test.Y)))
train.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Hour,Minute,Month,Year,Day,Day_Num,District_Num,Category_Num
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-0.107902,0.007832,23.0,53.0,5.0,12.0,13.0,2.0,0.0,0.0
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-0.107902,0.007832,23.0,53.0,5.0,12.0,13.0,2.0,0.0,1.0
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-0.057541,0.064335,23.0,33.0,5.0,12.0,13.0,2.0,0.0,1.0
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-0.144262,0.065338,23.0,30.0,5.0,12.0,13.0,2.0,0.0,2.0
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-0.531112,0.00114,23.0,30.0,5.0,12.0,13.0,2.0,1.0,2.0


## Assign binary value to address by type

In [6]:
# Assign binary value to address by type
def define_address(addr):
    addr_type = 0.
    # Address types:
    # Intersection: 1
    # Residence: 0
    if '/' in addr and 'of' not in addr:
        addr_type = 1.
    else:
        add_type = 0.
    return addr_type

# Define address feature
train['Address_Num'] = list(map(define_address, train.Address))
test['Address_Num'] = list(map(define_address, test.Address))

## Feature selection

In [7]:
X_loc = ['X', 'Y', 'District_Num', 'Address_Num']
X_time = ['Minute', 'Hour']
X_date = ['Year','Month', 'Day', 'Day_Num']
X_all = X_loc + X_time + X_date

In [8]:
# Category column we want to predict
y = 'Category_Num'
print(train.head())

                 Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address         X  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST -0.107902   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST -0.107902   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST -0.057541   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST -0.144262   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST -0.531112   

          Y  Hour  Minute  Month  Year   Day  Day_Num  District_Num  \
0  0.007832  23

In [9]:
# Create random forest classifie
clf = RandomForestClassifier(max_features="log2", max_depth=11, n_estimators=24,
                             min_samples_split=1000, oob_score=True)
# Fit prediction
clf.fit(train[X_all], train[y])
pred = clf.predict_proba(test[X_all])

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


## Submission

In [10]:
# Create submission
submission = pd.DataFrame({cat_rev[p] : [pred[i][p] for i in range(len(pred))] for p in range(len(pred[0]))})
submission['Id'] = [i for i in range(len(submission))]
submission = submission[['Id'] + sorted(train['Category'].unique())]
print(submission.head())

# Write submission
submission.to_csv('submission1.csv.gz', index=False, compression='gzip')

   Id     ARSON   ASSAULT  BAD CHECKS   BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0  0.003865  0.152396    0.000014  0.001000  0.032813            0.002480   
1   1  0.001817  0.060330    0.000000  0.000181  0.001664            0.003039   
2   2  0.002838  0.100090    0.000161  0.000242  0.075124            0.002467   
3   3  0.003512  0.150962    0.000026  0.000675  0.030094            0.003548   
4   4  0.003512  0.150962    0.000026  0.000675  0.030094            0.003548   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS     ...       \
0                     0.004252       0.048995     0.005421     ...        
1                     0.009365       0.073277     0.003159     ...        
2                     0.002407       0.029234     0.005886     ...        
3                     0.003973       0.046315     0.007132     ...        
4                     0.003973       0.046315     0.007132     ...        

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY   SUICIDE  SUSPI