# Apply XGboost

In this notebook, we're going to train our model to use XGBoost. All hyperparameters are inspired by [Sergey Lebedev's solution](https://www.kaggle.com/sergeylebedev/sf-crime/initial-benchmark-need-tuning/run/163881).

In [1]:
import numpy as np
import pandas as pd

## Load datasets

In [2]:
data = pd.read_csv("../data/train.csv", parse_dates=["Dates"])

data.drop("Resolution", axis=1, inplace=True)
data.drop("Descript", axis=1, inplace=True)

print(data.shape)
data.head(3)

(878049, 7)


Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414


## Feature Engineering

### DayOfWeek

In [3]:
print(data["DayOfWeek"].unique())

day_of_week_dataframe = pd.get_dummies(data["DayOfWeek"], prefix="DayOfWeek").astype(np.bool)

data = pd.concat([data, day_of_week_dataframe], axis=1)
data.drop("DayOfWeek", axis=1, inplace=True)

print(data.shape)
data.head(3)

['Wednesday' 'Tuesday' 'Monday' 'Sunday' 'Saturday' 'Friday' 'Thursday']
(878049, 13)


Unnamed: 0,Dates,Category,PdDistrict,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,2015-05-13 23:53:00,WARRANTS,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,False,True
1,2015-05-13 23:53:00,OTHER OFFENSES,NORTHERN,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,False,True
2,2015-05-13 23:33:00,OTHER OFFENSES,NORTHERN,VANNESS AV / GREENWICH ST,-122.424363,37.800414,False,False,False,False,False,False,True


### PdDistrict

In [4]:
print(data["PdDistrict"].unique())

pd_district_dataframe = pd.get_dummies(data["PdDistrict"], prefix="PdDistrict").astype(np.bool)

data = pd.concat([data, pd_district_dataframe], axis=1)
data.drop("PdDistrict", axis=1, inplace=True)

print(data.shape)
data.head(3)

['NORTHERN' 'PARK' 'INGLESIDE' 'BAYVIEW' 'RICHMOND' 'CENTRAL' 'TARAVAL'
 'TENDERLOIN' 'MISSION' 'SOUTHERN']
(878049, 22)


Unnamed: 0,Dates,Category,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,2015-05-13 23:53:00,WARRANTS,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,2015-05-13 23:53:00,OTHER OFFENSES,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,2015-05-13 23:33:00,OTHER OFFENSES,VANNESS AV / GREENWICH ST,-122.424363,37.800414,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


### Dates

In [5]:
def get_season(x):
    summer=0
    fall=0
    winter=0
    spring=0
    if (x in [5, 6, 7]):
        summer=1
    if (x in [8, 9, 10]):
        fall=1
    if (x in [11, 0, 1]):
        winter=1
    if (x in [2, 3, 4]):
        spring=1
    return summer, fall, winter, spring

In [6]:
data["Dates_year"] = data["Dates"].dt.year
data["Dates_month"] = data["Dates"].dt.month
data["Dates_day"] = data["Dates"].dt.day
data["Dates_hour"] = data["Dates"].dt.hour
data["Dates_minute"] = data["Dates"].dt.minute
data["Dates_second"] = data["Dates"].dt.second
data["Awake"] = data["Dates_hour"].apply(lambda x: 1 if (x==0 or (x>=8 and x<=23)) else 0)
data["Summer"], data["Fall"], data["Winter"], data["Spring"]=zip(*data["Dates_month"].apply(get_season))

data.drop("Dates", axis=1, inplace=True)

print(data.shape)
data.head(3)

(878049, 32)


Unnamed: 0,Category,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,Dates_month,Dates_day,Dates_hour,Dates_minute,Dates_second,Awake,Summer,Fall,Winter,Spring
0,WARRANTS,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,False,...,5,13,23,53,0,1,1,0,0,0
1,OTHER OFFENSES,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,False,...,5,13,23,53,0,1,1,0,0,0
2,OTHER OFFENSES,VANNESS AV / GREENWICH ST,-122.424363,37.800414,False,False,False,False,False,False,...,5,13,23,33,0,1,1,0,0,0


### Define a new feature named 'Address_Type' (Block/CrossRoad)

In [7]:
from sklearn.preprocessing import LabelEncoder

data["Address_Type"] = np.nan

data.loc[data["Address"].str.contains("Block of"), "Address_Type"] = "Block"
data.loc[data["Address"].str.contains("/"), "Address_Type"] = "CrossRoad"

encoder = LabelEncoder()
data["Address_Type(encode)"] = encoder.fit_transform(data["Address_Type"])

data.head(3)

Unnamed: 0,Category,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,Dates_hour,Dates_minute,Dates_second,Awake,Summer,Fall,Winter,Spring,Address_Type,Address_Type(encode)
0,WARRANTS,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,False,...,23,53,0,1,1,0,0,0,CrossRoad,1
1,OTHER OFFENSES,OAK ST / LAGUNA ST,-122.425892,37.774599,False,False,False,False,False,False,...,23,53,0,1,1,0,0,0,CrossRoad,1
2,OTHER OFFENSES,VANNESS AV / GREENWICH ST,-122.424363,37.800414,False,False,False,False,False,False,...,23,33,0,1,1,0,0,0,CrossRoad,1


### One hot encode address

In [8]:
# 누적값이 100개 이하인 경우는 'Others'로 바꾼다.
address_counts = data["Address"].value_counts()
other_index = address_counts[address_counts < 100].index
data.loc[data["Address"].isin(other_index), "Address"] = "Others"

print("The number of address types = {address}".format(address=len(data["Address"].value_counts())))
print(data.shape)
data.head()

The number of address types = 1638
(878049, 34)


Unnamed: 0,Category,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,Dates_hour,Dates_minute,Dates_second,Awake,Summer,Fall,Winter,Spring,Address_Type,Address_Type(encode)
0,WARRANTS,Others,-122.425892,37.774599,False,False,False,False,False,False,...,23,53,0,1,1,0,0,0,CrossRoad,1
1,OTHER OFFENSES,Others,-122.425892,37.774599,False,False,False,False,False,False,...,23,53,0,1,1,0,0,0,CrossRoad,1
2,OTHER OFFENSES,Others,-122.424363,37.800414,False,False,False,False,False,False,...,23,33,0,1,1,0,0,0,CrossRoad,1
3,LARCENY/THEFT,1500 Block of LOMBARD ST,-122.426995,37.800873,False,False,False,False,False,False,...,23,30,0,1,1,0,0,0,Block,0
4,LARCENY/THEFT,Others,-122.438738,37.771541,False,False,False,False,False,False,...,23,30,0,1,1,0,0,0,Block,0


In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(dtype=np.bool)

data["Address(encode)"] = label_encoder.fit_transform(data["Address"])
address = one_hot_encoder.fit_transform(data[["Address(encode)"]])

print(address.shape)
address

(878049, 1638)


<878049x1638 sparse matrix of type '<class 'numpy.bool_'>'
	with 878049 stored elements in Compressed Sparse Row format>

### Encode Category

In [10]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data["Category(encode)"] = encoder.fit_transform(data["Category"])
      
data.head(3)

Unnamed: 0,Category,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,...,Dates_second,Awake,Summer,Fall,Winter,Spring,Address_Type,Address_Type(encode),Address(encode),Category(encode)
0,WARRANTS,Others,-122.425892,37.774599,False,False,False,False,False,False,...,0,1,1,0,0,0,CrossRoad,1,1543,37
1,OTHER OFFENSES,Others,-122.425892,37.774599,False,False,False,False,False,False,...,0,1,1,0,0,0,CrossRoad,1,1543,21
2,OTHER OFFENSES,Others,-122.424363,37.800414,False,False,False,False,False,False,...,0,1,1,0,0,0,CrossRoad,1,1543,21


## Score

In [11]:
exclude_columns = ["Address", "Address_Type", "Address(encode)", "Category"]

label_name = "Category(encode)"
feature_names = data.columns.difference([label_name] + exclude_columns)

X = data[feature_names]

print(X.shape)
X.head(3)

(878049, 31)


Unnamed: 0,Address_Type(encode),Awake,Dates_day,Dates_hour,Dates_minute,Dates_month,Dates_second,Dates_year,DayOfWeek_Friday,DayOfWeek_Monday,...,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,Spring,Summer,Winter,X,Y
0,1,1,13,23,53,5,0,2015,False,False,...,False,False,False,False,False,0,1,0,-122.425892,37.774599
1,1,1,13,23,53,5,0,2015,False,False,...,False,False,False,False,False,0,1,0,-122.425892,37.774599
2,1,1,13,23,33,5,0,2015,False,False,...,False,False,False,False,False,0,1,0,-122.424363,37.800414


In [12]:
from scipy.sparse import hstack

X = hstack((X.values.astype(np.float32), address.astype(np.float32)))
X

<878049x1669 sparse matrix of type '<class 'numpy.float32'>'
	with 10302421 stored elements in COOrdinate format>

In [13]:
y = data[label_name]

print(y.shape)
y.head(3)

(878049,)


0    37
1    21
2    21
Name: Category(encode), dtype: int64

### Evaluate using XGBoost

In [14]:
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold, train_test_split, cross_val_score

random_state = 37

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=random_state)

watchlist = [(X_train, y_train), (X_valid, y_valid)]

kfold = StratifiedKFold(y_train, n_folds=5, random_state=random_state)

model = xgb.XGBClassifier(
    objective='multi:softprob',
    learning_rate=1.0,
    max_depth=6,
    max_delta_step=1,
    n_estimators=15,
)

fit_params={
    "eval_set": watchlist,
    "eval_metric": "mlogloss",
    "verbose": False,
}

%time score = cross_val_score(model, X_train, y_train, \
                              scoring='log_loss', cv=kfold, \
                              fit_params=fit_params).mean()

score = -1.0 * score
print("Use XGBClassifier. Score = {0:.6f}".format(score))


KeyboardInterrupt



## Result
  * Before = 2.490879
  * After = 2.298930