In [43]:
import pandas as pd
from pandas import Categorical
import numpy as np

from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler, Imputer
from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.decomposition import PCA

import time

import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
# Target dtype for every column of ../data/smote_data.csv that is cast after
# loading. Later cells iterate over this mapping to decide which columns to
# label-encode ('category') and which to standardise ('float32').
data_dtypes = {
    # identifiers and calendar fields
    'Dataset': 'category',
    'ID': 'int',
    'YearMonth': 'category',
    'Year': 'category',
    'Month': 'category',
    'Week': 'category',
    # trap location and species
    'Address': 'category',
    'Species': 'category',
    'Block': 'category',
    'Street': 'category',
    'Trap': 'category',
    'AddressNumberAndStreet': 'category',
    'Latitude': 'float32',
    'Longitude': 'float32',
    'AddressAccuracy': 'category',
    # counts and target
    'NumMosquitos': 'float32',
    'WnvPresent': 'category',
    # weather
    'Tmax': 'float32',
    'Tmin': 'float32',
    'Tavg': 'float32',
    'Depart': 'category',
    'DewPoint': 'float32',
    'WetBulb': 'float32',
    'Heat': 'category',
    'Cool': 'category',
    'Sunrise': 'float32',
    'Sunset': 'float32',
    'CodeSum': 'category',
    'PrecipTotal': 'float32',
    'StnPressure': 'float32',
    'SeaLevel': 'float32',
    'ResultSpeed': 'float32',
    'ResultDir': 'float32',
    'AvgSpeed': 'float32',
    # spraying
    'SprayIntensity': 'float32',
}

# Load the CSV, then cast each listed column to its declared dtype.
# Columns not listed in data_dtypes keep whatever dtype read_csv inferred.
data = pd.read_csv('../data/smote_data.csv')
for column_name in data_dtypes:
    data[column_name] = data[column_name].astype(data_dtypes[column_name])

### Encode Categorical Features

In [32]:
# Integer-encode every categorical feature column with a fresh LabelEncoder.
# 'Dataset' is skipped because its 'Train'/'Test' values are used to split the
# frame, and 'WnvPresent' is skipped because it is the prediction target.
for col, datatype in data_dtypes.items():
    # Compare by value: `is` tests object identity and only matched here
    # because CPython happens to intern short string literals (Python 3.8+
    # emits a SyntaxWarning for `is` against a literal).
    if datatype == 'category' and col not in ['Dataset', 'WnvPresent']:
        data[col] = LabelEncoder().fit_transform(data[col])

### Impute Missing Values and Scale Numeric Features

In [33]:
# Fill missing values one column at a time. Only columns that actually contain
# nulls are touched.
for col in data_dtypes:
    missing_count = pd.isnull(data[col]).sum()
    if missing_count > 0:
        # The 1-D column is handed to Imputer with axis=1, so it is treated as
        # a single row and imputed along that row; the result is transposed
        # back and its first column taken to recover a 1-D array.
        # NOTE(review): relies on Imputer's default strategy (documented as
        # 'mean') - confirm that is intended for every affected column.
        filled = Imputer(axis=1).fit_transform(data[col])
        data[col] = filled.T[:, 0]

In [34]:
# Standardise every float32 column with its own StandardScaler. The scaler is
# fitted on the whole frame, i.e. before the Train/Test split below.
for col, datatype in data_dtypes.items():
    if datatype == 'float32':
        # Pass a 2-D single-column frame rather than a 1-D Series: scikit-learn
        # deprecated 1-D input to scalers and later versions reject it. The
        # (n, 1) result is flattened back to 1-D before assignment.
        data[col] = StandardScaler().fit_transform(data[[col]])[:, 0]

### Train/Test Split

In [35]:
# Partition the combined frame on the 'Dataset' marker column.
is_train_row = data['Dataset'] == 'Train'
is_test_row = data['Dataset'] == 'Test'

train = data[is_train_row]
test = data[is_test_row]

### Train bagging classifier, cross-validating on year label

In [44]:
# Cross-validation folds: each fold holds out all rows sharing one 'Year' label.
year_crossval = LeaveOneLabelOut(train['Year'])

# Every column of `data` is a model input except the ones listed here
# ('WnvPresent' is the target passed to fit() below).
excluded_columns = [
    'Dataset','ID','NumMosquitos','WnvPresent','Date','Sunrise','Sunset']
features = [col for col in data if col not in excluded_columns]

# PCA followed by a bagging ensemble; both steps use library defaults except
# for the hyper-parameters searched below.
pipeline = Pipeline(steps=[
    ('pca', PCA()),
    ('classifier', BaggingClassifier()),
])

# Search grid, addressed with the '<step name>__<parameter>' convention.
parameters = {
    'pca__n_components': [None, 5, 10],
    'classifier__n_estimators': [10, 25, 50, 100],
}

grid_search_start = time.time()
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='roc_auc',
        n_jobs=-1,
        cv=year_crossval,
        verbose=1,
    )
    grid_search.fit(train[features], train['WnvPresent'])
    best_params = grid_search.best_params_

    # Report the winning configuration and the wall-clock cost of the search.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:", best_params)
    print("Model trained in %0.3f seconds" % (time.time() - grid_search_start))

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  42 out of  48 | elapsed:  3.9min remaining:   33.2s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  4.4min finished


Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best score: 0.850
Best parameters set: {'pca__n_components': 10, 'classifier__n_estimators': 100}
Model trained in 282.432 seconds


In [45]:
# Score the test rows with the best pipeline found by the grid search and
# assemble the submission frame (1-based sequential Id, predicted label).
prediction_start = time.time()

submission_ids = list(range(1, len(test) + 1))
predicted_labels = grid_search.best_estimator_.predict(test[features])
results = pd.DataFrame({
    'Id': submission_ids,
    'WnvPresent': predicted_labels,
})

print("Prediction Time: %0.3f seconds" % (time.time() - prediction_start))

# Cell output: summary statistics of Id grouped by predicted class, which also
# shows how many rows were assigned to each class.
results.groupby('WnvPresent').describe()

Prediction Time: 2.804 seconds


Unnamed: 0_level_0,Unnamed: 1_level_0,Id
WnvPresent,Unnamed: 1_level_1,Unnamed: 2_level_1
0,count,109479.0
0,mean,58013.071484
0,std,33717.105116
0,min,1.0
0,25%,29070.5
0,50%,57808.0
0,75%,87642.5
0,max,116293.0
1,count,6814.0
1,mean,60298.799237


In [46]:
# Write the predictions as a CSV without the DataFrame index column.
submission_path = '../submissions/19_smote_bagging.csv'
results.to_csv(submission_path, index=False)