In [1]:
import pandas as pd
from pandas import Categorical
import numpy as np

from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler, Imputer
from sklearn.cross_validation import LeaveOneLabelOut, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

import time

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Columns grouped by the dtype they are cast to right after loading.
category_columns = [
    'Dataset', 'YearMonth', 'Year', 'Month', 'Week',
    'Address', 'Species', 'Block', 'Street', 'Trap',
    'AddressNumberAndStreet', 'AddressAccuracy',
    'Depart', 'Heat', 'Cool', 'CodeSum',
    'WnvPresent',
]
float32_columns = [
    'Latitude', 'Longitude', 'NumMosquitos',
    'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb',
    'Sunrise', 'Sunset', 'PrecipTotal',
    'StnPressure', 'SeaLevel',
    'ResultSpeed', 'ResultDir', 'AvgSpeed',
    'SprayIntensity',
]

# Column name -> target dtype; later cells iterate over this mapping.
data_dtypes = {'ID': 'int'}
data_dtypes.update(dict.fromkeys(category_columns, 'category'))
data_dtypes.update(dict.fromkeys(float32_columns, 'float32'))

# Load the dataset, then cast each listed column to its declared dtype.
data = pd.read_csv('../data/smote_data.csv')
for column_name in data_dtypes:
    data[column_name] = data[column_name].astype(data_dtypes[column_name])

### Impute Missing Values and Scale Numeric Features

In [3]:
# Fill in missing values for every typed column that contains at least one null.
for col in data_dtypes:
    has_missing = pd.isnull(data[col]).sum() > 0
    if not has_missing:
        continue
    # The column is handed to Imputer on its own with axis=1; the result is
    # transposed and its first column written back into the frame.
    imputed = Imputer(axis=1).fit_transform(data[col])
    data[col] = imputed.T[:,0]

In [None]:
# Standardize each float32 column independently with its own StandardScaler.
float32_cols = [col for col, datatype in data_dtypes.items() if datatype == 'float32']
for col in float32_cols:
    data[col] = StandardScaler().fit_transform(data[col])

### Encode and Binarize Categorical Features

In [None]:
# Keep an untouched copy of the frame before the categorical columns are
# replaced by their indicator columns.
data_non_binarized = data.copy()

# Every category-typed column is one-hot encoded, except the split marker,
# the target and the year.
categorical_cols = [
    col for col in data
    if str(data[col].dtype) == "category" and col not in ['Dataset','WnvPresent','Year']
]

indicator_frames = []
for col in categorical_cols:
    le = LabelEncoder()
    indicators = LabelBinarizer().fit_transform(le.fit_transform(data[col]))
    # LabelBinarizer collapses a two-class feature into a single 0/1 column
    # (1 marks the second class). Expand it back to one column per class so
    # the shape matches the column names built from le.classes_.
    if len(le.classes_) == 2 and indicators.shape[1] == 1:
        indicators = np.hstack([1 - indicators, indicators])
    indicator_frames.append(pd.DataFrame(
        data = indicators,
        columns = [col + "_" + str(feature) for feature in le.classes_],
        # Reuse the frame's index so the concat below aligns row for row.
        index = data.index
    ))

# Drop all encoded source columns and append the indicator blocks in one
# concat instead of rebuilding the whole frame on every loop iteration.
data = pd.concat([data.drop(categorical_cols, axis=1)] + indicator_frames, axis=1)

### Train/Test Split

In [None]:
# Split rows by the value of the 'Dataset' marker column.
is_train_row = data['Dataset'] == 'Train'
is_test_row = data['Dataset'] == 'Test'

train = data[is_train_row]
test = data[is_test_row]

In [None]:
# Cross-validation folds: each fold holds out all training rows of one 'Year'.
year_crossval = LeaveOneLabelOut(train['Year'])

# Model inputs: every column except identifiers, the target and the columns
# listed below.
features = [
    col for col in data if col not in [
        'Dataset','ID','NumMosquitos','WnvPresent','Date','Sunrise','Sunset']]

# Hyper-parameter grid, keyed by '<pipeline step>__<parameter>'.
# NOTE(review): 10e-5 is 1e-4 (and 10e5 is 1e6) - confirm the grid is as intended.
parameters = {
    'pca__n_components':[5,10,100,None],
    'classifier__C':[10e-5,10e-2,10e0,10e2,10e5],
    'classifier__gamma':[10e-5,10e-2,10e0,10e2,10e5]
}

# PCA followed by a support vector classifier. The original cell called an
# undefined name `SVM`; the class imported at the top of the notebook is SVC.
pipeline = Pipeline(steps=[
    ('pca',PCA()),
    ('classifier',SVC(cache_size = 2000))
])

grid_search_start = time.time()
if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block
    
    grid_search = GridSearchCV(
        pipeline,parameters,cv = year_crossval,n_jobs=-1, verbose=1, scoring='roc_auc'
    )
    grid_search.fit(train[features],train['WnvPresent'])
    best_params = grid_search.best_params_
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:",best_params)
    print("Model trained in %0.3f seconds" % (time.time() - grid_search_start))

In [None]:
# Predict on the test rows with the best pipeline found by the grid search
# and time the call.
prediction_start = time.time()
row_ids = list(range(1,len(test)+1))  # ids are 1..len(test) in row order
predicted_labels = grid_search.best_estimator_.predict(test[features])
results = pd.DataFrame({
    'Id': row_ids,
    'WnvPresent': predicted_labels
})
elapsed_seconds = time.time() - prediction_start
print("Prediction Time: %0.3f seconds" % elapsed_seconds)

# Summary statistics of the results, grouped by predicted class.
results.groupby('WnvPresent').describe()

In [None]:
#results.to_csv('../submissions/19_smote_bagging.csv',index=False)