In [1]:
import numpy as np
import pandas as pd

# Training Data

In [2]:
train = pd.read_csv('assets/train.csv')

FileNotFoundError: File b'assets/train.csv' does not exist

In [None]:
train["Date"] = pd.to_datetime(train["Date"], infer_datetime_format=True)
train['Address'] = train['Address'].astype('category')
train['Species'] = train['Species'].astype('category')
train['Street'] = train['Street'].astype('category')
train['Trap'] = train['Trap'].astype('category')
train['AddressNumberAndStreet'] = train['AddressNumberAndStreet'].astype('category')

# Weather Data

In [3]:
weather = pd.read_csv('weather-nmo.csv', index_col=0)

FileNotFoundError: File b'weather-nmo.csv' does not exist

In [122]:
# We need to create one line per date:
station1 = weather[weather['Station']==1]
station2 = weather[weather['Station']==2]
station1 = station1.drop('Station', axis=1)
station2 = station2.drop('Station', axis=1)

In [123]:
station1.columns = ['Date', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_CodeSum',
       'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed',
       'st1_ResultDir', 'st1_AvgSpeed', 'st1_Lat', 'st1_Long']
station2.columns = ['Date', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_CodeSum',
       'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed',
       'st2_ResultDir', 'st2_AvgSpeed', 'st2_Lat', 'st2_Long']

In [124]:
weather = pd.merge(station1, station2, on='Date')
weather["Date"] = pd.to_datetime(weather["Date"], infer_datetime_format=True)

In [125]:
# Feature engineer us some over time weather data
weather = weather.set_index('Date')

In [126]:
weather['precip_avg'] = (weather['st1_PrecipTotal'] + weather['st2_PrecipTotal'])/2
weather['2wk_precip'] = weather['precip_avg'].rolling(14, min_periods=1).sum()
weather['4wk_precip'] = weather['precip_avg'].rolling(28, min_periods=1).sum()
weather['90day_precip'] = weather['precip_avg'].rolling(90, min_periods=1).sum()

In [127]:
weather['temp_avg'] = (weather['st1_Tavg'] + weather['st2_Tavg'])/2
weather['2wk_tavg'] = weather['temp_avg'].rolling(14, min_periods=1).mean()
weather['4wk_tavg'] = weather['temp_avg'].rolling(28, min_periods=1).mean()
weather['90day_tavg'] = weather['temp_avg'].rolling(90, min_periods=1).mean()

In [128]:
weather['tempmin_avg'] = (weather['st1_Tmin'] + weather['st2_Tmin'])/2
weather['2wk_mintemp'] = weather['tempmin_avg'].rolling(14, min_periods=1).min()
weather['4wk_mintemp'] = weather['tempmin_avg'].rolling(28, min_periods=1).min()

In [129]:
weather['dew_avg'] = (weather['st1_DewPoint'] + weather['st2_DewPoint'])/2
weather['2wk_dew'] = weather['dew_avg'].rolling(14, min_periods=1).mean()
weather['4wk_dew'] = weather['dew_avg'].rolling(28, min_periods=1).mean()

In [130]:
weather = weather.reset_index()
train = pd.merge(train, weather, how='left', on='Date')

# Categories

In [131]:
final_df = pd.get_dummies(train, columns=['Species', 'Trap', 'Block'])

# Time

In [132]:
final_df['Month'] = final_df['Date'].dt.month
final_df["Day"] = final_df['Date'].dt.dayofyear

# Location Info

In [133]:
#our two origins (the locations with the most WNV activity) are Chicago O'Hare and Doty Ave.
#the following values are their latitudes and longitudes
ohare_lon = -87.890615
ohare_lat = 41.974689
doty_lon =-87.599862
doty_lat=41.673408

In [134]:
lat = train.Latitude
lon = train.Longitude

In [135]:
#haversine takes two lat and longs and creates a distance, from the mean, in miles
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    mi = 3956   * c #Radius of earth in miles. Use 6367 for kilometers
    return mi, dlon, dlat

In [136]:
#apply haversine function to training dataset, creating a column called 'dist_from_ohare_MI'
final_df['dist_from_ohare_MI'] = [haversine(y, x, ohare_lon, ohare_lat)[0] for y, x in zip(lon, lat)]
#apply haversine function to training dataset, creating a column called 'dist_from_doty_MI'
final_df['dist_from_doty_MI'] = [haversine(y, x, doty_lon, doty_lat)[0] for y, x in zip(lon, lat)]

In [158]:
#test_features = final_df.drop(['WnvPresent', 'NumMosquitos', 'Date', 'Address', 'Street', 'AddressNumberAndStreet', 'st1_CodeSum', 'st2_CodeSum'], 1)
test_features = final_df[['Latitude', 'Longitude', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir', 'st1_AvgSpeed', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip', 'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg', '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'Month', 'Day', 'dist_from_ohare_MI', 'dist_from_doty_MI']]
target = final_df.WnvPresent

# Scale stuff

In [138]:
from sklearn.preprocessing import StandardScaler

In [139]:
scale = StandardScaler()

In [159]:
test_features = pd.DataFrame(scale.fit_transform(test_features), columns=test_features.columns)

In [141]:
# test_features = test_features[['Longitude',
#  'Day',
#  'Latitude',
#  'dist_from_ohare_MI',
#  '90day_tavg',
#  '90day_precip',
#  'Species_CULEX TERRITANS',
#  '4wk_dew',
#  'st2_ResultSpeed',
#  'dist_from_doty_MI',
#  'st1_Tmax',
#  '4wk_precip',
#  'Species_CULEX RESTUANS',
#  'st1_AvgSpeed',
#  '2wk_dew',
#  'Species_CULEX SALINARIUS',
#  'st1_ResultSpeed',
#  '4wk_mintemp',
#  'Species_CULEX PIPIENS',
#  '2wk_tavg',
#  'st2_PrecipTotal',
#  'Species_CULEX PIPIENS/RESTUANS',
#  '2wk_precip',
#  'tempmin_avg',
#  'st1_StnPressure',
#  'st2_ResultDir',]]

# Model Time

In [37]:
from sklearn.model_selection import train_test_split, cross_val_score

In [160]:
X_train, X_test, y_train, y_test = train_test_split(test_features, target, test_size=0.3, random_state=42)

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score

def eval_sklearn_model(y_true, predictions, model=None, X=None):
    """This function takes the true values for y and the predictions made by the model and prints out the confusion matrix along with Accuracy, Precision, and, if model and X provided, Roc_Auc Scores."""
    cnf_matrix = confusion_matrix(y_true, predictions)

    print('True Negative: ', cnf_matrix[0, 0], '| False Positive: ', cnf_matrix[0, 1])
    print('False Negative: ', cnf_matrix[1, 0], '| True Positive: ', cnf_matrix[1, 1], '\n')

    sensitivity = cnf_matrix[1, 1]/ (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    specificity = cnf_matrix[0, 0]/ (cnf_matrix[0, 1] + cnf_matrix[0, 0])

    print('Sensitivity (TP/ TP + FN): ', sensitivity)
    print('Specificity (TN/ TN + FP): ', specificity, '\n')

    print('Accuracy: ', accuracy_score(y_true, predictions, normalize=True))
    print('Precision: ', precision_score(y_true, predictions))
    if model != None:
        print('Roc-Auc: ', roc_auc_score(y_true, [x[1] for x in model.predict_proba(X)]))
    else:
        pass
    print('\n')

## XGBoost time!

In [28]:
from xgboost import XGBClassifier



In [161]:
xgb = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic', gamma=0.35, learning_rate=0.02, max_depth=3, n_estimators=1000)
# make sure to pick the correct objective for the problem
# scale_pos_weight is supposed to help with unbalanced classes; it recommended number of negative cases divided by positive
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.35, learning_rate=0.02, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=18.101298701298703, seed=0, silent=True,
       subsample=1)

In [162]:
test_predictions = xgb.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=xgb, X=X_test)

True Negative:  2329 | False Positive:  657
False Negative:  44 | True Positive:  122 

Sensitivity (TP/ TP + FN):  0.734939759036
Specificity (TN/ TN + FP):  0.779973208305 

Accuracy:  0.777601522843
Precision:  0.156611039795
Roc-Auc:  0.846658906221




In [35]:
feature_import = xgb.feature_importances_
features = X_train.columns
pd.DataFrame(columns=['feature', 'import'], data=list(zip(features, feature_import))).sort_values('import',ascending=False)

Unnamed: 0,feature,import
1,Longitude,0.169369
48,Day,0.095495
0,Latitude,0.077477
49,dist_from_ohare_MI,0.063063
33,90day_tavg,0.041441
29,90day_precip,0.041441
46,Species_CULEX TERRITANS,0.037838
39,4wk_dew,0.037838
23,st2_ResultSpeed,0.032432
50,dist_from_doty_MI,0.032432


In [28]:
from sklearn.grid_search import GridSearchCV
import time



In [30]:
start_time = time.time()

# Pick which estimators you want to test (example is for random forest)
param_grid = dict(learning_rate = [.001, .01, .02, .03],
                 n_estimators = [100, 500, 1000],
                 gamma = [0, .1, .2, .5],
                 max_depth = [2,3,4]
                 )

# Switch out the model here that you would like to test
model = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic')

grid = GridSearchCV(model, param_grid, verbose=1, cv=3, scoring='roc_auc')

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

best_model = grid.best_estimator_

best_model = best_model.fit(X_train, y_train)

score = best_model.score(X_test, y_test)

# print('Number of Models Run: ', len(n_estimators) * len(max_features) * len(min_samples_leaf) * 3)
# # YOU MUST UPDATE THIS LINE TO HAVE SAME ESTIMATORS AS IN DICTIONARY
print("{} Score: {:0.3}".format('Decision Tree Classifier', score.mean().round(3)), '\n')
print('Elapsed Time: {:0.3}'.format( time.time() - start_time), ' seconds', '\n')
print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed: 20.0min finished


Decision Tree Classifier Score: 0.705 

Elapsed Time: 1.2e+03  seconds 

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.5, learning_rate=0.03, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=18.101298701298703, seed=0, silent=True,
       subsample=1) 

Best Hyperparameters we tested for 
 {'params': [('gamma', 0.5), ('learning_rate', 0.03), ('max_depth', 3), ('n_estimators', 100)], 'score': 0.8279584836423326}


In [32]:
# Your best model from the grid is already fit and saved as best_model
test_predictions = best_model.predict(X_test)
print('Grid Search TEST SCORE:\n')
# function created above should be run before this cell
eval_sklearn_model(y_test, test_predictions, model=best_model, X=X_test)

Grid Search TEST SCORE:

True Negative:  2084 | False Positive:  902
False Negative:  29 | True Positive:  137 

Sensitivity (TP/ TP + FN):  0.825301204819
Specificity (TN/ TN + FP):  0.69792364367 

Accuracy:  0.704631979695
Precision:  0.131857555342
Roc-Auc:  0.838901822965




In [33]:
start_time = time.time()

# Pick which estimators you want to test (example is for random forest)
param_grid = dict(learning_rate = [.02, .03, .04, .05, .06],
                 n_estimators = [100, 200],
                 gamma = [.4, .5, .6, .7, .8],
                 max_depth = [3]
                 )

# Switch out the model here that you would like to test
model = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic')

grid = GridSearchCV(model, param_grid, verbose=1, cv=3, scoring='roc_auc')

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

best_model = grid.best_estimator_

best_model = best_model.fit(X_train, y_train)

score = best_model.score(X_test, y_test)

# print('Number of Models Run: ', len(n_estimators) * len(max_features) * len(min_samples_leaf) * 3)
# # YOU MUST UPDATE THIS LINE TO HAVE SAME ESTIMATORS AS IN DICTIONARY
print("{} Score: {:0.3}".format('Decision Tree Classifier', score.mean().round(3)), '\n')
print('Elapsed Time: {:0.3}'.format( time.time() - start_time), ' seconds', '\n')
print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  1.8min finished


Decision Tree Classifier Score: 0.718 

Elapsed Time: 1.11e+02  seconds 

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.4, learning_rate=0.02, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=18.101298701298703, seed=0, silent=True,
       subsample=1) 

Best Hyperparameters we tested for 
 {'params': [('gamma', 0.4), ('learning_rate', 0.02), ('max_depth', 3), ('n_estimators', 200)], 'score': 0.8294723783614036}


In [34]:
# Your best model from the grid is already fit and saved as best_model
test_predictions = best_model.predict(X_test)
print('Grid Search TEST SCORE:\n')
# function created above should be run before this cell
eval_sklearn_model(y_test, test_predictions, model=best_model, X=X_test)

Grid Search TEST SCORE:

True Negative:  2125 | False Positive:  861
False Negative:  29 | True Positive:  137 

Sensitivity (TP/ TP + FN):  0.825301204819
Specificity (TN/ TN + FP):  0.71165438714 

Accuracy:  0.717639593909
Precision:  0.137274549098
Roc-Auc:  0.842136799038




In [36]:
start_time = time.time()

# Pick which estimators you want to test (example is for random forest)
param_grid = dict(learning_rate = [.015, .02, .025, .03,],
                 n_estimators = [100, 200, 300],
                 gamma = [.3, .35, .4, .45, .5,],
                 max_depth = [3]
                 )

# Switch out the model here that you would like to test
model = XGBClassifier(scale_pos_weight=(6969/385), objective='binary:logistic')

grid = GridSearchCV(model, param_grid, verbose=1, cv=3, scoring='roc_auc')

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

best_model = grid.best_estimator_

best_model = best_model.fit(X_train, y_train)

score = best_model.score(X_test, y_test)

# print('Number of Models Run: ', len(n_estimators) * len(max_features) * len(min_samples_leaf) * 3)
# # YOU MUST UPDATE THIS LINE TO HAVE SAME ESTIMATORS AS IN DICTIONARY
print("{} Score: {:0.3}".format('Decision Tree Classifier', score.mean().round(3)), '\n')
print('Elapsed Time: {:0.3}'.format( time.time() - start_time), ' seconds', '\n')
print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  3.0min finished


Decision Tree Classifier Score: 0.718 

Elapsed Time: 1.84e+02  seconds 

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.35, learning_rate=0.02, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=18.101298701298703, seed=0, silent=True,
       subsample=1) 

Best Hyperparameters we tested for 
 {'params': [('gamma', 0.35), ('learning_rate', 0.02), ('max_depth', 3), ('n_estimators', 200)], 'score': 0.82962359470379}


In [37]:
# Your best model from the grid is already fit and saved as best_model
test_predictions = best_model.predict(X_test)
print('Grid Search TEST SCORE:\n')
# function created above should be run before this cell
eval_sklearn_model(y_test, test_predictions, model=best_model, X=X_test)

Grid Search TEST SCORE:

True Negative:  2125 | False Positive:  861
False Negative:  29 | True Positive:  137 

Sensitivity (TP/ TP + FN):  0.825301204819
Specificity (TN/ TN + FP):  0.71165438714 

Accuracy:  0.717639593909
Precision:  0.137274549098
Roc-Auc:  0.842136799038




In [38]:
best_model.fit(test_features, target)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0.35, learning_rate=0.02, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=18.101298701298703, seed=0, silent=True,
       subsample=1)

## Best I could do with XGBoost

## Let's look at SVMs

In [52]:
from sklearn import svm

In [56]:
svc = svm.SVC(C=1.0, gamma='auto', kernel='rbf', class_weight='balanced', probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
test_predictions = svc.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=svc, X=X_test)

True Negative:  2214 | False Positive:  772
False Negative:  36 | True Positive:  130 

Sensitivity (TP/ TP + FN):  0.78313253012
Specificity (TN/ TN + FP):  0.741460147354 

Accuracy:  0.743654822335
Precision:  0.144124168514
Roc-Auc:  0.834890129843




In [60]:
svc = svm.SVC(C=1.0, gamma='auto', kernel='linear', class_weight='balanced', probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [61]:
test_predictions = svc.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=svc, X=X_test)

True Negative:  2026 | False Positive:  960
False Negative:  29 | True Positive:  137 

Sensitivity (TP/ TP + FN):  0.825301204819
Specificity (TN/ TN + FP):  0.678499665104 

Accuracy:  0.686230964467
Precision:  0.124886052871
Roc-Auc:  0.821597172346




In [62]:
svc = svm.SVC(C=1.0, gamma='auto', kernel='poly', class_weight='balanced', probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [63]:
test_predictions = svc.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=svc, X=X_test)

True Negative:  2261 | False Positive:  725
False Negative:  39 | True Positive:  127 

Sensitivity (TP/ TP + FN):  0.765060240964
Specificity (TN/ TN + FP):  0.757200267917 

Accuracy:  0.757614213198
Precision:  0.149061032864
Roc-Auc:  0.833219683826




In [64]:
svc = svm.SVC(C=1.0, gamma='auto', kernel='sigmoid', class_weight='balanced', probability=True)
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [65]:
test_predictions = svc.predict(X_test)
eval_sklearn_model(y_test, test_predictions, model=svc, X=X_test)

True Negative:  1738 | False Positive:  1248
False Negative:  59 | True Positive:  107 

Sensitivity (TP/ TP + FN):  0.644578313253
Specificity (TN/ TN + FP):  0.582049564635 

Accuracy:  0.585342639594
Precision:  0.0789667896679
Roc-Auc:  0.643505031513




Looking at sklearn's docs for SVM parameter turning

In [None]:
#  Scary long to run.  Leave it for the desktop.

# %%time
# C_range = np.logspace(-2, 10, 13)
# gamma_range = np.logspace(-9, 3, 13)
# param_grid = dict(gamma=gamma_range, C=C_range)
# grid = GridSearchCV(svm.SVC(class_weight='balanced', probability=True), param_grid=param_grid, cv=3)
# grid.fit(X_train, y_train)

# print("The best parameters are %s with a score of %0.2f"
#       % (grid.best_params_, grid.best_score_))

# best_model = grid.best_estimator_
# best_model = best_model.fit(X_train, y_train)
# # Probably could have kept my above gridfit and modified that.  Need to turn that to a function...

In [None]:
# # Your best model from the grid is already fit and saved as best_model
# test_predictions = best_model.predict(X_test)
# print('Grid Search TEST SCORE:\n')
# # function created above should be run before this cell
# eval_sklearn_model(y_test, test_predictions, model=best_model, X=X_test)

Alright, Dale's getting frustrated.

Let's look for help!

Dug this out from Cody Laminack's repo.  I've never used a voting classifier, but it might improve out scores.

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
# from sklearn.ensemble import AdaBoostClassifier

In [None]:
# knn = KNeighborsClassifier()
# lr = LogisticRegression(penalty = 'l2', random_state = 42)
# svm = SVC(probability = True, random_state = 42)
# rf = RandomForestClassifier(random_state = 42)
# etc = ExtraTreesClassifier(random_state = 42)
# gbc = GradientBoostingClassifier(random_state = 42)
# ada = AdaBoostClassifier(random_state = 42)

In [None]:
# from sklearn.ensemble import VotingClassifier

# voter = VotingClassifier(estimators = [('knn', knn),
#                                        ('lr', lr),
#                                        ('extra trees', etc),
#                                        ('random forest', rf),
#                                        ('svm', svm),
#                                        ('gbc', gbc),
#                                        #('ada', ada),
#                                        #('bc', bc)
#                                       ],
#                         voting = 'soft', weights = [1, 1, 3, 1, 1, 2])


# Set up test data and export

In [163]:
test = pd.read_csv('assets/test.csv')

In [164]:
test["Date"] = pd.to_datetime(test["Date"], infer_datetime_format=True)

In [165]:
test["Date"] = pd.to_datetime(test["Date"], infer_datetime_format=True)
test['Address'] = test['Address'].astype('category')
test['Species'] = test['Species'].astype('category')
test['Street'] = test['Street'].astype('category')
test['Trap'] = test['Trap'].astype('category')
test['AddressNumberAndStreet'] = test['AddressNumberAndStreet'].astype('category')

In [166]:
test = pd.merge(test, weather, how='left', on='Date')

In [167]:
test = pd.get_dummies(test, columns=['Species', 'Block', 'Trap'])

In [168]:
test['Month'] = test['Date'].dt.month
test["Day"] = test['Date'].dt.dayofyear

In [169]:
lat = test.Latitude
lon = test.Longitude

In [170]:
#apply haversine function to training dataset, creating a column called 'dist_from_ohare_MI'
test['dist_from_ohare_MI'] = [haversine(y, x, ohare_lon, ohare_lat)[0] for y, x in zip(lon, lat)]
#apply haversine function to training dataset, creating a column called 'dist_from_doty_MI'
test['dist_from_doty_MI'] = [haversine(y, x, doty_lon, doty_lat)[0] for y, x in zip(lon, lat)]

In [171]:
# Make match above
features = test[['Latitude', 'Longitude', 'st1_Tmax', 'st1_Tmin', 'st1_Tavg', 'st1_DewPoint', 'st1_WetBulb', 'st1_SnowFall', 'st1_PrecipTotal', 'st1_StnPressure', 'st1_SeaLevel', 'st1_ResultSpeed', 'st1_ResultDir', 'st1_AvgSpeed', 'st2_Tmax', 'st2_Tmin', 'st2_Tavg', 'st2_DewPoint', 'st2_WetBulb', 'st2_SnowFall', 'st2_PrecipTotal', 'st2_StnPressure', 'st2_SeaLevel', 'st2_ResultSpeed', 'st2_ResultDir', 'st2_AvgSpeed', 'precip_avg', '2wk_precip', '4wk_precip', '90day_precip', 'temp_avg', '2wk_tavg', '4wk_tavg', '90day_tavg', 'tempmin_avg', '2wk_mintemp', '4wk_mintemp', 'dew_avg', '2wk_dew', '4wk_dew', 'Species_CULEX ERRATICUS', 'Species_CULEX PIPIENS', 'Species_CULEX PIPIENS/RESTUANS', 'Species_CULEX RESTUANS', 'Species_CULEX SALINARIUS', 'Species_CULEX TARSALIS', 'Species_CULEX TERRITANS', 'Month', 'Day', 'dist_from_ohare_MI', 'dist_from_doty_MI']]

#features = test.drop(['Date', 'Address', 'Street', 'AddressNumberAndStreet', 'st1_CodeSum', 'st2_CodeSum', 'Trap_T218B', 'Block_26', 'Id', 'Trap_T090C', 'Trap_T002A', 'Trap_T200A', 'Trap_T218A', 'Trap_T002B', 'Trap_T090B', 'Trap_T065A', 'Trap_T200B', 'Trap_T218C', 'Trap_T090A', 'Trap_T234', 'Species_UNSPECIFIED CULEX', 'Trap_T128A'], 1)


In [172]:
pred_features = pd.DataFrame(scale.fit_transform(features), columns=features.columns)

In [173]:
# Whatever model you decided on:
predictions = xgb.predict(pred_features)

In [174]:
submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test.Id, predictions)))
submission = submission.set_index('Id')
submission.to_csv('submission.csv',)

# NN?

In [175]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils

Using Theano backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [176]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7354, 51)
(7354,)
(3152, 51)
(3152,)


In [177]:
X_train = X_train.as_matrix()
X_test = X_test.as_matrix()

In [178]:
model = Sequential()
model.add(Dense(2048, input_shape=(51,)))
model.add(Activation('relu'))
                           
model.add(Dropout(0.5))   # Dropout helps protect the model from memorizing or "overfitting" the training data
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [179]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', ])

In [192]:
y_train.value_counts()

0    6969
1     385
Name: WnvPresent, dtype: int64

In [181]:
model.fit(X_train, y_train, epochs=20,
          verbose=1, validation_data=(X_test, y_test), class_weight={0:(1/6969), 1:(1/385)})

Train on 7354 samples, validate on 3152 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x116ed9748>

In [182]:
score = model.evaluate(X_test, y_test, verbose=1)

print('Test score:', score)
print('Test metric:', model.metrics_names)

Test metric: ['loss', 'acc']


In [183]:
# The predict_classes function outputs the highest probability class
# according to the trained classifier for each input example.
predicted_classes = model.predict_classes(X_test)



In [184]:
# Check which items we got right / wrong
correct_indices = np.nonzero(predicted_classes.reshape(3152,) == np.array(y_test))[0]
incorrect_indices = np.nonzero(predicted_classes.reshape(3152,) != np.array(y_test))[0]

In [185]:
true_pos = ((predicted_classes.reshape(3152,) == np.array(y_test)) & (np.array(y_test) == 1)).sum()
true_neg = ((predicted_classes.reshape(3152,) == np.array(y_test)) & (np.array(y_test) == 0)).sum()

In [188]:
false_neg = ((predicted_classes.reshape(3152,) != np.array(y_test)) & (np.array(y_test) == 1)).sum()
false_pos = ((predicted_classes.reshape(3152,) != np.array(y_test)) & (np.array(y_test) == 0)).sum()

In [189]:
print('True Negative: ', true_neg, '| False Positive: ', false_pos)
print('False Negative: ', false_neg, '| True Positive: ', true_pos, '\n')
sensitivity = true_pos/ (true_pos + false_neg)
specificity = true_neg/ (true_neg + false_pos)
print('Sensitivity (TP/ TP + FN): ', sensitivity)
print('Specificity (TN/ TN + FP): ', specificity)

True Negative:  2251 | False Positive:  735
False Negative:  40 | True Positive:  126 

Sensitivity (TP/ TP + FN):  0.759036144578
Specificity (TN/ TN + FP):  0.753851306095


# Train on full data

In [190]:
X = test_features.as_matrix()
y = target

In [191]:
y.value_counts()

0    9955
1     551
Name: WnvPresent, dtype: int64

In [193]:
model.fit(X, y, epochs=30,
          verbose=1, class_weight={0:(1/9955), 1:(1/551)})

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x115de2d68>

In [194]:
X_pred = pred_features.as_matrix()

In [195]:
predictions = model.predict_classes(X_pred)



In [196]:
submission = pd.DataFrame(columns=['Id', 'WnvPresent'], data=list(zip(test.Id, predictions)))
submission = submission.set_index('Id')
submission.to_csv('submission.csv',)

In [197]:
submission.iloc[0]['WnvPresent'][0]

0

In [198]:
submission['WnvPresent'] = submission['WnvPresent'].apply(lambda x: x[0])

In [199]:
submission['WnvPresent'].value_counts()

0    70079
1    46214
Name: WnvPresent, dtype: int64

In [200]:
submission.to_csv('submission.csv',)

In [None]:
# Kaggle score of 0.65394
# weights and dropout update 0.65111