### And now to build some models

In [None]:
#import everything
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#load the pre-formatted training data (this is the frame that carries WnvPresent)
df = pd.read_csv(filepath_or_buffer='out_train_v2.csv')

In [None]:
#load the pre-formatted test data that the final predictions are made for
df1 = pd.read_csv(filepath_or_buffer='out_test_v2.csv')

In [None]:
#relabel every test row whose Species contains "UNSPECIFIED CULEX" as "CULEX PIPIENS/RESTUANS"
#NOTE(review): presumably because that label never appears in the training data - confirm
#(.ix has been removed from pandas; .loc does the same boolean-mask/label assignment)
df1.loc[df1.Species.str.contains("UNSPECIFIED CULEX"), 'Species'] = "CULEX PIPIENS/RESTUANS"


In [None]:
#split the training frame into the target (y) and the predictors (X)
y = df["WnvPresent"]
X = df.drop(labels="WnvPresent", axis=1)

In [None]:
#daily temperature spread: the day's maximum minus its minimum
X["TVar"] = X["Tmax"].sub(X["Tmin"])

In [None]:
#same temperature-spread feature on the test data
df1["TVar"] = df1["Tmax"].sub(df1["Tmin"])

In [None]:
#store Species as a pandas categorical in the training predictors
X["Species"] = X["Species"].astype('category')

In [None]:
#store Species as a pandas categorical in the test data as well
df1["Species"] = df1["Species"].astype('category')

In [None]:
#`test` is a second name for the df1 frame (plain assignment, no copy is made), so in-place
#changes made through either name show up under both until `test` is rebound to a new object
test = df1

In [None]:
#store Trap as a pandas categorical in the training predictors
X["Trap"] = X["Trap"].astype('category')

In [None]:
#store Trap as a pandas categorical in the test data as well
df1["Trap"] = df1["Trap"].astype('category')

In [None]:
#one-hot encode Species (first level dropped) and append the indicator columns to X
species_dummies = pd.get_dummies(X["Species"], drop_first=True)
X = pd.concat([X, species_dummies], axis=1)


In [None]:
#one-hot encode Species (first level dropped) and append the indicator columns to the test frame
test_species_dummies = pd.get_dummies(test["Species"], drop_first=True)
test = pd.concat([test, test_species_dummies], axis=1)

In [None]:
#drop the variables we're not going to use: the CSV index column, date/address fields,
#the raw Trap and Species columns (Species is already one-hot encoded above), and the
#weather columns left out of the model
#NOTE(review): NumMosquitos is dropped here but is not in the test drop list - presumably
#the test data has no such column; confirm
train_unused_cols = [
    'Unnamed: 0', 'Date', 'Address', 'Block', 'Street',
    'AddressNumberAndStreet', 'AddressAccuracy',
    'Tavg', 'Tmin', 'NumMosquitos', 'DewPoint',
    'Trap', 'Species',
]
X = X.drop(train_unused_cols, axis=1)

In [None]:
#same clean-up on the test frame (its drop list also removes the Id column)
test_unused_cols = [
    'Unnamed: 0', 'Id', 'Date', 'Address', 'Block', 'Street',
    'AddressNumberAndStreet', 'AddressAccuracy',
    'Tavg', 'Tmin', 'DewPoint',
    'Trap', 'Species',
]
test = test.drop(test_unused_cols, axis=1)

In [None]:
#preview the training predictors so their columns can be compared with the test frame's
X.head(n=5)

In [None]:
#and the test frame, for the side-by-side column comparison (yep, they match)
test.head(n=5)

In [None]:
#let's start importing everything
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [None]:
#scaling the data to make sure all our data is on the same scale
X = pd.concat([X, StandardScaler().fit_transform(X[['Tmax', 'PrecipTotal', 'AvgSpeed', 'length_of_day', 'TVar']])], axis=1)

In [None]:
test = StandardScaler().fit_transform(test)

In [None]:
#IMPORT ALL THE THINGS
from sklearn.neighbors import KNeighborsClassifier
#sklearn.cross_validation was removed in scikit-learn 0.20; the same helpers live in
#sklearn.model_selection (0.18+). Fall back to the old module on older installs.
#NOTE: StratifiedKFold takes different constructor arguments in the two modules.
try:
    from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_curve, auc

In [None]:
#gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
#set our parameters for gradient boosting
#max_features='sqrt' makes each split consider a random subset of features, so the seed is
#pinned (same value as the train/test split in evaluate_model) to make reruns reproducible
gbc = GradientBoostingClassifier(max_features='sqrt', n_estimators=1000, random_state=7)

In [None]:
#labels for the confusion matrix: the first two name the rows, the last two name the columns
names = [
    "present",
    "not present",
    "predicted present",
    "predicted not present",
]

In [None]:
#pull up this monster of a function again to take a look at our train data and generate the model we think is best
def evaluate_model(model, X, y, names, test, threshold=0.05):
    """Fit `model` on a stratified train split and report hold-out diagnostics.

    The data is split 67/33 (stratified on `y`, fixed seed), the model is fitted on
    the training part, and the following are printed for the held-out part: the
    model's own score, the first ten predicted probabilities, a confusion matrix and
    classification report for the thresholded predictions, the shape of the
    probability frame and a preview of the held-out predictors. A ROC curve for
    class 1 is then plotted.

    Parameters
    ----------
    model : estimator providing fit, score and predict_proba (two classes assumed,
        since the probability columns are hard-coded as class 0 / class 1).
    X, y : predictors and binary target.
    names : sequence of four labels; the first two label the confusion-matrix rows
        (actual 1, actual 0), the last two its columns (predicted 1, predicted 0).
    test : unused; kept so existing calls keep working.
    threshold : class-1 probability at or above which a row is predicted positive.

    Returns
    -------
    tuple of (str, float): a label and the mean 5-fold cross-validation score of
    `model` on the training split.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, stratify=y, test_size=0.33, random_state=7)
    model = model.fit(X_train, Y_train)
    print(model.score(X_test, Y_test))

    # Predicted probabilities, plus a hard class obtained by thresholding class 1.
    Y_pp = pd.DataFrame(model.predict_proba(X_test), columns=['class_0_pp', 'class_1_pp'])
    Y_pp['pred_class_thresh'] = (Y_pp['class_1_pp'] >= threshold).astype(int)

    # labels=[1, 0] puts the positive class first, matching the order of `names`.
    conmat = np.array(confusion_matrix(Y_test, Y_pp['pred_class_thresh'], labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=names[0:2], columns=names[2:])

    print(Y_pp.iloc[0:10])
    print(confusion)
    print(classification_report(Y_test, Y_pp['pred_class_thresh']))
    print(Y_pp.shape)
    print(pd.DataFrame(X_test).head())

    # Ranking scores for the ROC curve: use the decision function when the model has
    # one, otherwise fall back to the class-1 probability.
    if hasattr(model, 'decision_function'):
        Y_score = model.decision_function(X_test)
    else:
        Y_score = Y_pp['class_1_pp'].values

    # For class 1, find the area under the curve
    fpr, tpr, _ = roc_curve(Y_test, Y_score)
    roc_auc = auc(fpr, tpr)

    # Plot of a ROC curve for class 1 (WNV present)
    plt.figure(figsize=[11, 9])
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc, linewidth=4)
    plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=18)
    plt.ylabel('True Positive Rate', fontsize=18)
    plt.title('Receiver operating characteristic for WNV detection', fontsize=18)
    plt.legend(loc="lower right")
    plt.show()
    return "cross val mean score is", cross_val_score(model, X_train, Y_train, cv=5).mean()
    
    


#run the evaluation for the gradient boosting model; the returned cross-val summary is the cell output
evaluate_model(model=gbc, X=X, y=y, names=names, test=test)

### Prep model for submission!

In [None]:
#refit the gradient boosting model on all of the training data,
#then get the per-class probabilities for every test row
model = gbc.fit(X=X, y=y)
y_pred = model.predict_proba(X=test)

In [None]:
#hard class predictions for the test rows as well
y_predicted = model.predict(X=test)

In [None]:
#quick look at the predictors the final model was fitted on
X.head(n=5)

In [None]:
#just a tiny peek at what the model thought were the most important features,
#labelled with the column names of X and sorted so the ranking can be read directly
#NOTE(review): an earlier reading of this output credited species, trap and length of day;
#Trap is dropped from X before fitting, so it cannot be among them - re-check against this table
pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

In [None]:
#wrap the predicted probabilities in a dataframe, one named column per class
y_preds = pd.DataFrame(y_pred, columns=["WnvNotPresent", "WnvPresent"])

In [None]:
#attach the predicted WnvPresent probability to the test rows (joined on the row index),
#keep only the Id and probability columns, and write them out
wnv_proba = y_preds[['WnvPresent']]
df_out = df1.merge(wnv_proba, how='left', left_index=True, right_index=True)
df_out = df_out.loc[:, ['Id', 'WnvPresent']]
df_out.to_csv('dummies_unscaled.csv', index=False)

In [None]:
#the same index join again, this time kept whole as the basis for plotting
df_plot = df1.merge(y_preds[['WnvPresent']], how='left', left_index=True, right_index=True)

In [None]:
#the one for plot needs very specific columns
#but we also want to set a threshold for where we think the city of Chicago should spray
#we also wanted the date to be manipulatable
#.copy() makes the column subset an independent frame, so the assignments below do not
#raise SettingWithCopyWarning
df_plot = df_plot[['Longitude', 'Latitude', 'Trap', 'Date', 'WnvPresent']].copy()
#flag rows whose predicted probability is above 0.3 (1 = above, 0 = otherwise)
df_plot["y_or_n"] = (df_plot["WnvPresent"] > 0.3).astype(int)
df_plot['Date'] = pd.to_datetime(df_plot['Date'])
df_plot['year'] = df_plot['Date'].dt.year
df_plot['month'] = df_plot['Date'].dt.month

In [None]:
#write the plotting frame to disk, without the index column
df_plot.to_csv(path_or_buf='dfplot_214.csv', index=False)