In [1]:
#  All of my inputs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.utils import resample

In [2]:
train = pd.read_csv('./assets/clean_train.csv')
sample = pd.read_csv('./assets/sampleSubmission.csv')

In [3]:
train = train.drop("Unnamed: 0", axis=1)

In [4]:
train.head()

Unnamed: 0,NumMosquitos,WnvPresent,Species_CULEX ERRATICUS,Species_CULEX PIPIENS,Species_CULEX RESTUANS,Species_CULEX SALINARIUS,Species_CULEX TARSALIS,Species_CULEX TERRITANS,Week_22,Week_23,...,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Year_2007,Year_2009,Year_2011,Year_2013
0,1,0,0,1,1,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
1,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
2,1,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,1,1,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
4,4,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0


In [5]:
print('Baseline accuracy is', train['WnvPresent'].value_counts(normalize=True).sort_values().values[-1]*100)

Baseline accuracy is 94.7553778793


In [6]:
y = train['WnvPresent']
X = train.drop(['WnvPresent'], axis = 1)

In [7]:
#y = y.iloc[2:]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Random Forest Classifer

In [9]:
params = {
    'n_estimators' : [1, 3, 5, 7, 9, 11],
    'min_samples_leaf' : [1, 3, 5, 7, 9],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'log2'],
    'bootstrap': [True, False]
    
}

In [10]:
GridSearchCV(
    RandomForestClassifier(),
    params
)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 3, 5, 7, 9, 11], 'min_samples_leaf': [1, 3, 5, 7, 9], 'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'log2'], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
%%time

random_forest = GridSearchCV(
    RandomForestClassifier(),
    params
).fit(X_train, y_train)

Wall time: 37.5 s


In [13]:
random_forest.score(X_test, y_test)

0.94861058241339935

In [14]:
random_forest.best_params_

{'bootstrap': True,
 'criterion': 'entropy',
 'max_features': 'log2',
 'min_samples_leaf': 9,
 'n_estimators': 9}

In [15]:
random_forest.best_score_

0.94770910013961163

In [16]:
random_forest.predict_proba(X_test)

array([[ 0.9191059 ,  0.0808941 ],
       [ 1.        ,  0.        ],
       [ 0.63946234,  0.36053766],
       ..., 
       [ 0.99755262,  0.00244738],
       [ 0.91980863,  0.08019137],
       [ 0.97747842,  0.02252158]])

In [17]:
rf_predictions = random_forest.predict_proba(X)[:,1]
#sample['WnvPresent'] = rf_predictions
#sample.to_csv('random_first.csv', index=False)

# Logisitic Regression

In [18]:
lr =GridSearchCV(
    LogisticRegression(),
    {
        'penalty':['l2', 'l1'],
        'C':[0.01, 0.1, 1, 10, 100, 1000]

    }
)

In [19]:
%%time

lr.fit(X_train, y_train)

Wall time: 4.34 s


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2', 'l1'], 'C': [0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
lr.best_score_

0.94758218047975629

In [21]:
lr.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [22]:
lr.score(X_test, y_test)

0.94746859535591932

### Trying to figure out how to make a submission to Kaggle but getting errors

In [23]:
lr_predictions = lr.predict_proba(X)[:,1]
#sample['WnvPresent'] = lr_predictions
#sample.to_csv('lr_first.csv', index=False)

### At the moment, these are my results:

-  Baseline Accurracy is 94.8%
-  Random Forest is 94.8%
-  Logisitic Regression is 94.8%

### Standardizing data didn't yield any differences

In [24]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [25]:
%%time

random_forestss = GridSearchCV(
    RandomForestClassifier(),
    params
).fit(X_train, y_train)

Wall time: 40.7 s


In [26]:
random_forestss.score(X_test, y_test)

0.94670727065093263

In [27]:
random_forestss.best_params_

{'bootstrap': False,
 'criterion': 'entropy',
 'max_features': 'log2',
 'min_samples_leaf': 9,
 'n_estimators': 11}

In [28]:
random_forestss.best_score_

0.94783601979946697

## Testing Russell's Code post train-test-split that he did for his Neural Network

Getting errors because my merged dataframe in the first line has two WnvPresent columns for X and y.  Will need to troubleshoot.

In [None]:
#rejoin train data on index so it can be downsampled to match classes
traindata = X_train.merge(pd.DataFrame(y_train), how = 'left', right_index = True, left_index = True)

#separate minority and majority classes
train_majority = traindata[traindata['WnvPresent'] == 0]
train_minority = traindata[traindata['WnvPresent'] == 1]

#upsample minority class
train_minority_upsampled = resample(train_minority, 
                                     replace = True, 
                                     n_samples = train_majority.shape[0],
                                     random_state = 65)
#combine classes
train_data_upsampled = pd.concat([train_majority, train_minority_upsampled])
#split back into X_train and y_train
X_train = train_data_upsampled.drop(labels = 'WnvPresent', axis = 1, inplace = True)
y_train = train_data_upsampled['WnvPresent']
