In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

## Section 3: Cleaning weather.csv for combination with train and test data

### Part 3.1: Cleaning weather.csv

The weather data has useful information that can be combined with the training and test data. Let's first clean the weather data so that they can be combined with train and test data.

In [2]:
weather = pd.read_csv('../assets/west_nile/input/weather.csv')
train = pd.read_csv('../assets/west_nile/input/train.csv')
test = pd.read_csv('../assets/west_nile/input/test.csv')

In [3]:
#Drop the columns we don't want, including Sunrise and Sunset and PrecipTotal. For now, remove CodeSum.
weather.drop(columns=['Depth', 'Water1', 'SnowFall', 'Heat', 'Sunrise', 'Sunset', 'PrecipTotal', 'CodeSum'], inplace=True)

In [4]:
#Define a function to replace Missing values with the median and converting the result to a float64
def clean_column(string):
    weather[string][weather[string]=='M'] = weather[string][weather[string]!='M'].median()
    weather[string][weather[string]=='T'] = weather[string][weather[string]!='T'].median()
    weather[string] = weather[string].astype('float64')
    return weather

In [5]:
#Clean all the columns from the previous EDA notebook
clean_column('Tavg')
clean_column('Cool')
clean_column('Depart')
clean_column('SeaLevel')
clean_column('AvgSpeed')
clean_column('StnPressure')
clean_column('WetBulb');

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather[string][weather[string]=='M'] = weather[string][weather[string]!='M'].median()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather[string][weather[string]=='T'] = weather[string][weather[string]!='T'].median()


In [6]:
#Convert the 'Date' column to a datetime so that they can be combined together
train['Date'] = pd.to_datetime(train['Date'])
weather['Date'] = pd.to_datetime(weather['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [7]:
#Reset the index and replace them with the date
train.reset_index(drop = True, inplace = True)
weather.reset_index(drop = True, inplace = True)
test.reset_index(drop = True, inplace = True)

train.set_index('Date', inplace = True)
weather.set_index('Date', inplace = True)
test.set_index('Date', inplace = True)

In [8]:
weather.dtypes

Station          int64
Tmax             int64
Tmin             int64
Tavg           float64
Depart         float64
DewPoint         int64
WetBulb        float64
Cool           float64
StnPressure    float64
SeaLevel       float64
ResultSpeed    float64
ResultDir        int64
AvgSpeed       float64
dtype: object

### Part 3.2: Combining weather.csv with train.csv and test.csv

You may wonder if combining weather data with train and test csv will somehow contaminate the one with the other. However, train and test data are for different years. This means that train.csv will get weather data for 2007, 2009, 2011, and 2013, while test.csv will get weather data for 2008, 2010, 2012, and 2014. They will simply be augmented with the appropriate weather data.

In [9]:
#Combining the two together
weather_train_df = train.join(weather[weather['Station']==1])
weather_test_df = test.join(weather[weather['Station']==1])

In [10]:
#Let's do a sanity check to ensure that the length of the datasets are the same
print(f"Length of training dataset: {len(train)}")
print(f"Length of combined weather and training dataset: {len(weather_train_df)}")
print()
print(f"Length of test dataset: {len(test)}")
print(f"Length of combined weather and test dataset: {len(weather_test_df)}")

Length of training dataset: 10506
Length of combined weather and training dataset: 10506

Length of test dataset: 116293
Length of combined weather and test dataset: 116293


Drop the 'Station' column from both datasets and drop 'NumMosquitos' from weather_train_df,since that information is not available in weather_test_df

In [11]:
#Dropping the columns
weather_train_df.drop(columns = ['NumMosquitos', 'Station', 'Trap'], inplace=True)
weather_test_df.drop(columns = ['Station', 'Trap'], inplace=True)

In [12]:
weather_train_df.head()

Unnamed: 0_level_0,Address,Species,Block,Street,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,WnvPresent,Tmax,...,Tavg,Depart,DewPoint,WetBulb,Cool,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0,88,...,74.0,10.0,58,65.0,9.0,29.39,30.11,5.8,18,6.5
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0,88,...,74.0,10.0,58,65.0,9.0,29.39,30.11,5.8,18,6.5
2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,0,88,...,74.0,10.0,58,65.0,9.0,29.39,30.11,5.8,18,6.5
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,0,88,...,74.0,10.0,58,65.0,9.0,29.39,30.11,5.8,18,6.5
2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,0,88,...,74.0,10.0,58,65.0,9.0,29.39,30.11,5.8,18,6.5


In [13]:
weather_test_df.head()

Unnamed: 0_level_0,Id,Address,Species,Block,Street,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,Tmax,...,Tavg,Depart,DewPoint,WetBulb,Cool,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-06-11,1,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,86,...,74.0,7.0,56,64.0,9.0,29.28,29.99,8.9,18,10.0
2008-06-11,2,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,86,...,74.0,7.0,56,64.0,9.0,29.28,29.99,8.9,18,10.0
2008-06-11,3,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,86,...,74.0,7.0,56,64.0,9.0,29.28,29.99,8.9,18,10.0
2008-06-11,4,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,86,...,74.0,7.0,56,64.0,9.0,29.28,29.99,8.9,18,10.0
2008-06-11,5,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,86,...,74.0,7.0,56,64.0,9.0,29.28,29.99,8.9,18,10.0


In [14]:
#Convert species into dummy variables, applying the same list to the test set too
#Taken from 
# https://stackoverflow.com/questions/37425961/dummy-variables-when-not-all-categories-are-present/37451867#37451867
possible_categories = set(list(weather_train_df['Species']))

#Convert the species in train data to dummies
species_train = pd.get_dummies(weather_train_df['Species'].astype(pd.CategoricalDtype(categories=possible_categories)), drop_first= True)
species_train


Unnamed: 0_level_0,CULEX TARSALIS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX PIPIENS,CULEX TERRITANS,CULEX SALINARIUS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-05-29,0,1,0,0,0,0
2007-05-29,0,0,1,0,0,0
2007-05-29,0,0,1,0,0,0
2007-05-29,0,1,0,0,0,0
2007-05-29,0,0,1,0,0,0
...,...,...,...,...,...,...
2013-09-26,0,1,0,0,0,0
2013-09-26,0,1,0,0,0,0
2013-09-26,0,1,0,0,0,0
2013-09-26,0,1,0,0,0,0


In [15]:
#Convert the species in test data to dummies
species_test = pd.get_dummies(weather_test_df['Species'].astype(pd.CategoricalDtype(categories=possible_categories)), drop_first= True)
species_test

Unnamed: 0_level_0,CULEX TARSALIS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX PIPIENS,CULEX TERRITANS,CULEX SALINARIUS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-06-11,0,1,0,0,0,0
2008-06-11,0,0,1,0,0,0
2008-06-11,0,0,0,1,0,0
2008-06-11,0,0,0,0,0,1
2008-06-11,0,0,0,0,1,0
...,...,...,...,...,...,...
2014-10-02,0,0,0,0,0,1
2014-10-02,0,0,0,0,1,0
2014-10-02,1,0,0,0,0,0
2014-10-02,0,0,0,0,0,0


In [16]:
#Concatenating the data together and dropping the original 'Species' column
weather_train_df = pd.concat([weather_train_df, species_train], axis=1)
weather_train_df.drop(columns='Species', inplace=True)

weather_test_df = pd.concat([weather_test_df, species_test], axis=1)
weather_test_df.drop(columns='Species', inplace=True);

In [17]:
X = weather_train_df[weather_train_df.columns.difference(['WnvPresent', 'Id', 'Address', 'Block', 'Street', 'AddressNumberAndStreet'])]
y = weather_train_df['WnvPresent']
X

Unnamed: 0_level_0,AddressAccuracy,AvgSpeed,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Cool,Depart,...,Latitude,Longitude,ResultDir,ResultSpeed,SeaLevel,StnPressure,Tavg,Tmax,Tmin,WetBulb
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-05-29,9,6.5,0,1,0,0,0,0,9.0,10.0,...,41.954690,-87.800991,18,5.8,30.11,29.39,74.0,88,60,65.0
2007-05-29,9,6.5,0,0,1,0,0,0,9.0,10.0,...,41.954690,-87.800991,18,5.8,30.11,29.39,74.0,88,60,65.0
2007-05-29,9,6.5,0,0,1,0,0,0,9.0,10.0,...,41.994991,-87.769279,18,5.8,30.11,29.39,74.0,88,60,65.0
2007-05-29,8,6.5,0,1,0,0,0,0,9.0,10.0,...,41.974089,-87.824812,18,5.8,30.11,29.39,74.0,88,60,65.0
2007-05-29,8,6.5,0,0,1,0,0,0,9.0,10.0,...,41.974089,-87.824812,18,5.8,30.11,29.39,74.0,88,60,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-09-26,8,4.2,0,1,0,0,0,0,0.0,3.0,...,41.763733,-87.742302,8,3.8,30.04,29.34,63.0,75,50,58.0
2013-09-26,8,4.2,0,1,0,0,0,0,0.0,3.0,...,41.987280,-87.666066,8,3.8,30.04,29.34,63.0,75,50,58.0
2013-09-26,9,4.2,0,1,0,0,0,0,0.0,3.0,...,41.912563,-87.668055,8,3.8,30.04,29.34,63.0,75,50,58.0
2013-09-26,9,4.2,0,1,0,0,0,0,0.0,3.0,...,42.009876,-87.807277,8,3.8,30.04,29.34,63.0,75,50,58.0


In [18]:
#Also drop the columns from weather_test_df for later use
weather_test_modified_df = weather_test_df[weather_test_df.columns.difference(['Id', 'Address', 'Block', 'Street', 'AddressNumberAndStreet'])]
weather_test_modified_df

Unnamed: 0_level_0,AddressAccuracy,AvgSpeed,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Cool,Depart,...,Latitude,Longitude,ResultDir,ResultSpeed,SeaLevel,StnPressure,Tavg,Tmax,Tmin,WetBulb
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-06-11,9,10.0,0,1,0,0,0,0,9.0,7.0,...,41.954690,-87.800991,18,8.9,29.99,29.28,74.0,86,61,64.0
2008-06-11,9,10.0,0,0,1,0,0,0,9.0,7.0,...,41.954690,-87.800991,18,8.9,29.99,29.28,74.0,86,61,64.0
2008-06-11,9,10.0,1,0,0,0,0,0,9.0,7.0,...,41.954690,-87.800991,18,8.9,29.99,29.28,74.0,86,61,64.0
2008-06-11,9,10.0,0,0,0,1,0,0,9.0,7.0,...,41.954690,-87.800991,18,8.9,29.99,29.28,74.0,86,61,64.0
2008-06-11,9,10.0,0,0,0,0,0,1,9.0,7.0,...,41.954690,-87.800991,18,8.9,29.99,29.28,74.0,86,61,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-10-02,8,7.9,0,0,0,1,0,0,3.0,10.0,...,41.925652,-87.633590,17,7.2,29.78,29.03,68.0,72,63,63.0
2014-10-02,8,7.9,0,0,0,0,0,1,3.0,10.0,...,41.925652,-87.633590,17,7.2,29.78,29.03,68.0,72,63,63.0
2014-10-02,8,7.9,0,0,0,0,1,0,3.0,10.0,...,41.925652,-87.633590,17,7.2,29.78,29.03,68.0,72,63,63.0
2014-10-02,8,7.9,0,0,0,0,0,0,3.0,10.0,...,41.925652,-87.633590,17,7.2,29.78,29.03,68.0,72,63,63.0


## Train-Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split (X, y, stratify = y, random_state = 42)

In [20]:
X_train.head()

Unnamed: 0_level_0,AddressAccuracy,AvgSpeed,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,Cool,Depart,...,Latitude,Longitude,ResultDir,ResultSpeed,SeaLevel,StnPressure,Tavg,Tmax,Tmin,WetBulb
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-09-06,8,6.0,1,0,0,0,0,0,7.0,5.0,...,41.743402,-87.731435,20,4.7,30.11,29.38,72.0,86,57,64.0
2011-09-12,9,10.9,1,0,0,0,0,0,10.0,10.0,...,41.89923,-87.716788,23,10.5,29.91,29.18,75.0,86,64,65.0
2011-09-30,9,15.0,0,1,0,0,0,0,0.0,-8.0,...,41.766202,-87.562889,34,14.6,30.01,29.32,51.0,57,45,47.0
2007-08-22,5,9.5,1,0,0,0,0,0,15.0,9.0,...,41.726465,-87.585413,21,5.3,29.94,29.23,80.0,90,69,73.0
2007-07-18,8,10.1,1,0,0,0,0,0,12.0,3.0,...,41.862292,-87.64886,24,9.1,29.84,29.11,77.0,85,69,71.0


## Scaling 
- Scaling is required for SVM and KNN which are algorithms that exploit distances or similarities (e.g. in the form of scalar product) between data samples, as they are sensitive to feature transformations. 
-  However, though they are not necessary for graphical-model based classifiers, such as decision trees and Tree-based ensemble methods, it might be a good idea to rescale/standardize the data.
- As such, we have decided to use the scaled data for all our models

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Modelling (dennis)

### SVM
- Support Vector Machine is a linear model for classification and regression problems. It can solve linear and non-linear problems and work well for many practical problems. 
- SVM creates a line or a hyperplane which separates the data into classes.
- They can be applied to both linear and non linear problems, and they are one of the best general purpose algorithm for machine learning inferior only to neural net and deep learning.

In [22]:
#define a simple function to score a give model with the ROC AUC score
# this function is taken from : https://github.com/doyleax/West-Nile-Virus-Prediction/blob/master/Final-NB.ipynb
from sklearn.metrics import roc_auc_score

def score_model(model,X_test,y_test):
    preds = model.predict_proba(X_test)
    pred_list =[]
    
    for x in preds:
        pred_list.append(x[1])
        
    roc_score = roc_auc_score(y_test, pred_list)
    return roc_score

In [23]:
from sklearn.svm import SVC

svmc= SVC(probability=True)
svm_model = svmc.fit(X_train,y_train)
score_model(svm_model,X_test,y_test)


0.7482983096639707

In [24]:
preds = svm_model.predict_proba(X_test)
preds

array([[0.94812892, 0.05187108],
       [0.92649477, 0.07350523],
       [0.9433584 , 0.0566416 ],
       ...,
       [0.9342947 , 0.0657053 ],
       [0.92650248, 0.07349752],
       [0.9138309 , 0.0861691 ]])

### Adaboost
- For Adaboost, the weights are re-assigned to each instance, with higher weights to incorrectly classified instances. 
- AdaBoost is used to boost the performance of decision trees on binary classification problems, thus, it is relevant in our case

In [25]:
from sklearn.ensemble import AdaBoostClassifier

adaboost= AdaBoostClassifier()
ada_model=adaboost.fit(X_train,y_train)
score_model(ada_model,X_test,y_test)

0.811869326485813

### KNN
- KNN is a simple, supervised machine learning algorithm that can be used to solve both classification and regression problems.
- However, it is more widely used in classification problems in the industry, thus it is useful in our case.

In [26]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
score_model(knn,X_test,y_test)
# knn.score(X_train, y_train)


0.6905762165120733

In [None]:
## we save the best model as best_model


just add the best model in later
best_model = knn?ada_model?


## Exporting Predictions

In [29]:
###This function will takes a model and a model name(as a string), generate predictions, 
### and save that as a CSV labeled with the model name and date.
# this function is taken from : https://github.com/doyleax/West-Nile-Virus-Prediction/blob/master/Final-NB.ipynb

import time 
import math
def model_and_export(model, model_name,test_X=X_test):
    pred_list = []
    predictions =  model.predict_proba(test_X)
    for x in predictions:
        pred_list.append(x[1])
    indexes=np.arange(1, len(predictions)+1, 1)
    preds_df = pd.DataFrame(data=[indexes, pred_list]).T
    preds_df.columns =['Id','WnvPresent']
    preds_df['Id'] = preds_df.Id.astype(int)
#     location = '../submissions/{}_{}.csv'.format(model_name, time.strftime("%d_%m_%Y"))
    preds_df.to_csv('../submissions.csv', index=False)
    return

In [28]:
model_and_export(best_model, 'best model')
