In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, classification 
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
#read in the data and format some columns
data = pd.read_csv('warning_level_data.csv', header = 0)
data['StartDate'] = pd.to_datetime(data.StartDate)
data.WarningCode = data.WarningCode.astype(int)
#convert date to a time delta from today in days
data['TimeDelta']=(data.StartDate.apply(lambda x: (dt.datetime.today()-x).days))

In [3]:
data.head()

Unnamed: 0,LocationIdentifier,Latitude,Longitude,StartDate,Pollutant,WarningCode,WarningLevel,TimeDelta
0,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2012-01-23,Nitrate,0,Green,1765
1,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2012-03-13,Nitrate,0,Green,1715
2,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2012-11-28,Nitrate,0,Green,1455
3,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2014-02-09,Nitrate,0,Green,1017
4,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2014-02-11,Nitrate,0,Green,1015


In [4]:
pol = data.Pollutant.unique()
pol

array(['Nitrate', 'Chromium', 'Arsenic', 'Lead', 'Copper', 'Fluoride',
       'Selenium', 'Cadmium', 'Beryllium', 'Mercury', 'Nitrite', 'Barium',
       'Antimony', 'TTHMs', 'Xylene', 'HAA5', 'PCBs', 'Simazine'], dtype=object)

In [5]:
#break into training and test

def splitData(df):
    #check how many unique sites we are measureing this pollutant at
    numLocations = df.LocationIdentifier.unique().size
    
    #pull out randomly selected entire location ids for the test set
    trainLocations = np.random.choice(df.LocationIdentifier.unique(), int(.8*numLocations), replace = False)
    testLocations = np.setdiff1d(df.LocationIdentifier.unique(), trainLocations)

    #subset the data using the train and test location identifiers
    train_data = df[df.LocationIdentifier.isin(trainLocations)]
    test_data = df[df.LocationIdentifier.isin(testLocations)]

    return train_data, test_data

In [7]:
#plots using the confusion matrix to plot precision and recall
#in order to make this useful need to increase the number of K values tried in the runKNNModel function

def PRplots(cm_dict):
    fig, axes = plt.subplots(1,3, sharey=True, figsize=(14,4))
    for k in cm_dict:
        cm = cm_dict[k]
        support = cm.sum(axis=1)
        accuracy = cm.diagonal().sum() / cm.sum().astype(float)
        recall = (cm.diagonal() / support.astype(float))
        precision = (cm.diagonal()  / cm.sum(axis=0).astype(float))
        f1 = 2*precision*recall / (precision+recall)
        for ax, r, p in zip(axes, recall, precision):
            ax.plot(k,r, marker='$R$', c=(1,0,0), markeredgecolor='none', label='recall')
            ax.plot(k,p, marker='$P$', c=(0,0,1), markeredgecolor='none', label='precision')
    [ax.set_title('Warning Code: {}'.format(i), size=16) for i,ax in enumerate(axes)];
    [ax.set_xlabel('k', size=14) for ax in axes];
    fig.tight_layout()


In [21]:
from sklearn.grid_search import GridSearchCV

# using Ashley's function, split the data into testing and training
train_data, test_data = splitData(df)
train_labels = train_data.WarningCode

# find the best parameters in RF and kNN
def best_params(model, parameters, train_data, train_labels):
    RF = RandomForestClassifier()
    kNN = KNeighborsClassifier()
    parameters_RF = [{"n_estimators": [4, 5, 6]}]
    parameters_kNN = [{"n_neighbors": [2, 3, 4, 5, 6, 7, 8]}]
    for i in range(len(pol)):
        df = data[data.Pollutant == pol[i]]
        train_data, test_data = splitData(df)
        train_labels = train_data.WarningCode
        try:
            clf_rf = GridSearchCV(RF, parameters_RF, scoring="accuracy")
            clf_knn = GridSearchCV(kNN, parameters_kNN, scoring="accuracy")
            print 'clf OK'
            clf_rf.fit(X=train_data[['Latitude', 'Longitude', 'TimeDelta']], y=train_data.WarningCode)
            clf_knn.fit(X=train_data[['Latitude', 'Longitude', 'TimeDelta']], y=train_data.WarningCode)
            print 'clf fit OK'
            best_estimator_rf = clf_rf.best_estimator_
            best_estimator_knn = clf_knn.best_estimator_
            #print 'best_estimator OK'
            print ('Best hyperparameters: ')
            log_info('RF: ' + str(clf_rf.best_params_))
            log_info('kNN: ' + str(clf_knn.best_params_))
            log_info('Best hyperparameters: ' + str(clf.best_params_))
            pass
        except:
            print ('For pollutant ', pol[i], ' we don`t have enough data to try different parameters.')
            pass
    return

best_params(model, parameters, train_data, train_labels)


clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Nitrate', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Chromium', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Arsenic', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Lead', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Copper', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Fluoride', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Selenium', ' we don`t have enough data to try different parameters.')
clf OK
clf fit OK
Best hyperparameters: 
('For pollutant ', 'Cadmium', ' we don`t have enou