In [57]:
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

In [2]:
#read in the data and format some columns
data = pd.read_csv('warning_level_data.csv', header = 0)
data['StartDate'] = pd.to_datetime(data.StartDate)
data['WarningCode'] = data.WarningCode.astype(int)
#convert date to a time delta in days from today.
#The clock is read once so every row is measured against the same reference
#instant (reading it per row is slower and can straddle midnight).
reference_date = pd.Timestamp(dt.datetime.today())
data['TimeDelta'] = (reference_date - data.StartDate).dt.days

In [3]:
# peek at the first five rows to sanity-check the parsed columns
data.head(n=5)

Unnamed: 0,LocationIdentifier,Latitude,Longitude,StartDate,Pollutant,WarningCode,WarningLevel
0,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2012-01-23,Nitrate,0,Green
1,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2012-03-13,Nitrate,0,Green
2,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2012-11-28,Nitrate,0,Green
3,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2014-02-09,Nitrate,0,Green
4,11NPSWRD_WQX-PORE_319_EU-L,37.970285,-122.727744,2014-02-11,Nitrate,0,Green


In [16]:
# distinct pollutant names present in the data; one model is fit per pollutant
pol = data['Pollutant'].unique()
pol

array(['Nitrate', 'Chromium', 'Arsenic', 'Lead', 'Copper', 'Fluoride',
       'Selenium', 'Cadmium', 'Beryllium', 'Mercury', 'Nitrite', 'Barium',
       'Antimony', 'TTHMs', 'Xylene', 'HAA5', 'PCBs', 'Simazine'], dtype=object)

In [41]:
#break into training and test

def splitData(df, train_frac=.8, seed=None):
    """Split a frame into train and test sets by whole location.

    All rows sharing a ``LocationIdentifier`` land on the same side of the
    split, so a site never appears in both the training and the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``LocationIdentifier`` column.
    train_frac : float, optional
        Fraction of the unique locations assigned to training. The number of
        training locations is ``int(train_frac * n_locations)`` (rounded
        down), so a frame with a single location yields an empty training set.
    seed : int, optional
        When given, a private ``RandomState`` seeded with this value is used,
        making the split reproducible. When ``None`` the global numpy random
        state is used.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Together they contain every row of ``df`` exactly once.
    """
    #unique sites we are measuring this pollutant at (computed once, reused below)
    locations = df.LocationIdentifier.unique()
    numTrain = int(train_frac * locations.size)

    #pull out randomly selected entire location ids for the training set
    if numTrain > 0:
        rng = np.random if seed is None else np.random.RandomState(seed)
        trainLocations = rng.choice(locations, numTrain, replace=False)
    else:
        #nothing to sample; also avoids calling choice() on an empty array
        trainLocations = locations[:0]

    #rows at a training location go to train, every other row goes to test
    inTrain = df.LocationIdentifier.isin(trainLocations)
    train_data = df[inTrain]
    test_data = df[~inTrain]

    return train_data, test_data

In [77]:
# instantiate model, train it, score it.
def runKNNModel(train_data, test_data, k_values=(2, 3, 4, 5)):
    """Fit distance-weighted KNN classifiers for several k and score each one.

    The features are ``Latitude``, ``Longitude`` and ``TimeDelta``; the target
    is ``WarningCode``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the feature columns and ``WarningCode``.
    k_values : iterable of int, optional
        Neighbour counts to try, in order. Defaults to 2 through 5.

    Returns
    -------
    accuracy : list of float
        Test-set accuracy for each k, in the order of ``k_values``.
    cm_dict : dict
        Maps each k to the confusion matrix of its test-set predictions.
    report : str or None
        Classification report for the LAST k tried only (``None`` when
        ``k_values`` is empty).
    """
    features = ['Latitude', 'Longitude', 'TimeDelta']
    X_train, y_train = train_data[features], train_data.WarningCode
    X_test, y_test = test_data[features], test_data.WarningCode

    cm_dict = {}
    accuracy = []
    y_pred = None
    for nn in k_values:
        knn = KNeighborsClassifier(n_neighbors=nn, weights='distance')
        model = knn.fit(X=X_train, y=y_train)
        y_pred = model.predict(X=X_test)

        #calculates and saves the confusion matrix for plotting and calculating other metrics
        cm_dict[nn] = confusion_matrix(y_test, y_pred)

        #accuracy is the fraction of correct predictions; reusing y_pred avoids
        #a second neighbour search through model.score()
        accuracy.append(np.mean(y_pred == y_test.values))

    #report only for the last k tried
    report = None if y_pred is None else classification_report(y_test, y_pred)

    return accuracy, cm_dict, report

In [51]:
#plots using the confusion matrix to plot precision and recall
#in order to make this useful need to increase the number of K values tried in the runKNNModel function

def PRplots(cm_dict, labels=None):
    """Plot per-class recall and precision against k, one panel per class.

    Parameters
    ----------
    cm_dict : dict
        Maps k to a square confusion-matrix array with true classes on the
        rows and predicted classes on the columns.
    labels : sequence, optional
        Warning codes to show in the panel titles, in confusion-matrix order.
        When omitted the panels are titled with the matrix index, which only
        matches the real warning code if every code 0..n-1 is present.

    Returns
    -------
    None. The figure is drawn as a side effect.
    """
    #one panel per class actually present instead of a fixed three
    n_classes = max(cm.shape[0] for cm in cm_dict.values()) if cm_dict else 3
    fig, axes = plt.subplots(1, n_classes, sharey=True, squeeze=False,
                             figsize=(14 * n_classes / 3.0, 4))
    axes = axes.ravel()

    for k, cm in cm_dict.items():
        correct = cm.diagonal().astype(float)
        #a class with no true or no predicted samples gives 0/0; let it become
        #NaN quietly so that point is simply left off the plot
        with np.errstate(divide='ignore', invalid='ignore'):
            recall = correct / cm.sum(axis=1)
            precision = correct / cm.sum(axis=0)
        for ax, r, p in zip(axes, recall, precision):
            ax.plot(k, r, marker='$R$', c=(1,0,0), markeredgecolor='none', label='recall')
            ax.plot(k, p, marker='$P$', c=(0,0,1), markeredgecolor='none', label='precision')

    for i, ax in enumerate(axes):
        code = labels[i] if labels is not None and i < len(labels) else i
        ax.set_title('Warning Code: {}'.format(code), size=16)
        ax.set_xlabel('k', size=14)
    fig.tight_layout()


In [78]:
#iterate through the pollutant groups with various k values
for pollutant in pol:
    df = data[data.Pollutant == pollutant]
    train, test = splitData(df)
    output, cm, report = runKNNModel(train, test)
    #single-argument print(...) behaves the same on Python 2 and Python 3
    print('**************************** %s ****************************' % pollutant)
    #the output here is the accuracy for each k value for that pollutant
    print(output)
    #the report is the last k value's precision/recall report
    print(report)


**************************** Nitrate ****************************
[0.87680288461538458, 0.88611778846153844, 0.88822115384615385, 0.88852163461538458]
             precision    recall  f1-score   support

          0       0.95      0.92      0.94      2933
          2       0.53      0.63      0.57       395

avg / total       0.90      0.89      0.89      3328

**************************** Chromium ****************************
[1.0, 1.0, 1.0, 1.0]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1079
          2       1.00      1.00      1.00         2

avg / total       1.00      1.00      1.00      1081

**************************** Arsenic ****************************
[0.87693127330847098, 0.88545551411827383, 0.89025039957378793, 0.89451251997868941]
             precision    recall  f1-score   support

          0       0.66      0.76      0.70        33
          1       0.94      0.94      0.94      1694
          2       0.

In [72]:
def runRFModel(train_data, test_data, n_estimators=20, random_state=None):
    """Fit a random forest and return its test-set classification report.

    The features are ``Latitude``, ``Longitude`` and ``TimeDelta``; the target
    is ``WarningCode``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the feature columns and ``WarningCode``.
    n_estimators : int, optional
        Number of trees in the forest (default 20).
    random_state : int, optional
        Passed to ``RandomForestClassifier``; set it for a repeatable forest.

    Returns
    -------
    str
        The text produced by ``classification_report`` for the test set.
    """
    features = ['Latitude', 'Longitude', 'TimeDelta']
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model = rf.fit(X=train_data[features], y=train_data.WarningCode)
    y_pred = model.predict(X=test_data[features])

    #the previous accuracy score was computed and then discarded, costing a
    #second prediction pass; only the report is returned
    return classification_report(test_data.WarningCode, y_pred)


In [75]:
#iterate through the pollutant groups for random forest model
for pollutant in pol:
    df = data[data.Pollutant == pollutant]
    train, test = splitData(df)
    output = runRFModel(train, test)
    #single-argument print(...) behaves the same on Python 2 and Python 3
    print('**************************** %s ****************************' % pollutant)
    print(output)

**************************** Nitrate ****************************
             precision    recall  f1-score   support

          0       0.93      0.93      0.93      2786
          2       0.55      0.57      0.56       427

avg / total       0.88      0.88      0.88      3213

**************************** Chromium ****************************
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1091
          2       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00      1092

**************************** Arsenic ****************************
             precision    recall  f1-score   support

          0       0.57      0.33      0.42        83
          1       0.93      0.97      0.95      1692
          2       0.65      0.45      0.53       152

avg / total       0.89      0.90      0.89      1927

**************************** Lead ****************************
             precision    recall  f1-sco