In [2]:
import pandas as pd
import numpy as np

# glm classifiers
# sgdc (loss, classifier) - ('log', 'logistic regression'), ('hinge', 'linearsvm')
from sklearn.linear_model import SGDClassifier 

# naive bayes
from sklearn.naive_bayes import GaussianNB

# nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

# decision trees
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import classification_report

# datasets
from sklearn.datasets import load_iris

In [None]:
# matplotlib broken for now
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
% matplotlib inline

In [1]:
def estimator_report(true_vals, predicted, estimator="Estimator"):
    """
        Print a titled classification report for one estimator.

        The output is a header line naming the estimator, then the text
        returned by ``classification_report(true_vals, predicted)``, then a
        blank-line separator.

        Parameters
        ----------
        true_vals : {np.array}
            true class labels for test set

        predicted : {np.array}
            estimator predicted values

        estimator : {string}
            Estimator name - formatting purposes

        Returns
        -------
        None
    """
    # Single-argument, parenthesised print calls emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("Classification Report for %s" % estimator)
    print(classification_report(true_vals, predicted))
    print("\n")

In [3]:
# Load the iris dataset; later cells read data.data, data.feature_names and
# data.target from this object.
data = load_iris()

In [4]:
# Build the feature table with one plain string column label per feature.
# The names are passed as a flat list: wrapping them in an extra list makes
# pandas interpret the argument as the levels of a MultiIndex, so plain
# string lookups such as df["petal length (cm)"] stop matching.
df = pd.DataFrame(data.data, columns=data.feature_names)

In [5]:
# Attach the class labels as an extra column so features and labels stay
# row-aligned when the frame is reordered.
df["target"] = data.target

In [6]:
# Shuffle the rows with a seeded numpy generator so the positional
# train/test split (and therefore every reported score) is the same on each
# fresh run of the notebook.
SEED = 42
df = df.reindex(np.random.RandomState(SEED).permutation(df.index))

In [7]:
# Feature columns handed to the classifiers: the two petal measurements.
# To use every non-target column instead, swap in:
# foi = list(set(df.columns) - set(["target"]))
foi = [
    "petal length (cm)",
    "petal width (cm)",
]

In [8]:
# Candidate classifiers, keyed by a human-readable display name.
# NOTE(review): newer scikit-learn releases spell this SGD loss "log_loss";
# confirm against the installed version before upgrading.
estimators = {}
estimators["Logistic Regression"] = SGDClassifier(loss="log", penalty="l2")
estimators["Nearest Neighbours"] = KNeighborsClassifier(15)
estimators["Naive Bayes"] = GaussianNB()
estimators["Random Forest"] = RandomForestClassifier(n_estimators=100)

In [9]:
# Positional hold-out split: the first 70% of the rows train the models and
# the remaining rows are kept for testing.
X, y = df[foi].values, df["target"].values
n_train = int(len(X) * 0.7)  # X and y come from the same frame, so one cut index serves both
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [13]:
# Fit every candidate on the training split and print its test-set report.
# items() is available on dicts in both Python 2 and Python 3, whereas
# iteritems() exists only on Python 2.
for name, classifier in estimators.items():
    # train
    classifier.fit(X_train, y_train)
    # predict
    y_pred = classifier.predict(X_test)
    estimator_report(y_test, y_pred, estimator=name)

Classification Report for Nearest Neighbours
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        18
          1       1.00      1.00      1.00        18
          2       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        45



Classification Report for Naive Bayes
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        18
          1       1.00      1.00      1.00        18
          2       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        45



Classification Report for Random Forest
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        18
          1       1.00      1.00      1.00        18
          2       1.00      1.00      1.00         9

avg / total       1.00      1.00      1.00        45



Classification Report for Logistic Regression
             precisio