In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model, datasets

In [4]:
# import some data to play with
iris = datasets.load_iris()
df = pd.DataFrame(np.c_[iris.data, iris.target], columns = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width", "Class"])


data = np.c_[iris.data[:, :2], iris.target] # we only take the first two features.
np.random.shuffle(data)

test_training_split = 0.7

X = data[:, :2]  # we only take the first two features.
y = data[:, 2:]

X_training = X[:int(X.shape[0]*test_training_split),:]
y_training = y[:int(y.shape[0]*test_training_split)]

X_testing = X[int(X.shape[0]*test_training_split):,:]
y_testing = y[int(y.shape[0]*test_training_split):]

classes: 
- Iris Setosa 
- Iris Versicolour 
- Iris Virginica

## We should now take a look at our data to make sure everything looks okay

In [5]:
print(y_testing)
print(y_testing.ravel())
print(y_testing.shape)

[[ 0.]
 [ 1.]
 [ 2.]
 [ 2.]
 [ 2.]
 [ 2.]
 [ 2.]
 [ 2.]
 [ 1.]
 [ 1.]
 [ 2.]
 [ 1.]
 [ 2.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 2.]
 [ 1.]
 [ 0.]
 [ 2.]
 [ 2.]
 [ 2.]
 [ 1.]
 [ 2.]
 [ 0.]
 [ 1.]
 [ 2.]
 [ 0.]
 [ 2.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 2.]
 [ 1.]
 [ 2.]
 [ 2.]
 [ 0.]
 [ 2.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 2.]
 [ 0.]
 [ 1.]]
[ 0.  1.  2.  2.  2.  2.  2.  2.  1.  1.  2.  1.  2.  0.  0.  0.  2.  1.
  0.  2.  2.  2.  1.  2.  0.  1.  2.  0.  2.  0.  0.  0.  0.  2.  1.  2.
  2.  0.  2.  1.  0.  0.  2.  0.  1.]
(45, 1)


### Initializing our logistic regression function imported from sklearn

In [6]:
logreg = linear_model.LogisticRegression()

In [7]:
# We use the initialized function and fit the data.
logreg.fit(X_training, y_training.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
Z = logreg.predict(X_testing)

In [9]:
def classification_rate(y, Z):
    num_right = 0
    for i in range(len(Z)):
        if y[i] == Z[i]:
            num_right = num_right + 1
    return num_right/Z.shape[0]

In [10]:
classification_rate(y_testing.ravel(), Z)

0.7111111111111111

# Well this isn't that great now is it?
## What comes next?

Let's begin implementing k-fold cross validation training and see what our output is

In [15]:
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression

In [16]:
Z_cross_validation = model_selection.cross_val_predict(LogisticRegression(), X, y.ravel(), cv=10)

print(model_selection.cross_val_score(LogisticRegression(), X, y.ravel()))

[ 0.80392157  0.66666667  0.77083333]


In [17]:
classification_rate(y.ravel(), Z_cross_validation)

0.7533333333333333

# This is not that much better
#### There are many different classification algorithms sklearn has available to utilize in a much similar way to what was shown above. If you're interested, try playing around with the following:
1. SVM
2. Naive Bayes
3. Decision Trees
4. Random Forests
5. Neural Networks