# Logistic Regression

In [1]:
# imports
from sklearn.datasets import load_iris

# load the iris dataset object that ships with scikit-learn
iris = load_iris()

# feature matrix (X) and target labels (y)
X, y = iris.data, iris.target

In [2]:
# imports
from sklearn.linear_model import LogisticRegression

# create the classifier and train it on the full dataset
# (fit returns the fitted estimator, so the two steps can be chained)
logreg = LogisticRegression().fit(X, y)

# predicted class label for every observation in X
logreg.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [3]:
# keep the model's predictions for every observation in X
y_pred = logreg.predict(X)

# count the predictions (one per observation)
y_pred.shape[0]

150

#### Classification accuracy
* Proportion of correct predictions
* Common evaluation metric for classification problems

In [4]:
# training accuracy of the logistic regression model:
# the share of true labels in y that y_pred reproduces
from sklearn import metrics
print(metrics.accuracy_score(y_true=y, y_pred=y_pred))

0.96


* When you train and test the model on the same data, the resulting score is called the training accuracy

# KNN (k=5)

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# fit a 5-nearest-neighbours classifier on the full dataset
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# predict the same observations the model was trained on
y_pred = knn.predict(X)

# training accuracy for k=5
print(metrics.accuracy_score(y_true=y, y_pred=y_pred))

0.9666666666666667


# KNN (k=1)

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# fit a 1-nearest-neighbour classifier on the full dataset
knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)

# predict the same observations the model was trained on
y_pred = knn.predict(X)

# training accuracy for k=1
print(metrics.accuracy_score(y_true=y, y_pred=y_pred))

1.0


### Problems with training and testing on the same data 

* The goal is to estimate the likely performance of a model on out-of-sample data.

* But, maximizing training accuracy rewards overly complex models that won't necessarily generalize

* Unnecessarily complex models overfit the training data.

- Models that overfit have learned the noise in the data rather than the signal.
- In the case of KNN, a very low value of k creates a high-complexity model because it follows the noise in the data.

# Train/Test split

In [10]:
# dimensions of the feature matrix and the label vector, one per line
print(X.shape, y.shape, sep="\n")

(150, 4)
(150,)


In [52]:
# STEP 1: partition X and y into a training part and a testing part
from sklearn.model_selection import train_test_split

# test_size=0.4 holds out 40% of the observations for testing;
# random_state=4 is passed so the split uses a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=4,
)

In [53]:
# shapes of the feature matrices after the split: training first, then testing
print(X_train.shape, X_test.shape, sep="\n")

(90, 4)
(60, 4)


In [54]:
# Check the shape of the y train and test data

print(y_train.shape)
print(y_test.shape)

(90,)
(60,)


In [55]:
# Train a new logistic regression model on the training set only
# (X_test / y_test are kept aside for evaluation)
logreg = LogisticRegression()
# fit(...) is the cell's last expression, so its return value is displayed
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
# Make predictions on the testing data
# NOTE: this rebinds y_pred, which earlier cells used for predictions on all of X
y_pred = logreg.predict(X_test)

In [57]:
# testing accuracy: share of y_test labels that the predictions match
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.95


# Repeat for KNN with k=5

In [59]:
# 5-nearest-neighbours classifier, fitted on the training set only
knn = KNeighborsClassifier(n_neighbors=5)

# last expression of the cell, so its return value is displayed
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [60]:
# predict the held-out observations with the current knn model
y_pred = knn.predict(X_test)

# testing accuracy: share of y_test labels that the predictions match
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.9666666666666667


# Repeat for KNN with k=1

In [62]:
# 1-nearest-neighbour classifier, fitted on the training set only
knn = KNeighborsClassifier(n_neighbors=1)

# last expression of the cell, so its return value is displayed
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [63]:
# predict the held-out observations with the current knn model
y_pred = knn.predict(X_test)

# testing accuracy: share of y_test labels that the predictions match
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.95
