# Logistic Regression

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import recall_score, confusion_matrix, roc_curve, auc, classification_report
seed=738

## Example 1

In [None]:
# This dataset describes grains of rice in terms of visual properties.
# Class is rice variety: either Cammeo or Osmancık.
# https://archive.ics.uci.edu/ml/datasets/Rice+%28Cammeo+and+Osmancik%29
# (Actually downloaded the data from the paper's website.)
rice = 
rice.

In [None]:
# Check out class distribution
rice.

In [None]:
#Baseline accuracy? (i.e. accuracy for always guessing the majority class)
A = rice.               # majority class
B = rice.

baselineacc =           # the classifier you train must beat this number else no point in training one

print('Baseline Accuracy: '+str((baselineacc*100).round(1))+'%')

In [None]:
# Split data, train logistic regression

X = rice.
y = rice.

Xtrain, Xtest, ytrain, ytest = 

In [None]:
# Fit model, print coefficients
ricelr = 

print(f"Intercept:\n {ricelr.intercept_.round(3)} \nCoefficients:\n {ricelr.coef_.round(3)}")
# This tells you how each one of the features relates to the probability that an example is from the positive or negative class  

In [None]:
# Predict manually
sigmoid = lambda x: 1 / (1 + np.exp(-x)) # a small anonymous function that can take any number of args but can only have one expression
z = ricelr.

z.round(3)

### You can read more about Lambda [here](https://www.w3schools.com/python/python_lambda.asp).

In [None]:
# Now let's see what the probabilities look like:
sigmoid(z).round(2)

In [None]:
# or you can use a built-in function to see the probabilities without hard coding:
ricelr.

# first column: probabilities for class 0 (one minus second column),
# second column: probabilities for class 1 (calculated above)

In [None]:
# Get label predictions
ytest_hat = 

ytest_hat

### Evaluating Classifiers

In [None]:
# Calculate performance measures from scratch
# TP: true positives 
# TN: true negatives 
# FP: False positives 
# FN: False negatives

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def compute_performance(yhat, y, classes):
    """Print confusion counts and standard metrics for a binary classifier.

    Parameters
    ----------
    yhat : array-like
        Predicted labels, one per example.
    y : array-like
        True labels, in the same order as ``yhat``.
    classes : sequence
        ``classes[0]`` is treated as the negative label and ``classes[1]``
        as the positive label.

    Raises
    ------
    ValueError
        If ``yhat`` and ``y`` do not have the same shape.

    Notes
    -----
    A metric whose denominator is zero (for example precision when nothing
    is predicted positive) is printed as ``nan`` instead of raising or
    emitting a divide warning. Nothing is returned; results are printed.
    """
    # Convert up front so lists work and labels are compared position by
    # position rather than by any index the inputs might carry.
    yhat = np.asarray(yhat)
    y = np.asarray(y)
    if yhat.shape != y.shape:
        raise ValueError(
            f"yhat and y must have the same shape, got {yhat.shape} and {y.shape}"
        )

    negative, positive = classes[0], classes[1]
    pred_pos, pred_neg = yhat == positive, yhat == negative
    true_pos, true_neg = y == positive, y == negative

    # First, get tp, tn, fp, fn (as plain ints so the ratios below are
    # ordinary Python divisions guarded by _safe_ratio).
    tp = int(np.count_nonzero(np.logical_and(pred_pos, true_pos)))
    tn = int(np.count_nonzero(np.logical_and(pred_neg, true_neg)))
    fp = int(np.count_nonzero(np.logical_and(pred_pos, true_neg)))
    fn = int(np.count_nonzero(np.logical_and(pred_neg, true_pos)))

    print(f"tp: {tp} tn: {tn} fp: {fp} fn: {fn}")

    # Accuracy: fraction of all counted examples that are labeled correctly.
    acc = _safe_ratio(tp + tn, tp + tn + fp + fn)

    # Precision: "Of the ones I labeled +, how many are actually +?"
    precision = _safe_ratio(tp, tp + fp)

    # Recall: "Of all the + in the data, how many do I correctly label?"
    recall = _safe_ratio(tp, tp + fn)

    # Sensitivity is another name for recall.
    sensitivity = recall

    # Specificity: "Of all the - in the data, how many do I correctly label?"
    specificity = _safe_ratio(tn, fp + tn)

    # Print results
    print("Accuracy:", round(acc, 3), "Recall:", round(recall, 3),
          "Precision:", round(precision, 3),
          "Sensitivity:", round(sensitivity, 3),
          "Specificity:", round(specificity, 3))

compute_performance(ytest_hat, ytest, ricelr.classes_)

# Let's compare against base-line accuracy:
print('\nBaseline Accuracy: ', baselineacc.round(3))

In [None]:
# Now let's experiment by adjusting the decision threshold
threshold = 0.1
ytest_prob = ricelr.predict_proba(Xtest)

# ytest_prob

# ytest_prob[:,1].round(3) > threshold

# [(ytest_prob[:,1] > threshold).astype(int)] # turn them into 0 and 1

# ricelr.classes_ # element 0 for label class 0 and element 1 for label class 1

yhat = ricelr.

# yhat

compute_performance(yhat, ytest, ricelr.classes_)

In [None]:
# Ranking-based criterion: Receiver Operating Characteristic (ROC) curve using sklearn:
  
fpr, tpr, _ =           # 2nd arg: ranking score, 3rd arg: "Osmancik"

ax =sns.lineplot(x=fpr,y=tpr)
ax.set_xlabel("FP Rate")
ax.set_ylabel("TP Rate")

In [None]:
# Area under ROC (AUROC)
auc()

### Multiclass Logistic Regression

In [None]:
# Read data
iris = pd.read_csv("Dataset_iris.csv")
iris.head()

In [None]:
# Check out class distribution
iris.

# we have 3 balanced classes

In [None]:
# Create y and X. Not going to split these data for this demonstration.
X = iris.drop("Species", axis="columns")
y = iris.Species.values

In [None]:
# Note that unlike our convention in class, sklearn makes a parameter vector
# for every class (not just first K-1) even though it is redundant.
IRISLR = LogisticRegression(penalty=None)
irislr = IRISLR.fit(X,y)
print(f"Intercepts:\n {irislr.intercept_.round(3)} \n\nCoefficients:\n {irislr.coef_.round(3)}")

In [None]:
yhat = irislr.
# yhat
yhat_probs = irislr.
# yhat_probs.round(3)

In [None]:
confusion_matrix

# We transpose to get it in the format we saw in slides:
# Rows: Predicted labels
# Columns: True labels 

### Regularization

In [None]:
# l2: sum of the squares of the values of the coefficients
# C: smaller values specify stronger regularization
IRISLR = LogisticRegression                                # Experiment different values for C

irislr = IRISLR.fit(X,y)
print(f"Intercepts:\n {irislr.intercept_.round(3)} \n\nCoefficients:\n {irislr.coef_.round(3)}")

yhat = irislr.predict(X)
yhat_probs = irislr.predict_proba(X)
# sklearn's signature is confusion_matrix(y_true, y_pred), which puts true
# labels on the rows. Transposing gives rows = predicted labels and
# columns = true labels.
confusion_matrix(y, yhat).T

In [None]:
# you could also use `classification_report` from sklearn to output different evaluation metrics


## Example 2

In [None]:
# We use the MNIST dataset, which is a set of 70000 small images of handwritten digits.
# Each image is labeled with the digit it represents.

from sklearn.datasets import fetch_openml # a helper function to download popular datasets
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [None]:
# Datasets loaded by Scikit-Learn generally have a similar dictionary structure, including the following:

print(mnist.keys())
# print('\n', mnist['DESCR']) # DESCR key: describing the dataset
# print('\n', mnist['data']) # data key: contains an array with one row per instance and one column per feature
print('\n', mnist['target']) # target key: contains an array with the labels

In [None]:
# Let’s look at X and y:
X, y = 

print('Shape of X:', X.shape)
print('Shape of y:', y.shape)

In [None]:
# There are 70000 images, and each image has 784 features. This is because each image
# is 28 × 28 pixels, and each feature simply represents one pixel’s intensity, from 0
# (white) to 255 (black). Let’s take a peek at one digit from the dataset. All you need to
# do is grab an instance’s feature vector, reshape it to a 28 × 28 array, and display it
# using Matplotlib’s imshow() function. Let's write a function for this task:

def plot_digit(data):
    """Draw one flattened digit as a 28x28 image on the current axes.

    data: array with 784 values supporting ``reshape``; it is viewed as a
    28x28 grid and shown with the binary colormap, with the axes hidden.
    """
    side = 28
    pixels = data.reshape(side, side)
    plt.imshow(pixels, interpolation="nearest", cmap=mpl.cm.binary)
    plt.axis("off")

In [None]:
plot_digit(X[0])

In [None]:
# This looks like a 5, and indeed that’s what the label tells us:
print('Its label is '+y[0]+', with type being',type(y[0]))

In [None]:
# Note that the label is a string. Most ML algorithms expect numbers, so let’s cast y to integer:
y = 

In [None]:
# To give you a feel for the complexity of the classification task, let's see more images from the dataset.
# Let's create a function first:
 
def plot_digits(instances, images_per_row=10, **options):
    """Tile a batch of flattened 28x28 digits into one image and show it.

    instances: 2-D array with one flattened 784-value image per row.
    images_per_row: upper bound on the number of images in each grid row.
    options: extra keyword arguments forwarded to ``plt.imshow``.
    """
    side = 28
    n_images = len(instances)
    n_cols = min(n_images, images_per_row)
    # Ceiling division: number of grid rows needed to hold every image.
    n_rows = (n_images - 1) // n_cols + 1

    # Pad with all-zero images so the last grid row is complete.
    n_blank = n_rows * n_cols - n_images
    blanks = np.zeros((n_blank, side * side))
    padded = np.concatenate([instances, blanks], axis=0)

    # Lay the images out as (grid row, grid column, pixel row, pixel column).
    grid = padded.reshape((n_rows, n_cols, side, side))

    # Bring the two vertical axes together and the two horizontal axes
    # together, then merge each pair so the grid becomes one flat 2-D image.
    stacked = grid.transpose(0, 2, 1, 3)
    mosaic = stacked.reshape(n_rows * side, n_cols * side)

    plt.imshow(mosaic, cmap=mpl.cm.binary, **options)
    plt.axis("off")

In [None]:
# Let's call the function to see more images:
plt.figure(figsize=(9,9))
example_images = X[:100]
plot_digits(example_images, images_per_row=10)
plt.show()

In [None]:
# Let's look at a few labels and compare them with the images above:
y[:3]

In [None]:
# Create a test set and set it aside:
X_train, X_test, y_train, y_test = 

### Training a Binary Classifier

In [None]:
# Let’s simplify the problem for now and only try to identify one digit, for example, the number 5. 
# This “5-detector” will be an example of a binary classifier, capable of distinguishing between
# just two classes, 5 and not-5.

# Create new y: True for all 5s, False for all other digits:
y_train_5 = 
y_test_5 = 

In [None]:
# Now let’s start with a Stochastic Gradient Descent classifier, using Scikit-Learn’s SGDClassifier class.
# This classifier has the advantage of being capable of handling very large datasets efficiently.
# Create an SGDClassifier and train it on the whole training set:

sgd_clf = 

sgd_clf.

In [None]:
# Now we can use it to detect images of the number 5:
some_digit = 
sgd_clf.
# Looks like it guessed right: if you look at the image grid above,
# the first element is indeed a 5.

In [None]:
# Let’s look at the baseline score:

A =  # the majority class
B = 

baselineacc = A/(A+B)
print('Baseline Accuracy:', baselineacc.round(2))

In [None]:
# That’s right, the baseline accuracy is 91%! 
# This is simply because only about 9% of the images are 5s. So if you always
# guess that an image is not a 5, you will be right about 91% of the time

# Percentage of images that are 5s:
(B/(A+B)*100).round(1)