In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random

import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.model_selection import train_test_split

Before we start:
- Validation vs. Test
- Data Leakage

Today we are going to talk about **Classification**. The goal will be to classify each observation into one of potentially many classes. We'll start with *binary* classification, or classifying into one of two classes. The following data can be found [here](https://www.kaggle.com/faressayah/college-data).

In [None]:
# can we classify colleges as public or private?
# let's look at a random sample of 200 colleges (n=200 below)
df_college = pd.read_csv('data/college_data.csv').sample(n=200, random_state=2022).reset_index(drop=True)
# what does random_state do?
df_college.head()

In [None]:
# derive the 0/1 target: 1 where the 'private' column reads 'Yes', 0 otherwise
df_college['is_private'] = (df_college['private'] == 'Yes').astype(int)
df_college.head()

In [None]:
# does out of state tuition predict private/public?
# the x-axis has to be 'outstate' (not 'room_board') to answer the question above
plt.scatter(x=df_college['outstate'], y=df_college['is_private'])
plt.xlabel('outstate')
plt.ylabel('is_private')
plt.show()

In [None]:
# a straight-line fit through a 0/1 target doesn't look like a good match...
sns.lmplot(data=df_college, x='outstate', y='is_private')
plt.show()

- Does binary data fit the assumptions from OLS regression?

The goal here will be to predict the *probability* that a given observation falls into the class defined by $1$. In other words, given independent variable $X$ and target variable $Y$, we want to find $P(Y=1|X)$. Additionally, we want to do this with the same *linear* framework as before.

So let's instead try to predict the **Odds** of an event occurring. In sports betting, odds are often used instead of probability:

- "The San Francisco Giants are a long shot, they have 10-1 odds of winning."
- "There's no way the San Francisco 49ers are going to lose, the betting odds are one to five!"
- "Never tell me the odds." - Han Solo, *Star Wars*

If an event has probability $p$ of occurring, then the odds of the event are

$$
Odds = \frac{p}{1-p}
$$

- What is $p$ in the two sports examples above?

In [None]:
# reload every college (no sampling this time) and rebuild the 0/1 target
df_college = pd.read_csv('data/college_data.csv')
df_college['is_private'] = (df_college['private'] == 'Yes').astype(int)

# histogram of out-of-state tuition using 12 bins
df_college['outstate'].hist(bins=12)
plt.show()

In [None]:
# cut tuition into 12 bins and count public (0) vs. private (1) schools per bin;
# these counts are used to estimate P(is_private=1|outstate)
tuition_bins = pd.cut(df_college['outstate'], bins=12)
ct = pd.crosstab(index=tuition_bins, columns=df_college['is_private'])
ct

In [None]:
# share of private schools within each tuition bin
bin_totals = ct[0] + ct[1]
ct['freq'] = ct[1] / bin_totals
ct

In [None]:
# x-coordinate for each bin: the midpoint of its interval
ct['midpoints'] = [interval.mid for interval in ct.index]

# private-school frequency per bin, plotted at the bin midpoints
plt.scatter(x=ct['midpoints'], y=ct['freq'])
plt.show()

In [None]:
# get the odds
def odds(p, cap=75):
    """Convert a probability into odds, p / (1 - p).

    Parameters
    ----------
    p : float
        Probability of the event.
    cap : float, optional
        Value returned in place of "infinite" odds (p >= 1). Defaults to 75.

    Returns
    -------
    float
        p / (1 - p) when p < 1, ``cap`` when p >= 1, and NaN when p is NaN.
    """
    # NaN compares False against everything, so without this guard a missing
    # frequency would fall through to the cap branch and be reported as 75
    if math.isnan(p):
        return float('nan')

    if p < 1.0:
        return p / (1 - p)

    # if the odds are "infinite" return the cap instead
    return cap


# odds of being private for each tuition bin
ct['odds'] = ct['freq'].map(odds)

# odds against the bin midpoints
plt.scatter(x=ct['midpoints'], y=ct['odds'])
plt.show()

Now the Odds here don't look very linear... But the *log* of the odds does.

The number $e=2.718281828...$ is a special constant.
- The rate of change of the function $f(x)=e^x$ is given by $f'(x)=e^x$.
- We also have that $\displaystyle \lim_{n\rightarrow\infty} \left(1+ \frac{1}{n}\right)^n = e$ (related to compound interest)

Consider
$$
y = f(x) = e^x.
$$
This completely determines the inverse relationship, i.e. the function $x=g(y)$:
$$
\ln(y) = \log_e(y) = \log(y) = x
$$

Below we plot $\displaystyle \log\left(\frac{p}{1-p}\right)$ against the independent variable.

In [None]:
# looks a bit linear!
# log(0) is undefined (math.log(0) raises ValueError), so bins whose odds are 0
# are masked to NaN before taking the vectorized log; NaN points are not drawn
positive_odds = ct['odds'].where(ct['odds'] > 0)
ct['log_odds'] = np.log(positive_odds)

# plot the log-odds
plt.scatter(x=ct['midpoints'], y=ct['log_odds'])
plt.show()

### Logistic Regression

Find the parameters $\beta_0, \beta_1$ to create the model

$$
\log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X
$$

Rearranging we have

$$
\begin{align}
\frac{p}{1-p} &= e^{\beta_0 +\beta_1 X}\\
1 + \frac{p}{1-p} &= 1 + e^{\beta_0 +\beta_1 X}\\
\frac{1}{1-p} &= 1 + e^{\beta_0 +\beta_1 X}\\
1-p &= \frac{1}{1 + e^{\beta_0 +\beta_1 X}}\\
1 - \frac{1}{1 + e^{\beta_0 +\beta_1 X}} &= p\\
\frac{ e^{\beta_0 +\beta_1 X}}{1 + e^{\beta_0 +\beta_1 X}} &= p\\
\frac{1}{ e^{-(\beta_0 +\beta_1 X)} + 1} &= p\\
p(X) &= \frac{1}{1 + e^{-(\beta_0 +\beta_1 X)}}
\end{align}
$$

Thus performing linear regression on the log-odds is equivalent to fitting a logistic function to the data.
- The function $p(x) = \frac{1}{1+e^{-x}}$ is called a *sigmoid* function.

In [None]:
# sigmoid p(x) = 1 / (1 + e^{-x}) evaluated at 200 points on [-10, 10]
x = np.linspace(-10, 10, 200)
exp_neg_x = np.exp(-x)
p = 1/(1 + exp_neg_x)
plt.plot(x, p)
plt.show()

In [None]:
# hand-picked logistic model: slope 1/1000 and intercept -8
x = np.linspace(0, 20000, 10000)
linear_term = (1/1000)*x-8
p = 1/(1 + np.exp(-linear_term))

# raw 0/1 labels with the curve drawn on top
plt.scatter(x=df_college['outstate'], y=df_college['is_private'])
plt.plot(x, p)
plt.show()

# per-bin private frequencies with the same curve
plt.scatter(x=ct['midpoints'], y=ct['freq'])
plt.plot(x, p)
plt.show()

You can see here that the logistic model is estimating the probability values of the target variable with respect to the input variable.

- What is missing here before we can fit the model?

### Likelihood

- For our probability function and an observation $(x_i, y_i)$: What is the likelihood of $(x_i, y_i)$ occurring with the probability function $p(X)$?
- Note here $y_i = 0$ OR $y_i = 1$.

$$
\mathcal{L}(\beta_0, \beta_1) = p(x_i)^{y_i}(1-p(x_i))^{1-y_i}
$$

- We want to maximize the likelihood function!
- This is equivalent to maximizing the **log-likelihood** function:

$$
\mathcal{l}(\beta_0, \beta_1) = \log(p(x_i)^{y_i}(1-p(x_i))^{1-y_i}) = y_i\log(p(x_i)) + (1-y_i)\log(1-p(x_i))
$$

In turn this is equivalent to *minimizing* the negative log-likelihood function:

$$
-\mathcal{l}(\beta_0, \beta_1) = -y_i\log(p(\beta_0, \beta_1, x_i)) - (1-y_i)\log(1-p(\beta_0, \beta_1, x_i))
$$

- Note the inclusion of $\beta_0, \beta_1$ in the $p$ function.

For many observations we arrive at

$$
-\mathcal{l}(\beta_0, \beta_1) = \sum_{i=1}^N \left[-y_i\log(p(\beta_0, \beta_1, x_i)) - (1-y_i)\log(1-p(\beta_0, \beta_1, x_i))\right]
$$

### Summary

Given $N$ data points $(x_i, y_i)$, find parameters $\beta_0, \beta_1$ for the model
$$
p(X) = \frac{1}{1 + e^{-(\beta_0 +\beta_1 X)}}
$$
that minimize the loss function
$$
L(\beta_0,\beta_1) = -\sum_{i=1}^N \left[y_i\log(p(x_i)) + (1-y_i)\log(1-p(x_i))\right]
$$

In [None]:
from statsmodels.formula.api import logit

# logistic regression of is_private on out-of-state tuition
model = logit('is_private ~ outstate', data=df_college)
res = model.fit()
res.summary()

In [None]:
# fitted parameters, used below as intercept (b0) and slope (b1)
b0, b1 = res.params

# fitted probability curve over the tuition range
x = np.linspace(0, 20000, 10000)
p = 1/(1 + np.exp(-(b1*x+b0)))

# plot the model against the 0-1 values
plt.scatter(x=df_college['outstate'], y=df_college['is_private'])
plt.plot(x, p)

plt.show()

In [None]:
# compare the fitted curve (x, p computed in an earlier cell) with the per-bin frequencies
plt.scatter(ct['midpoints'], ct['freq'])
plt.plot(x, p)
plt.show()

In [None]:
# seaborn's lmplot can draw the logistic fit directly
sns.lmplot(data=df_college, x="outstate", y="is_private", logistic=True)
plt.show()

### Classification Metrics
- How do we turn our probability prediction into a binary prediction?
- What does it mean to set a *threshold* of 0.5?

In [None]:
# input variable and true labels for scoring the fitted model
x = df_college['outstate']
y = df_college['is_private']

# model output for every college; show the first 20
y_pred_prob = res.predict(x)
y_pred_prob.head(20)

In [None]:
# threshold at 0.5: probabilities of at least 0.5 become class 1, the rest class 0
y_pred = (y_pred_prob >= 0.5).astype(int)
y_pred.head(20)

In [None]:
# sklearn's metrics package provides the classification scores used below
from sklearn import metrics

# compare the thresholded predictions with the true labels
acc = metrics.accuracy_score(y_true=y, y_pred=y_pred)
print(f'The Accuracy of this model is {100*acc}%')

Looks good... but how good is this really?

In [None]:
# baseline "model": predict class 1 (private) for every college
y_all_ones = np.full(len(y), 1.0)

acc = metrics.accuracy_score(y_true=y, y_pred=y_all_ones)
print(f'The Accuracy of always predicting private is {100*acc}%')

This is what we call an *imbalanced* dataset. There is not an even 50/50 split of private and public schools. To get a better idea of how our model is performing we can look at the **Confusion Matrix**

In [None]:
# tabulate true labels against predicted labels
cm = metrics.confusion_matrix(y_true=y, y_pred=y_pred)
# wrap the matrix in sklearn's display helper and draw it
disp = metrics.ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

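In this plot the rows are the true labels and the columns are the predicted labels, so the bottom-left corner is the number of False Negatives (true class 1, predicted 0) and the top-right corner is the number of False Positives (true class 0, predicted 1).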

- The **Precision** measures how much we can trust the model's prediction of class 1:

$$
\text{Precision }= \frac{True Positives}{All Predicted Positives}
$$

- The **Recall** measures how well the model "finds" all of class 1:

$$
\text{Recall }= \frac{True Positives}{Total Positives}
$$

What is the Precision and Recall above?

In [None]:
# recall and precision of the thresholded model predictions
recall = metrics.recall_score(y_true=y, y_pred=y_pred)
prec = metrics.precision_score(y_true=y, y_pred=y_pred)

print(f'The Recall of this model is {recall}')
print(f'The Precision of this model is {prec}')

In [None]:
# the same two scores for the always-predict-1 baseline
recall = metrics.recall_score(y_true=y, y_pred=y_all_ones)
prec = metrics.precision_score(y_true=y, y_pred=y_all_ones)

print(f'The Recall of the all ones model is {recall}')
print(f'The Precision of the all ones model is {prec}')

- Will changing the threshold of class 1 change the precision, recall, and accuracy?

![img](https://upload.wikimedia.org/wikipedia/commons/3/36/ROC_space-2.png)

In [None]:
# sweep thresholds over the predicted probabilities to get (FPR, TPR) pairs
FPR, TPR, thresholds = metrics.roc_curve(y_true=y, y_score=y_pred_prob)

In [None]:
# first ten and last ten thresholds returned by roc_curve
thresholds[:10], thresholds[-10:]

In [None]:
# first ten and last ten false-positive rates, in the same order as `thresholds`
# does this make sense? Why?
FPR[:10], FPR[-10:]

In [None]:
# ROC curve: true-positive rate against false-positive rate across thresholds
plt.plot(FPR, TPR)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve (ROC)')
plt.show()

To get a single value, people often use the **A**rea **U**nder the **C**urve (AUC).

In [None]:
# single-number summary of the ROC curve
auc = metrics.roc_auc_score(y_true=y, y_score=y_pred_prob)
print(f'The AUC of this model is {auc}')

- What is the AUC of randomly guessing?

### Multiple independent variables

- Generalized Linear Models

**Example (2 indep.):**

Given $N$ data points $((x_1)_i, (x_2)_i, y_i)$, find parameters $\beta_0, \beta_1, \beta_2$ for the model
$$
p(X) = \frac{1}{1 + e^{-(\beta_0 +\beta_1 X_1 + \beta_2 X_2)}}
$$
that minimize the loss function
$$
L(\beta_0,\beta_1, \beta_2) = -\sum_{i=1}^N \left[y_i\log(p_i) + (1-y_i)\log(1-p_i)\right], \quad p_i = p((x_1)_i, (x_2)_i)
$$

In [None]:
# logistic regression with two independent variables
model = logit('is_private ~ outstate + s_f_ratio', data=df_college)
res = model.fit()
res.summary()

In [None]:
# the two independent variables against each other, coloured by class
sns.scatterplot(data=df_college, x='outstate', y='s_f_ratio', hue='is_private')
plt.show()

The **Decision Boundary** is the boundary between the model predicting class 0 and class 1

$$
p(X) = \frac{1}{1 + e^{-(\beta_0 +\beta_1 X_1 + \beta_2 X_2)}}=0.5
$$

equivalent to

$$
\beta_0 +\beta_1 X_1 + \beta_2 X_2 = 0\\
X_2 = -\frac{\beta_1}{\beta_2}X_1 - \frac{\beta_0}{\beta_2}
$$

In [None]:
# class-coloured scatter of the two independent variables
sns.scatterplot(data=df_college, x='outstate', y='s_f_ratio', hue='is_private')

# decision boundary X2 = -(b1/b2) X1 - b0/b2, drawn through (0, -b0/b2)
b0, b1, b2 = res.params
boundary_intercept = -b0/b2
boundary_slope = -b1/b2
plt.axline((0, boundary_intercept), slope=boundary_slope, color='green', linestyle='--')

plt.show()

Let's use what we learned about validation sets to evaluate this model.

In [None]:
# hold out 20% of the colleges for validation; random_state fixes the shuffle so
# the split (and every metric computed from it) is the same on each run
train, val = train_test_split(df_college, test_size=0.2, random_state=2022)

In [None]:
# fit on the training split only
model = logit('is_private ~ outstate + s_f_ratio', data=train)
res = model.fit()

# score the held-out validation split
x = val[['outstate', 's_f_ratio']]
y = val['is_private']

# probabilities, then 0/1 labels using a 0.5 threshold
y_pred_prob = res.predict(x)
y_pred = (y_pred_prob >= 0.5).astype(int)

acc = metrics.accuracy_score(y_true=y, y_pred=y_pred)
recall = metrics.recall_score(y_true=y, y_pred=y_pred)
prec = metrics.precision_score(y_true=y, y_pred=y_pred)

print(f'The Accuracy of the model on the validation set is {acc}')
print(f'The Recall of the model on the validation set is {recall}')
print(f'The Precision of the model on the validation set is {prec}')

In [None]:
# score the same fitted model on the data it was trained on
x = train[['outstate', 's_f_ratio']]
y = train['is_private']

# probabilities, then 0/1 labels using a 0.5 threshold
y_pred_prob = res.predict(x)
y_pred = (y_pred_prob >= 0.5).astype(int)

acc = metrics.accuracy_score(y_true=y, y_pred=y_pred)
recall = metrics.recall_score(y_true=y, y_pred=y_pred)
prec = metrics.precision_score(y_true=y, y_pred=y_pred)

print(f'The Accuracy of the model on the train set is {acc}')
print(f'The Recall of the model on the train set is {recall}')
print(f'The Precision of the model on the train set is {prec}')

### Imbalanced Classification

Recall that this dataset is *imbalanced* meaning that there are more class 1 than class 0 datapoints.
- Why might this cause issues during model development?
- What if there are differences between class balance from train to validation to test?

In [None]:
# fraction of private schools in each split (the mean of a 0/1 column)
train['is_private'].mean(), val['is_private'].mean()

In [None]:
# stratify on the target so both splits keep the same class proportions;
# random_state fixes the shuffle so the split is the same on each run
train, val = train_test_split(
    df_college,
    test_size=0.1,
    stratify=df_college['is_private'],
    random_state=2022,
)

In [None]:
# fraction of private schools in each split (the mean of a 0/1 column)
train['is_private'].mean(), val['is_private'].mean()

Some Techniques
- Under/Over-sampling
- Weighting the loss function

In [None]:
# (number of private schools, number of public schools)
df_college['is_private'].sum(), (1 - df_college['is_private']).sum()

In [None]:
# rows ordered by the target, so the class-0 rows come first
df_college.sort_values(by='is_private').head(5)

In [None]:
# under-sample: keep at most 200 randomly chosen rows from each class.
# Rows are selected by label rather than by position in a sorted frame, so each
# subset contains a single class even if a class has fewer than 200 rows.
n_per_class = 200
public_rows = df_college[df_college['is_private'] == 0]
private_rows = df_college[df_college['is_private'] == 1]

df_public = public_rows.sample(n=min(n_per_class, len(public_rows)), random_state=2022)
df_private = private_rows.sample(n=min(n_per_class, len(private_rows)), random_state=2022)
df = pd.concat((df_public, df_private))

# (number of private rows, number of public rows) after under-sampling
df['is_private'].sum(), (df['is_private'] == 0).sum()

In [None]:
# over-sample: append 200 extra public rows to the full dataset.
# Rows are selected by label (not by position in a sorted frame) and drawn with
# replacement, so this works whatever the size of the public class.
public_rows = df_college[df_college['is_private'] == 0]
df_public = public_rows.sample(n=200, replace=True, random_state=2022)
df = pd.concat((df_college, df_public))

# (number of private rows, number of public rows) after over-sampling
df['is_private'].sum(), (df['is_private'] == 0).sum()

Let $w_i$ be the positive example weight

$$
L(\beta_0,\beta_1, \beta_2) = -\sum_{i=1}^N \left[w_i y_i\log(p_i) + (1-y_i)\log(1-p_i)\right], \quad p_i = p((x_1)_i, (x_2)_i)
$$

- What happens if $0 < w_i < 1$ or if $w_i > 1$?

In [None]:
from sklearn.linear_model import LogisticRegression

# two independent variables and the 0/1 target
x = df_college[['outstate', 's_f_ratio']]
y = df_college['is_private']

# class 1 gets weight 100, class 0 gets weight 1
class_weights = {0: 1.0, 1: 100.0}
model = LogisticRegression(class_weight=class_weights, max_iter=400)
model.fit(x, y)

# 0,1 predictions
y_pred = model.predict(x)

# probability outputs: second column of predict_proba
class_probabilities = model.predict_proba(x)
y_pred_prob = class_probabilities[:, 1]

In [None]:
# first ten class predictions next to the first ten predicted probabilities
y_pred[:10], y_pred_prob[:10]

In [None]:
# accuracy, recall and precision of the class-weighted model's predictions
acc = metrics.accuracy_score(y_true=y, y_pred=y_pred)
recall = metrics.recall_score(y_true=y, y_pred=y_pred)
prec = metrics.precision_score(y_true=y, y_pred=y_pred)

print(f'The Accuracy of the model is {acc}')
print(f'The Recall of the model is {recall}')
print(f'The Precision of the model is {prec}')

Quiz3 Next Week:
- Summary Above
- Classification Metrics
- Imbalanced Datasets