# <font color="#49699E" size=40>Supervised Learning with Regression and Cross-Validation</font>

# LEARNING OBJECTIVES
# LEARNING MATERIALS


# INTRODUCTION
## Imports

In [ ]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from dcss.plotting import plot_knn_decision_boundaries
from dcss.plotting import custom_seaborn
# Call the plotting helper imported from dcss.plotting before any figure is drawn.
# NOTE(review): presumably this sets seaborn/matplotlib style defaults — confirm in dcss.plotting.
custom_seaborn()

## A Very Brief Refresher on Linear and Logistic Regression Models


## Preparing the Data


In [ ]:
# Read the two CSV files used in this notebook: the combined file feeds the
# regression sections, the V-Dem-only file feeds the classification section.
vdem_fh_df = pd.read_csv(
    filepath_or_buffer="../data/vdem_internet_freedom_combined/vdem_fh_combined.csv",
)
vdem_df = pd.read_csv(
    filepath_or_buffer="../data/vdem_internet_freedom_combined/vdem_only.csv",
)

# Preview the first rows of the V-Dem-only frame.
vdem_df.head()

## The Train-Test Split


In [ ]:
from sklearn.model_selection import train_test_split

# Predictors: the five `v2x_*` columns listed below.
X = vdem_fh_df[
    [
        'v2x_polyarchy',
        'v2x_libdem',
        'v2x_partipdem',
        'v2x_delibdem',
        'v2x_egaldem',
    ]
]
# Target is selected with double brackets, so it stays a one-column DataFrame.
y = vdem_fh_df[['Total Score']]

# Shuffled split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=23,
)

# Supervised Learning with Linear Regression


## Ordinary Least Squares (OLS) Regression


In [ ]:
from sklearn.linear_model import LinearRegression
# Create an unfitted linear regression estimator with default settings.
ols = LinearRegression()

In [ ]:
# Fit the estimator on the training partition only; the test partition is not touched here.
ols.fit(X_train, y_train)

In [ ]:
# Header line: the label "Intercept" followed by the predictor names in column order.
print("Intercept", X_train.columns.tolist())
# Value line: the fitted intercept followed by the fitted coefficients.
print(ols.intercept_, ols.coef_)

In [ ]:
# Score the fitted model on the same data it was trained on (in-sample fit).
ols.score(X_train, y_train)

## CROSS-VALIDATION


### Putting The Two Together: OLS and CV


In [ ]:
from sklearn.model_selection import cross_val_score

# Cross-validate the OLS estimator on the training data with cv=5; one score per split is returned.
cross_val_score(ols, X_train, y_train, cv=5)

In [ ]:
from sklearn.model_selection import ShuffleSplit

# Five random re-splits of the training data, each holding out 20%, with a fixed seed.
shuffsplit = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# One score per re-split for the OLS estimator.
olscv_score = cross_val_score(
    estimator=ols,
    X=X_train,
    y=y_train,
    cv=shuffsplit,
)
olscv_score

### Cheating on the Test


In [ ]:
# Average the per-split cross-validation scores into a single summary value.
olscv_score.mean()

In [ ]:
# Score the fitted OLS model on the held-out test partition.
ols.score(X_test, y_test)

## Regularization via Ridge Regression 


In [ ]:
from sklearn.linear_model import Ridge

# Rebuild predictors and target from the combined frame (same columns as the OLS section).
X = vdem_fh_df[
    [
        'v2x_polyarchy',
        'v2x_libdem',
        'v2x_partipdem',
        'v2x_delibdem',
        'v2x_egaldem',
    ]
]
y = vdem_fh_df[['Total Score']]

# New train/test partition and a 15-split shuffle splitter, both seeded with 2.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=2,
)
shuffsplit = ShuffleSplit(n_splits=15, test_size=0.2, random_state=2)

# Ridge regression with alpha = 1, evaluated by cross-validation on the training data.
ridgereg = Ridge(alpha=1)
ridgecv_score = cross_val_score(
    estimator=ridgereg,
    X=X_train,
    y=y_train,
    cv=shuffsplit,
)
print(ridgecv_score)
print(f"Mean: {ridgecv_score.mean()}")

## Regularization via Lasso Regression


In [ ]:
from sklearn.linear_model import Lasso

# Lasso regression with alpha = 1, evaluated on the splitter already in scope.
lassoreg = Lasso(alpha=1)
lassocv_score = cross_val_score(
    estimator=lassoreg,
    X=X_train,
    y=y_train,
    cv=shuffsplit,
)
print(lassocv_score)
print(f"Mean: {lassocv_score.mean()}")

In [ ]:
# Candidate regularization strengths: 50 evenly spaced values from 0.01 to 2.
alphas = np.linspace(0.01, 2, 50)

# Mean cross-validated score for each alpha, positionally aligned with `alphas`.
ridge_r2s = []
lasso_r2s = []

# Unregularized baseline evaluated with the same splitter, for comparison.
olscv_score = cross_val_score(LinearRegression(), X_train, y_train, cv=shuffsplit)

for alpha in alphas:
    new_ridge = Ridge(alpha=alpha)
    ridge_r2s.append(cross_val_score(new_ridge, X_train, y_train, cv=shuffsplit).mean())

    # No explicit fit() before scoring: cross_val_score fits its own clone of
    # the estimator on every split, so a prior fit would only be wasted work.
    new_lasso = Lasso(alpha=alpha)
    lasso_r2s.append(cross_val_score(new_lasso, X_train, y_train, cv=shuffsplit).mean())

# One row per alpha, with the mean score of each model in its own column.
r2s = pd.DataFrame(
    {
        "alpha": alphas,
        "Ridge Regression": ridge_r2s,
        "Lasso Regression": lasso_r2s,
    }
)


In [ ]:
fig, ax = plt.subplots()

# One curve per regularized model, both drawn on the axes created above.
sns.lineplot(
    data=r2s, x="alpha", y="Ridge Regression",
    label="Ridge", linestyle='solid', ax=ax,
)
sns.lineplot(
    data=r2s, x="alpha", y="Lasso Regression",
    label="Lasso", linestyle='dashed', ax=ax,
)

# Horizontal reference line at the mean OLS cross-validation score.
ax.axhline(olscv_score.mean(), label="OLS", linestyle='dotted', color="darkgray")

ax.set_xlabel('alpha values for Ridge and Lasso Regressions')
ax.set_ylabel('R2')
sns.despine()
ax.legend()
plt.show()

In [ ]:
# Pick the alpha whose ridge model had the highest mean cross-validation score
# (the first such alpha if several tie).
top_ridge_r2 = max(ridge_r2s)
best_alpha = alphas[ridge_r2s.index(top_ridge_r2)]
best_alpha

In [ ]:
# Refit a ridge model on the full training partition using the selected alpha.
best_ridgereg = Ridge(alpha=best_alpha)
best_ridgereg.fit(X_train, y_train)

# Single-column table: the intercept first, then one coefficient per predictor.
pd.DataFrame(
    np.concatenate(
        (
            np.ravel(best_ridgereg.intercept_),
            np.ravel(best_ridgereg.coef_),
        )
    ),
    index=['Intercept', *X_test.columns],
)

In [ ]:
# Score the tuned ridge model on the held-out test partition.
best_ridgereg.score(X_test, y_test)

## Model Interpretation


In [ ]:
# Predict the target for the test rows and round each prediction to the nearest whole number.
predictions = np.round(best_ridgereg.predict(X_test))

In [ ]:
# Display the predictions flattened to one dimension.
np.ravel(predictions)

In [ ]:
# Side-by-side comparison for the test rows: observed score, rounded prediction,
# and the country looked up in the combined frame by the test rows' index.
preds = pd.DataFrame(
    {
        "Total Score": y_test['Total Score'],
        "Predicted Score": np.ravel(predictions),
        "Country": vdem_fh_df.loc[y_test.index, 'Country'],
    }
)
preds

# CLASSIFICATION WITH LOGISTIC REGRESSION


In [ ]:
# Binary target: 0 where v2x_regime is at most 1, and 1 otherwise.
y = np.where(vdem_df["v2x_regime"] <= 1, 0, 1)

# Predictors: the five `v2sm*_osp` columns listed below.
X = vdem_df[
    [
        "v2smgovdom_osp",
        "v2smgovfilprc_osp",
        "v2smgovsmcenprc_osp",
        "v2smonper_osp",
        "v2smarrest_osp",
    ]
]

In [ ]:
from sklearn.linear_model import LogisticRegression

# Seeded train/test partition and a five-split shuffle splitter for the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=23,
)
shuffsplit = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Despite its name, `log_reg` holds the per-split cross-validation scores, not a model.
log_reg = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=shuffsplit,
)
print(log_reg)
print(f"Mean: {log_reg.mean()}")

In [ ]:

# Same cross-validation as the default classifier, but with C=0.5 passed to
# LogisticRegression; the result is the array of per-split scores.
log_reg_regularized = cross_val_score(
    estimator=LogisticRegression(C=0.5),
    X=X_train,
    y=y_train,
    cv=shuffsplit,
)
print(log_reg_regularized)
print(f"Mean: {log_reg_regularized.mean()}")

# CONCLUSION
## Key Points 
