# Lecture 7: Class demo

In [None]:
import os
import sys

sys.path.append("../code/.")

import IPython
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
from plotting_functions import *
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from utils import *

%matplotlib inline
pd.set_option("display.max_colwidth", 200)

## Demo: Model interpretation of linear classifiers

- One of the primary advantages of linear classifiers is that they are easy to interpret.
- For instance, by analyzing the sign and magnitude of the learned coefficients, we can address questions regarding which features are influencing the prediction and in which direction.

- We'll demonstrate this by training `LogisticRegression` on the famous [IMDB movie review](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) dataset. The dataset is a bit large for demonstration purposes. So I am going to put a big portion of it in the test split to speed things up. 

In [None]:
# Read the reviews, keep only rows whose label starts with "pos" or "neg",
# and drop the `Unnamed: 0`, `type`, and `file` columns.
imdb_df = (
    pd.read_csv("../data/imdb_master.csv", encoding="ISO-8859-1")
    .loc[lambda df: df["label"].str.startswith(("pos", "neg"))]
    .drop(columns=["Unnamed: 0", "type", "file"])
)
imdb_df.head()

Let's clean up the data a bit. 

In [None]:
import re


def replace_tags(doc):
    """Clean a raw review string.

    Replaces every literal ``<br />`` tag with a single space and removes
    every ``http://`` or ``https://`` URL (the scheme followed by the run of
    non-whitespace characters after it).

    Parameters
    ----------
    doc : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    doc = doc.replace("<br />", " ")
    # Raw string: "\S" inside a normal string literal is an invalid escape
    # sequence. `s?` also strips plain-http links, not only https ones.
    doc = re.sub(r"https?://\S*", "", doc)
    return doc

In [None]:
# Store the cleaned text in a new column; the raw `review` column is left untouched.
imdb_df["review_pp"] = imdb_df["review"].map(replace_tags)

Are we breaking the Golden rule here? 

Let's split the data and create bag of words representation. 

In [None]:
# Put 90% of the rows in the test split so the demo trains on a small subset.
train_df, test_df = train_test_split(imdb_df, test_size=0.9, random_state=123)

# Features are the cleaned review text; targets are the labels.
X_train = train_df["review_pp"]
y_train = train_df["label"]
X_test = test_df["review_pp"]
y_test = test_df["label"]

train_df.shape

Is there any missing data?

In [None]:
# Number of missing values in each column of the training split.
train_df.isnull().sum()

There is no missing data. We don't need imputation. 

In [None]:
# Let's try CountVectorizer
vec = None
bow = None
bow

### Examining the vocabulary

- The vocabulary (mapping from feature indices to actual words) can be obtained using `get_feature_names_out()` on the fitted `CountVectorizer` object. 

In [None]:
# vocab = vec.get_feature_names_out()

In [None]:
# vocab[0:10]  # first few words

In [None]:
# vocab[2000:2010]  # some middle words

In [None]:
# vocab[::500]  # words with a step of 500

### Model building on the dataset 

First let's try `DummyClassifier` on the dataset. 

In [None]:
# Baseline: cross-validate a DummyClassifier and report the mean fold score.
dummy = DummyClassifier()
dummy_cv_scores = cross_val_score(dummy, X_train, y_train)
dummy_cv_scores.mean()

We have a balanced dataset. So the `DummyClassifier` score is around 0.5. 

Now let's try logistic regression. 

In [None]:
# Create a pipeline with CountVectorizer and LogisticRegression 
# Placeholder to be filled in during the demo; the cross-validation lines below
# stay commented out until `pipe_lr` is a real pipeline.
pipe_lr = None
# scores = cross_validate(pipe_lr, X_train, y_train, return_train_score=True)
# pd.DataFrame(scores)

Seems like we are overfitting. Let's optimize the hyperparameter `C`. 

In [None]:
# Candidate values of C: 10^-3, 10^-2, ..., 10^2.
C_values = 10.0 ** np.arange(-3, 3)

# One entry per candidate C; the two score lists are filled inside the loop.
scores_dict = {"C": C_values, "mean_train_scores": [], "mean_cv_scores": []}

for C in C_values:
    # Placeholder to be filled in during the demo with a pipeline that uses this C.
    pipe_lr = None
    # scores = cross_validate(pipe_lr, X_train, y_train, return_train_score=True)
    # scores_dict["mean_train_scores"].append(scores["train_score"].mean())
    # scores_dict["mean_cv_scores"].append(scores["test_score"].mean())

# results_df = pd.DataFrame(scores_dict)
# results_df

In [None]:
# optimized_C = results_df["C"].iloc[np.argmax(results_df["mean_cv_scores"])]
# print(
#     "The maximum validation score is %0.3f at C = %0.2f "
#     % (
#         np.max(results_df["mean_cv_scores"]),
#         optimized_C,
#     )
# )

Let's train a model on the full training set with the optimized hyperparameter values. 

In [None]:
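# After the loop above, `pipe_lr` holds the pipeline for the last C tried,
# so rebuild it with the best C before fitting.
# pipe_lr = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000, C=optimized_C))
# pipe_lr.fit(X_train, y_train)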

### Examining learned coefficients 

- The learned coefficients are exposed by the `coef_` attribute of the [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) object. 

In [None]:
# feature_names = np.array(pipe_lr.named_steps["countvectorizer"].get_feature_names_out())
# coeffs = pipe_lr.named_steps["logisticregression"].coef_.flatten()

In [None]:
# feature_names

In [None]:
# word_coeff_df = pd.DataFrame(coeffs, index=feature_names, columns=["Coefficient"])
# word_coeff_df

- Let's sort the coefficients in descending order. 
- Interpretation
    - if $w_j > 0$ then increasing $x_{ij}$ moves us toward predicting $+1$. 
    - if $w_j < 0$ then increasing $x_{ij}$ moves us toward predicting $-1$. 


In [None]:
# word_coeff_df.sort_values(by="Coefficient", ascending=False)

- The coefficients make sense!

Let's visualize the top 20 features.

In [None]:
# mglearn.tools.visualize_coefficients(coeffs, feature_names, n_top_features=20)

Let's explore prediction of the following new review. 

In [None]:
# fake_review = "It got a bit boring at times but the direction was excellent and the acting was flawless. Overall I enjoyed the movie and I highly recommend it!"

In [None]:
# feat_vec = pipe_lr.named_steps["countvectorizer"].transform([fake_review])

In [None]:
# feat_vec

Let's get prediction probability scores of the fake review. 

In [None]:
# pipe_lr.predict_proba([fake_review])

In [None]:
# pipe_lr.classes_

The model is 83.5% confident that it's a positive review. 

In [None]:
# pipe_lr.predict([fake_review])[0]

We can find which of the vocabulary words are present in this review:

In [None]:
# feat_vec.toarray().ravel().astype(bool)

In [None]:
# words_in_ex = feat_vec.toarray().ravel().astype(bool)
# words_in_ex

How many of the words are in this review?

In [None]:
# np.sum(words_in_ex)

In [None]:
# np.array(feature_names)[words_in_ex]

In [None]:
# ex_df = pd.DataFrame(
#     data=coeffs[words_in_ex],
#     index=np.array(feature_names)[words_in_ex],
#     columns=["Coefficient"],
# )
# ex_df

Let's visualize how the words with positive and negative coefficients are driving the hard prediction. 

In [None]:
# mglearn.tools.visualize_coefficients(
#     coeffs[words_in_ex], np.array(feature_names)[words_in_ex], n_top_features=6
# )

In [None]:
# def plot_coeff_example(feat_vect, coeffs, feature_names):
#     words_in_ex = feat_vect.toarray().ravel().astype(bool)

#     ex_df = pd.DataFrame(
#         data=coeffs[words_in_ex],
#         index=np.array(feature_names)[words_in_ex],
#         columns=["Coefficient"],
#     )
#     return ex_df

### Most positive review 

- Remember that you can look at the probabilities (confidence) of the classifier's prediction using the `model.predict_proba` method.
- Can we find the reviews where our classifier is most confident or least confident?

In [None]:
# pos_probs = pipe_lr.predict_proba(X_train)[
#     :, 1
# ]  # only get probabilities associated with pos class
# pos_probs

Let's get the index of the example where the classifier is most confident (highest `predict_proba` score for positive). 

In [None]:
# most_positive = np.argmax(pos_probs)

In [None]:
# X_train.iloc[most_positive]

In [None]:
# print("True target: %s\n" % (y_train.iloc[most_positive]))
# print("Predicted target: %s\n" % (pipe_lr.predict(X_train.iloc[[most_positive]])[0]))
# print("Prediction probability: %0.4f" % (pos_probs[most_positive]))

Let's examine the features associated with the review. 

In [None]:
# feat_vec = pipe_lr.named_steps["countvectorizer"].transform(
#     X_train.iloc[[most_positive]]
# )
# words_in_ex = feat_vec.toarray().ravel().astype(bool)
# mglearn.tools.visualize_coefficients(
#     coeffs[words_in_ex], np.array(feature_names)[words_in_ex], n_top_features=20
# )

The review has both positive and negative words but the words with **positive** coefficients win in this case! 

### Most negative review 

In [None]:
# neg_probs = pipe_lr.predict_proba(X_train)[
#     :, 0
# ]  # only get probabilities associated with neg class
# neg_probs

In [None]:
# most_negative = np.argmax(neg_probs)

In [None]:
# print("Review: %s\n" % (X_train.iloc[most_negative]))
# print("True target: %s\n" % (y_train.iloc[most_negative]))
# print("Predicted target: %s\n" % (pipe_lr.predict(X_train.iloc[[most_negative]])[0]))
# print("Prediction probability: %0.4f" % (neg_probs[most_negative]))

In [None]:
# feat_vec = pipe_lr.named_steps["countvectorizer"].transform(
#     X_train.iloc[[most_negative]]
# )
# words_in_ex = feat_vec.toarray().ravel().astype(bool)
# mglearn.tools.visualize_coefficients(
#     coeffs[words_in_ex], np.array(feature_names)[words_in_ex], n_top_features=20
# )

The review has both positive and negative words but the words with negative coefficients win in this case! 

## ❓❓ Questions for you

#### Question for you to ponder on 

- Is it possible to identify most important features using $k$-NNs? What about decision trees?  
