# Lecture 5: Class demo

## Imports

In [87]:
# import the libraries
import os
import sys
sys.path.append(os.path.join(os.path.abspath(".."), (".."), "code"))
from plotting_functions import *
from utils import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

%matplotlib inline

# Show long text cells (song titles, comments) without truncating them.
pd.set_option("display.max_colwidth", 200)
# The data folder sits two directory levels above the working directory.
DATA_DIR = os.path.join(os.path.abspath(".."), "..", "data/")

## Incorporating text features in the Spotify dataset

Recall that we had dropped `song_title` feature when we worked with the Spotify dataset in Lab 1. 

Let's try to include it in our pipeline and examine whether we get better results. 

In [170]:
# Read the Spotify data; the first CSV column is used as the row index.
spotify_df = pd.read_csv(DATA_DIR + "spotify.csv", index_col=0)

# Separate the label column (`target`) from the feature columns.
y_spotify = spotify_df["target"]
X_spotify = spotify_df.drop(columns="target")

In [171]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_spotify, y_spotify, test_size=0.2, random_state=123
)

In [None]:
X_train.shape

In [None]:
X_train

In [None]:
X_train.columns

### Dummy model 

In [None]:
# Baseline model to compare the real classifiers against.
# (DummyClassifier is imported with the other sklearn imports at the top.)
results = {}  # model label -> cross-validation summary; later cells add entries
dummy_model = DummyClassifier()
# mean_std_cross_val_scores is defined in ../code/utils.py
results['dummy'] = mean_std_cross_val_scores(
    dummy_model, X_train, y_train, return_train_score=True
)
pd.DataFrame(results)

### Feature categorization

In [None]:
X_train.columns

In [None]:
X_train["key"].value_counts()

In [None]:
X_train["time_signature"].value_counts()

In [None]:
X_train["mode"].value_counts()

Let's look at the distribution of values in the `song_title` column. 

In [None]:
X_train["song_title"].value_counts()

- Most of the song titles are unique, which makes sense. 
- What would happen if we apply one-hot encoding to this feature? 
- How about encoding this as a text feature? 

In [None]:
X_train["artist"].value_counts()

In [101]:
# Columns that get scaled.
numeric_feats = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "valence",
]
# Columns that get one-hot encoded.
categorical_feats = ["time_signature", "key"]
# Columns handed to the model unchanged.
passthrough_feats = ["mode"]
# The artist column is kept in its own list so it can be encoded separately.
artist_cat_feat = ["artist"]
# The text column is a plain string, not a list.
text_feat = "song_title"

```{important}
Note that unlike the other feature types, we are defining `text_feat` as a string and not as a list, because `CountVectorizer` expects a 1D input (a single column of text) rather than a 2D one.
```

### Column transformer without `song_title` and `artist` features

In [102]:
# Preprocessing for the non-text columns only: scale the numeric features,
# pass `mode` through unchanged, and one-hot encode the categorical features.
# `song_title` and `artist` are not listed in this transformer.
preprocessor_no_text = make_column_transformer(
    (StandardScaler(), numeric_feats), 
    ("passthrough", passthrough_feats),     
    (OneHotEncoder(handle_unknown = "ignore"), categorical_feats),     
)

### Visualizing the transformed data 

In [None]:
transformed_no_text = preprocessor_no_text.fit_transform(X_train)
transformed_no_text.shape

In [None]:
preprocessor_no_text

In [None]:
ohe_feat_names = preprocessor_no_text.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
ohe_feat_names

In [106]:
feat_names = numeric_feats + passthrough_feats + ohe_feat_names

In [None]:
pd.DataFrame(transformed_no_text, columns=feat_names)

### Building models

In [None]:
# Candidate classifiers, all with default hyperparameters.
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
}

# Cross-validate every classifier behind the non-text preprocessor and store
# its summary under a "<model> (no_text)" label.
for clf_name, clf in models.items():
    pipe_model = make_pipeline(preprocessor_no_text, clf)
    results[f"{clf_name} (no_text)"] = mean_std_cross_val_scores(
        pipe_model,
        X_train,
        y_train,
        return_train_score=True,
    )
pd.DataFrame(results).T

<br><br>

### Incorporating "song_title" feature

Let's incorporate bag-of-words representation of "song_title" feature in our column transformer. 

In [None]:
numeric_feats

In [None]:
text_feat

In [111]:
# Same steps as the non-text transformer, plus a bag-of-words encoding of the
# song title. `text_feat` is a single column name (a string), not a list.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_feats), 
    ("passthrough", passthrough_feats),     
    (OneHotEncoder(handle_unknown = "ignore"), categorical_feats),     
    (CountVectorizer(stop_words="english"), text_feat)
)

In [112]:
# Transform the data
transformed = preprocessor.fit_transform(X_train)

In [None]:
preprocessor

In [114]:
# Get the vocabulary
vocab = preprocessor.named_transformers_['countvectorizer'].get_feature_names_out()

In [None]:
# Column names in the order the transformers were listed: numeric,
# passthrough, one-hot, then the bag-of-words vocabulary.
# NOTE(review): `ohe_feat_names` was read from `preprocessor_no_text`; this
# assumes the one-hot encoder inside `preprocessor` produced the same columns — confirm.
column_names = numeric_feats + passthrough_feats + ohe_feat_names + vocab.tolist()
len(column_names)

In [None]:
df = pd.DataFrame(transformed.toarray(), columns=column_names)
df

### Visualizing the vocabulary 

In [None]:
vocab[0:10]

In [None]:
vocab[500:510]

In [None]:
vocab[1800:1810]

In [None]:
vocab[0::100]

Let's find songs containing the word _earth_ in them. 

In [None]:
earth_index_vocab = np.where(vocab == "earth")[0][0]
earth_index_vocab

In [None]:
# Position of the "earth" column in the full transformed frame: the vocabulary
# columns come after the numeric, passthrough and one-hot columns, so their
# counts are added as an offset.
earth_index_in_df = len(numeric_feats) + len(passthrough_feats) + len(ohe_feat_names) + earth_index_vocab
earth_index_in_df

In [None]:
# Keep every row whose title contains "earth" at least once. CountVectorizer
# stores word counts, so a title that repeats the word has a count above 1 and
# would be missed by an `== 1` test.
earth_songs = df[df.iloc[:, earth_index_in_df] > 0]
# Show the "earth" column together with its neighbouring vocabulary columns.
earth_songs.iloc[:, earth_index_in_df - 2 : earth_index_in_df + 2]

In [None]:
earth_songs.index

In [None]:
X_train.iloc[earth_songs.index]["song_title"]

### Model building 

In [None]:
# The same three default classifiers, now evaluated behind the preprocessor
# that also vectorizes `song_title`.
models = dict(
    [
        ("Decision Tree", DecisionTreeClassifier()),
        ("KNN", KNeighborsClassifier()),
        ("SVM", SVC()),
    ]
)

for clf_name, clf in models.items():
    pipe_model = make_pipeline(preprocessor, clf)
    cv_summary = mean_std_cross_val_scores(
        pipe_model, X_train, y_train, return_train_score=True
    )
    results[clf_name + " (text)"] = cv_summary
pd.DataFrame(results).T

- Not a big difference in the results. 
- Seems like there is more overfitting when we included the `song_title` feature.
- The training score of SVC is much higher when we include all features. Hyperparameter optimization of `C` and `gamma` may help. 

- What about the `artist` column?
- Does it make sense to apply BOW encoding to it? 
- Let's look at the distribution of values in the `artist` column. 

In [None]:
X_train['artist'].value_counts()

In [None]:
most_frequent = X_train["artist"].value_counts().iloc[:15]
most_frequent

In [129]:
# Adds the `artist` column to the preprocessing.
# The artist encoder is given a fixed category list, the index values of
# `most_frequent`, instead of learning categories from the data, and it
# ignores unknown categories. The song-title vocabulary is capped at 100 features.
preprocessor_artist = make_column_transformer(
    (StandardScaler(), numeric_feats), 
    ("passthrough", passthrough_feats),     
    (OneHotEncoder(handle_unknown = "ignore"), categorical_feats),
    (OneHotEncoder(dtype=int, handle_unknown="ignore", categories=[most_frequent.index.values]), artist_cat_feat),
    (CountVectorizer(max_features = 100, stop_words="english"), text_feat)
)

In [None]:
# Default classifiers, evaluated behind the preprocessor that uses every
# feature group (numeric, passthrough, categorical, artist, song title).
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
}

for label, estimator in models.items():
    pipe_model = make_pipeline(preprocessor_artist, estimator)
    results["{} (all)".format(label)] = mean_std_cross_val_scores(
        pipe_model, X_train, y_train, return_train_score=True
    )
pd.DataFrame(results).T

A tiny bit of improvement in the mean CV scores, but we are still overfitting. 

<br><br><br><br>

## (Optional) Incorporating text features in the restaurant survey dataset

Do you recall [the restaurants survey](https://ubc.ca1.qualtrics.com/jfe/form/SV_73VuZiuwM1eDVrw) you completed at the start of the course?

Let's use that data for this demo. You'll find a [wrangled version](https://github.ubc.ca/MDS-2023-24/DSCI_571_sup-learn-1_students/blob/master/lectures/data/cleaned_restaurant_data.csv) in the course repository.

In [131]:
df = pd.read_csv(DATA_DIR + 'cleaned_restaurant_data.csv')

In [None]:
df

In [None]:
df.describe()

Are there any unusual values in this data that you notice?
Let's get rid of these outliers. 

In [None]:
# Thresholds for plausible survey answers.
upperbound_price = 200
lowerbound_people = 1

# Flag rows that break either threshold and filter them out in one step.
# The raw `df` is left untouched, so re-running this cell gives the same result.
is_outlier = (df['price'] > upperbound_price) | (df['n_people'] < lowerbound_people)
restaurant_df = df[~is_outlier]
restaurant_df.shape

In [None]:
restaurant_df.describe()

### Data splitting 

We aim to predict whether a restaurant is liked or disliked.

In [136]:
# The label is the `target` column; the features are every other column.
y = restaurant_df['target']
X = restaurant_df.drop(columns='target')

Below I'm perturbing this data just to demonstrate a few concepts. Don't do it in real life. 

In [137]:
# Demo-only perturbation of the features.
# Set `food_type` to 'Quebecois' for the row with index label 459.
X.at[459, 'food_type'] = 'Quebecois'
# Multiply every price by 100. Re-running this cell on its own multiplies the
# prices again, so re-run the cell that creates `X` first.
X['price'] = X['price'] * 100

In [138]:
# Split the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

<br><br>

### EDA 

In [None]:
X_train.hist(bins=20, figsize=(12, 8));

Do you see anything interesting in these plots? 

In [None]:
X_train['food_type'].value_counts()

Error in data collection? Probably "Fusion" and "fusion" categories should be combined?

In [141]:
# Fold the lowercase "fusion" spelling into "Fusion" in both splits.
for split in (X_train, X_test):
    split['food_type'] = split['food_type'].replace("fusion", "Fusion")

In [None]:
X_train['food_type'].value_counts()

Again, usually we should spend lots of time in EDA, but let's stop here so that we have time to learn about transformers and pipelines.   

<br><br>

### Dummy Classifier

In [None]:
# Baseline for the restaurant data. DummyClassifier is already imported
# earlier in the notebook, so the import is not repeated here.
# Despite its name, `results_df` is a dict: model label -> cross-validation summary.
results_df = {}
dummy = DummyClassifier()
results_df['dummy'] = mean_std_cross_val_scores(dummy, X_train, y_train, return_train_score=True)
pd.DataFrame(results_df)

We have a relatively balanced distribution of both 'like' and 'dislike' classes.

<br><br>

### Preprocessing

How can we horizontally stack  
- preprocessed numeric features, 
- preprocessed binary features, 
- preprocessed ordinal features, and 
- preprocessed categorical features?

Let's define a column transformer. 

In [144]:
# Continuous, quantitative features.
numeric_feats = ["age", "n_people", "price"]
# Discrete, qualitative features.
categorical_feats = ["north_america", "food_type"]
# Categorical features with only two possible values.
binary_feats = ["good_server"]
# Features whose categories have a natural order.
ordinal_feats = ["noise_level"]
noise_cats = [
    "no music",
    "low",
    "medium",
    "high",
    "crazy loud",
]
# Text features and `eat_out_freq` (not that useful) are dropped.
drop_feats = ["comments", "restaurant_name", "eat_out_freq"]

In [None]:
X_train['noise_level'].value_counts()

In [146]:
# Ordered categories for `noise_level`; this is the list handed to
# OrdinalEncoder below. It holds the same values as `noise_cats`.
noise_levels = ["no music", "low", "medium", "high", "crazy loud"]

In [147]:
# All classes used here are imported in the import cell at the top of the notebook.

# Every transformer fills in missing values first, then scales or encodes.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
binary_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    # Dense output, like the categorical encoder below, so the combined result
    # is always a plain array that pd.DataFrame can take directly.
    OneHotEncoder(drop="if_binary", sparse_output=False),
)
ordinal_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    # The explicit category list fixes the order of the noise levels.
    OrdinalEncoder(categories=[noise_levels]),
)
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)

# Apply each transformer to its own group of columns; the text columns and
# `eat_out_freq` are dropped.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (binary_transformer, binary_feats),
    (ordinal_transformer, ordinal_feats),
    (categorical_transformer, categorical_feats),
    ("drop", drop_feats),
)


What does the transformed data look like? 

In [None]:
transformed = preprocessor.fit_transform(X_train)
transformed.shape

In [None]:
preprocessor

In [None]:
# Getting feature names from a column transformer
ohe_feat_names = preprocessor.named_transformers_['pipeline-4']['onehotencoder'].get_feature_names_out(categorical_feats).tolist()
ohe_feat_names

In [None]:
numeric_feats

In [152]:
# Same order as the transformers in `preprocessor`: numeric, binary, ordinal, one-hot.
# NOTE(review): assumes the binary and ordinal transformers each produce exactly
# one output column per input column — confirm against `transformed.shape`.
feat_names = numeric_feats + binary_feats + ordinal_feats + ohe_feat_names

In [None]:
transformed

In [None]:
pd.DataFrame(transformed, columns = feat_names)

We have new columns for the categorical features. Let's create a pipeline with the preprocessor and SVC. 

In [None]:
# The three classifiers are already imported at the top of the notebook,
# so they are not re-imported here.
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC()    
}

# First reference point: use only the numeric columns, imputed and scaled.
for (name, model) in models.items():
    pipe_num_model = make_pipeline(SimpleImputer(strategy="median"), StandardScaler(), model)
    results_df[name +' (numeric-only)'] = mean_std_cross_val_scores(pipe_num_model, X_train[numeric_feats], y_train, return_train_score=True)
pd.DataFrame(results_df).T

In [None]:
# Cross-validate each model behind the preprocessor for the non-text features.
# The label starts with a space so rows read "SVM (non-text feats)", in line
# with the " (numeric-only)" rows.
for (name, model) in models.items():
    pipe_model = make_pipeline(preprocessor, model)
    results_df[name + ' (non-text feats)'] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)
pd.DataFrame(results_df).T

We are getting better results when we include the numeric, categorical, binary, and ordinal features. 
<br><br><br>

### Incorporating text features 

We haven't incorporated the comments feature into our pipeline yet, even though it holds significant value in indicating whether the restaurant was liked or not.

In [None]:
X_train

Let's create bag-of-words representation of the `comments` feature. But first we need to impute the rows where there are no comments. There is a small complication if we want to put `SimpleImputer` and `CountVectorizer` in a pipeline. 
- `SimpleImputer` takes a 2D array as input and produces a 2D array as output. 
- `CountVectorizer` takes a 1D array as input. 

To deal with this, we will use sklearn's `FunctionTransformer` to convert the 2D output of `SimpleImputer` into a 1D array which can be passed to `CountVectorizer` as input. 

In [None]:
# SimpleImputer returns a 2D array with one column, but CountVectorizer needs a
# 1D sequence of documents. np.ravel flattens the array and, unlike squeeze(),
# still returns a 1D array when there is only a single row. Being a named
# function rather than a lambda, it also keeps the pipeline picklable.
reshape_for_countvectorizer = FunctionTransformer(np.ravel, validate=False)
text_transformer = make_pipeline(
    # Rows without a comment get the placeholder text "missing".
    SimpleImputer(strategy="constant", fill_value="missing"),
    reshape_for_countvectorizer,
    CountVectorizer(stop_words="english"),
)
text_pipe = make_pipeline(text_transformer, SVC())
# Mean cross-validation score using only the `comments` column.
cross_val_score(text_pipe, X_train[['comments']], y_train).mean()

Pretty good scores just with text features! Let's examine the transformed data. 

In [159]:
transformed = text_transformer.fit_transform(X_train[['comments']], y_train)

In [None]:
transformed

It's a sparse matrix. Let's explore the vocabulary. 

In [None]:
vocab = text_transformer.named_steps["countvectorizer"].get_feature_names_out()
vocab[:10]

In [None]:
vocab[0:10]

In [None]:
vocab[200:210]

In [None]:
vocab[500:600]

In [None]:
vocab[0::20]

<br><br>

Do we get better scores if we combine all features? Let's define a column transformer which carries out 
- imputation and scaling on numeric features
- imputation and one-hot encoding with `drop="if_binary"` on binary features
- imputation and ordinal encoding on ordinal features
- imputation and one-hot encoding with `handle_unknown="ignore"` on categorical features
- imputation, reshaping, and bag-of-words transformation on the text feature

In [166]:
# CountVectorizer is already imported at the top of the notebook.
# Here the text feature is a list (not a string as in the Spotify section),
# because text_transformer starts with SimpleImputer, which takes 2D input.
text_feat = ['comments']

# `comments` is vectorized by text_transformer, so it is taken out of the
# columns listed under "drop"; only the remaining unused columns are dropped.
preprocessor_all = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (binary_transformer, binary_feats),
    (ordinal_transformer, ordinal_feats),
    (categorical_transformer, categorical_feats),
    (text_transformer, text_feat),
    ("drop", [col for col in drop_feats if col not in text_feat]),
)

In [None]:
preprocessor_all.fit_transform(X_train)

In [None]:
# Cross-validate each model on the bag-of-words of `comments` alone.
# The label starts with a space so rows read "SVM (text)", in line with the
# " (numeric-only)" rows.
for (name, model) in models.items():
    pipe_model = make_pipeline(text_transformer, model)
    results_df[name + ' (text)'] = mean_std_cross_val_scores(pipe_model, X_train[['comments']], y_train, return_train_score=True)
pd.DataFrame(results_df).T

In [None]:
# Cross-validate each model behind the preprocessor that combines every
# feature group, including the text feature.
# The label starts with a space so rows read "SVM (all)", in line with the
# " (numeric-only)" rows.
for (name, model) in models.items():
    pipe_model = make_pipeline(preprocessor_all, model)
    results_df[name + ' (all)'] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)
pd.DataFrame(results_df).T

Some improvement when we combine all features! 