# Comedy Bang Bang Podcast Best Of - Logistic Regression Model

In [None]:
# Define some exclusions for PEP8 that don't apply when the Jupyter Notebook
#   is exported to .py file
# pylint: disable=pointless-statement
# pylint: disable=fixme
# pylint: disable=expression-not-assigned
# pylint: disable=missing-module-docstring
# pylint: disable=invalid-name

import os

import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

# Setting custom parameters for this model

In [None]:
# Fraction of the training data held out as the test split
# (passed to train_test_split as test_size).
TEST_SIZE = 0.33

# Random seed passed to train_test_split as random_state.
RANDOM_STATE = 42069

# the name for the column that indicates the label/target
# the training CSV file should have headers
LABEL_COLUMN_NAME = "label"

# Currently disabled; nothing in the visible cells reads this setting.
# acceptable values: accuracy, f1, precision, recall
# METRIC_TO_MEASURE = "accuracy"

# Number of rows shown when the final predictions are ranked by probability.
# NOTE(review): the name says 2024, but the data files and headings in this
# notebook refer to 2023 -- confirm which year is intended.
NUM_EPISODES_IN_2024_BEST_OF = 16

# Data import and validation

These code blocks load the prepared data from a CSV file, and then run a variety of tests to validate the values are within expected ranges. This ensures the data will produce useful models.

In [None]:
# The prepared training data lives in ./data relative to the working directory.
train_filename = os.path.join(
    os.getcwd(),
    "data",
    "Comedy_bang_bang_podcast_dataset_2023-final_train.csv",
)

# The data prep script writes the index as the first column and the column
# names as the first row, so both are read back explicitly.
df = pd.read_csv(train_filename, header=0, index_col=0)

# Preview the first few rows.
df.head()

Display the features and their datatypes

In [None]:
# Show the dtype of every column in the training DataFrame.
df.dtypes

### Data type validation
Verify that only numeric and boolean features are present. All strings should have been One Hot Encoded.

In [None]:
# Verify that only numeric and boolean datatypes are left; there should not be
# any strings remaining at this point.
#
# An explicit raise is used instead of assert so that the check still runs
# when the exported .py file is executed with `python -O`, which strips
# assert statements.
#
# NOTE: keep this a tuple (not a set). The membership test relies on the
# dtype == string comparison, which a hash-based lookup would bypass.
allowed_dtypes = ("float64", "bool", "int64")

for index, value in df.dtypes.items():
    if value not in allowed_dtypes:
        raise TypeError(
            f"Column name {index} is not numeric or boolean- found {value}. All features at this point should be numeric or boolean. Exiting."
        )

print("Feature datatype check passed.")

Verify the episode number and year ranges. The first year of Comedy Bang Bang did not feature a Best Of list, so episodes 1-33 should be removed. The model should not be training on episodes eligible for 2023.

This block should output:

```Episode range of df: [35, 785]```

```Year elligible range of df: [2010, 2022]```

In [None]:
# Episode numbers are taken from the DataFrame index.
episode_numbers = df.index
df_episode_num_min, df_episode_num_max = min(episode_numbers), max(episode_numbers)
print(f"Episode range of df: [{df_episode_num_min}, {df_episode_num_max}]")

# Earliest and latest values found in the year_elligible_for_best_of column.
years_elligible = df['year_elligible_for_best_of']
df_year_elligible_min, df_year_elligible_max = (
    min(years_elligible),
    max(years_elligible),
)

print(f"Year elligible range of df: [{df_year_elligible_min}, {df_year_elligible_max}]")


# Modeling

### Split up the features and label for model training

In [None]:
# Separate the feature columns from the label column.
X = df.drop(columns=[LABEL_COLUMN_NAME])
y = df[LABEL_COLUMN_NAME]

# Hold out TEST_SIZE of the rows for evaluation, seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Summarize the size of the feature matrix and list the feature names.
num_examples, num_features = X.shape
print(f"Number of examples: {num_examples}")
print(f"\nNumber of Features:{num_features}")
print(list(X.columns))


Define a function that builds, runs, and evaluates a model given hyperparameters.

In [None]:
def train_test_LR(X_train, y_train, X_test, y_test, c=1):
    '''
    Fit a Logistic Regression classifier to the training data X_train, y_train.
    Return the loss and accuracy of resulting predictions on the test set.

    Parameters:
        X_train = Features used to fit the model.
        y_train = Labels used to fit the model.
        X_test = Features the fitted model is evaluated on.
        y_test = Ground truth labels for X_test.
        c = Value passed to LogisticRegression as C, which controls how much
            regularization is applied to the model. In scikit-learn C is the
            inverse of the regularization strength, so smaller values mean
            stronger regularization.

    Returns:
        (l_loss, acc_score) = The log loss of the predicted probabilities and
            the accuracy of the predicted class labels, both measured on the
            test set.
    '''
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train)

    # Pass the classes the model was fit on so the probability columns are
    # matched to labels explicitly. Without this, log_loss infers the labels
    # from y_test and raises if the test split contains only one class.
    probability_predictions = model.predict_proba(X_test)
    l_loss = log_loss(y_test, probability_predictions, labels=model.classes_)

    class_label_predictions = model.predict(X_test)
    acc_score = accuracy_score(y_test, class_label_predictions)

    return l_loss, acc_score

# Train on different hyperparameter values

### Define ranges of hyperparameter values

In [None]:
# Candidate C values: powers of ten from 10^-10 up to 10^9.
cs = [10**exponent for exponent in range(-10, 10)]

# Train and evaluate one model per candidate; each result is (log loss, accuracy).
sweep_results = [
    train_test_LR(X_train, y_train, X_test, y_test, c=candidate_c)
    for candidate_c in cs
]

# Split the result pairs into parallel lists aligned with cs.
ll_cs = [result[0] for result in sweep_results]
acc_cs = [result[1] for result in sweep_results]

In [None]:
plt.figure(figsize=(15, 5))

# One bar per candidate C value, labelled with its power of ten.
ax = sns.barplot(x=cs, y=ll_cs)
tick_labels = [f'10^{exponent}' for exponent in range(-10, 10)]
# Assigning the return values to g keeps them from being echoed by the notebook.
g = ax.set_xticklabels(tick_labels)
ax.set_xlabel('Regularization HyperParameter: C')
ax.set_ylabel('Log Loss')
g = ax.set_title('Log Loss Test Performance by Regularization Weight C')

# Locate the C value with the lowest log loss.
min_loss_index = np.argmin(ll_cs)
c_value_at_min_log_loss = cs[min_loss_index]
log_loss_min_value = min(ll_cs)
print(f'Log loss is minimized at C = {c_value_at_min_log_loss} where log loss = {log_loss_min_value}')

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)

# Plot against log10(C) so the powers of ten are evenly spaced on the x axis.
x = np.log10(cs)
sns.lineplot(x=x, y=acc_cs, marker='o', ax=ax)

ax.set_title("Accuracy Test Performance by Regularization Weight C")
ax.set_xlabel("Log10 of Regularization HyperParameter: C")
ax.set_ylabel("Accuracy")
plt.show()

# Locate the C value with the highest accuracy (first one wins on ties).
ind = np.argmax(acc_cs)
max_accuracy, c_value_at_max_acc = acc_cs[ind], cs[ind]
print(f'Accuracy is maximized at value {max_accuracy} , where C = {c_value_at_max_acc}')

# Testing the Optimal model against the 2023 Best Of List

In [None]:
# The prediction data set sits next to the training data in ./data.
predict_filename = os.path.join(
    os.getcwd(),
    "data",
    "Comedy_bang_bang_podcast_dataset_2023-final_predict.csv",
)

# Read it the same way as the training CSV: first column is the index,
# first row holds the column names.
df_predict = pd.read_csv(predict_filename, header=0, index_col=0)

# Preview the first few rows.
df_predict.head()


### Run the model against the 2023 episodes data set

In [None]:
# Split the prediction data set into its features and its ground truth labels.
X2024 = df_predict.drop(columns=[LABEL_COLUMN_NAME])
y2024 = df_predict[LABEL_COLUMN_NAME]

# Build a fresh model with the C value that maximized test accuracy, and fit
# it on ALL of the training data (nothing is held out as test data here).
optimal_model = LogisticRegression(C=c_value_at_max_acc).fit(X, y)

# Predict the binary category for every episode in the prediction data set.
class_label_predictions_2024 = optimal_model.predict(X2024)

# Score the predicted categories against the ground truth and report it.
acc_score_2024 = accuracy_score(y2024, class_label_predictions_2024)
print(acc_score_2024)

In [None]:
# Start the results DataFrame from the ground truth labels and attach the
# predicted class labels alongside them.
y2024_df = df_predict[LABEL_COLUMN_NAME].to_frame().assign(
    predict_logistic_regression_predict=class_label_predictions_2024
)

In [None]:
# Keep only the second predict_proba column: the probability [0,1] that the
# episode is likely to be on the best of.
probability_predictions_2024 = optimal_model.predict_proba(X2024)[:, 1]

# Wrap the probabilities in a DataFrame that shares the feature index, so the
# rows line up with the other results.
probability_predictions_2024_df = pd.DataFrame(
    {'prob': probability_predictions_2024},
    index=X2024.index,
)

In [None]:
# Combine labels, predictions and probabilities into a single DataFrame.
# The merge is keyed on the index of both frames, not on a column.
y2024_df = pd.merge(
    y2024_df,
    probability_predictions_2024_df,
    left_index=True,
    right_index=True,
)
y2024_df.head()

# Display the final predictions for which episodes will be on the 2023 Best of List

In [None]:
# Rank the episodes from most to least likely, then show the top of the list.
ranked_by_probability = y2024_df.sort_values(by='prob', ascending=False)
ranked_by_probability.head(NUM_EPISODES_IN_2024_BEST_OF)