# Predicting and Understanding Viewer Engagement

## Imports

In [26]:
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(0)

In [27]:
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Training Set EDA

In [28]:
trainSet = pd.read_csv("assets/train.csv")

In [29]:
trainSet.head(3)

Unnamed: 0,id,title_word_count,document_entropy,freshness,easiness,fraction_stopword_presence,normalization_rate,speaker_speed,silent_period_rate,engagement
0,1,9,7.753995,16310,75.583936,0.553664,0.034049,2.997753,0.0,True
1,2,6,8.305269,15410,86.870523,0.584498,0.018763,2.635789,0.0,False
2,3,3,7.965583,15680,81.915968,0.605685,0.03072,2.538095,0.0,False


## Model Design

In [23]:
def engagement_model():
    trainSet = pd.read_csv("assets/train.csv")
    testSet = pd.read_csv("assets/test.csv")

    X = trainSet[[
            "title_word_count",
            "document_entropy",
            "freshness",
            "easiness",
            "fraction_stopword_presence",
            "speaker_speed",
            "silent_period_rate",]].values
    y = trainSet["engagement"].values

    X_train, X_val, y_train, y_val = train_test_split(X, y)

    X_test = testSet[[
            "title_word_count",
            "document_entropy",
            "freshness",
            "easiness",
            "fraction_stopword_presence",
            "speaker_speed",
            "silent_period_rate",]].values

    grid = GridSearchCV(
        RandomForestClassifier(),
        param_grid={
        "max_depth": [2, 3, 4, 5, 6, 7],
        "max_features": [3, 4, 5, 6, 7],
        "n_estimators": [10, 20, 30],
        },
        n_jobs=-1,
        scoring="roc_auc",
    )
    grid.fit(X, y)

    print(grid.best_params_)
    print(grid.best_score_)

    gridPred = grid.predict_proba(X_val)[:,1]
    gridPred = grid.predict_proba(X_test)[:, 1]

    forest = RandomForestClassifier(max_depth=7, 
                                    max_features=4, 
                                    n_estimators=30,
                                    random_state=42).fit(X, y)

    print(roc_auc_score(y_val, forest.predict_proba(X_val)[:, 1]))

    forestPred = forest.predict_proba(X_test)[:, 1]

    return pd.Series(index=testSet.id, data=forestPred, name="engagement")

## Model Output

In [24]:
engagement_model()

{'max_depth': 7, 'max_features': 4, 'n_estimators': 30}
0.8840865972264315
0.9354988704499514


id
9240     0.021487
9241     0.038760
9242     0.070973
9243     0.966547
9244     0.024444
           ...   
11544    0.022452
11545    0.010717
11546    0.011409
11547    0.933201
11548    0.017443
Name: engagement, Length: 2309, dtype: float64

## Output Checking

In [20]:
ans = engagement_model()
assert isinstance(ans, pd.Series), "Your function should return a pd.Series. "
assert len(ans) == 2309, "Your series is of incorrect length: expected 2309 "
assert np.issubdtype(
    ans.index.dtype, np.integer
), "Your answer pd.Series should have an index of integer type representing video id."

0.9441119698834662
