### Predicting Engagement of an Enducational Video 
This project demonstrates a Machine Learning pipeline for classifying if an educational video will be engaging to users or not. 
The pipleline includes stages of: 
- model selection, 
- training/validation/test data split, 
- parameter tuning based on a predefined evaluation metric, 
- applying to new data, 
- returning the probability of engagements for each unlabeled video. 

In [81]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score


### The Data
- There are 2 different files: the labeled data will be used for training /validating the model performance; Then the best model will be selected to predict engagement probability for the unlabeled data 
- Each row represents a single video 
- Target column 'engagement' has boolean value (True /False)

In [82]:
train_df = pd.read_csv('data/ucl_edu_vid_labeled.csv')
new_data = pd.read_csv('data/ucl_edu_vid_unlabeled.csv')
print('Data for training model', train_df.shape, '\nUnlabeled data for prediction', new_data.shape)

train_df.head()

Data for training model (9239, 10) 
Unlabeled data for prediction (2309, 9)


Unnamed: 0,id,title_word_count,document_entropy,freshness,easiness,fraction_stopword_presence,normalization_rate,speaker_speed,silent_period_rate,engagement
0,1,9,7.753995,16310,75.583936,0.553664,0.034049,2.997753,0.0,True
1,2,6,8.305269,15410,86.870523,0.584498,0.018763,2.635789,0.0,False
2,3,3,7.965583,15680,81.915968,0.605685,0.03072,2.538095,0.0,False
3,4,9,8.142877,15610,80.148937,0.593664,0.016873,2.259055,0.0,False
4,5,9,8.16125,14920,76.907549,0.581637,0.023412,2.42,0.0,False


### The ML Pipeline
Definie a **EngagementPredictor** class that encapsulates the machine learning pipeline, including: 
- data processing
- model training 
- model selection based on evaluation metric ('AUC' is used here as an example)
- make predictions on new data 

In [83]:
class EngagementPredictor:

    def __init__(self, train_df: pd.DataFrame, target_col_name: str, columns_to_drop=None, classifiers=None):
        self.train_df = train_df
        self.target = target_col_name
        self.columns_to_drop = columns_to_drop if columns_to_drop else []
        self.classifiers = classifiers if classifiers else ['svc', 'linear_svc', 'logistic_regression']
    
    def preprocess_data(self):

        # Ensure the target and specified columns are not in the features for training
        X = self.train_df.drop([self.target] + self.columns_to_drop, axis=1)

        # Assign the target variable to y
        y = self.train_df[self.target]

        # Scale all features so they're in the same range
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)

        return X_scaled, y 


    def train_model(self, X_train, y_train, X_val, y_val, classifier: str, optimization_metric: str):
        if classifier == 'svc':
            clf = SVC(kernel='rbf', probability=True)
            grid_values = {'gamma': [0.001, 0.01, 0.1, 1], 'C': [0.01, 0.1, 1, 10]}

        elif classifier == 'linear_svc':
            clf = LinearSVC()
            grid_values = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

        elif classifier == 'logistic_regression':
            clf = LogisticRegression(solver='liblinear')
            grid_values = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
        else:
            raise ValueError('Unsupported classifier, please choose from svc, linear_svc, or logistic_regression')
        
        # Grid search to find the best hyperparameters for the model
        grid_search = GridSearchCV(clf, param_grid=grid_values, scoring=optimization_metric, n_jobs=-1) # use all processors for parallelism
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_score = float(grid_search.best_score_) # convert numpy float to Python float

        print(f'\nTraining with {classifier}')
        print(f'Validation set {optimization_metric} for {classifier}: {best_score:.3f}')
        print(f'Best hyperparameters: {grid_search.best_params_}')

        return best_model, best_score  # return the best model
    
    def model_selection(self, X_train_scaled, y_train, X_val_scaled, y_val, optimization_metric: str):
        results = []
        for clf in self.classifiers:
            model, score = self.train_model(X_train_scaled, y_train, X_val_scaled, y_val, classifier=clf, optimization_metric= optimization_metric)
            results.append((model, score))
        
        # Sort the results by the optimization metric
        results.sort(key=lambda x: x[1], reverse=True)
        self.best_model = results[0][0]
        print(f'\nBest model selected based on {optimization_metric}: {self.best_model}')

    def best_model_predict(self, unlabeled_X):
        # Drop the same column as the training data
        unlabeled_X_copy = unlabeled_X.drop(self.columns_to_drop, axis=1) 
        # Scale the features
        scaler = MinMaxScaler()
        unlabled_X_scaled = scaler.fit_transform(unlabeled_X_copy)

        # Predict the probability of engagement
        y_pred = self.best_model.predict_proba(unlabled_X_scaled)[:, 1]

        # Combine the predictions with the original data
        unlabeled_X['engagement probability'] = y_pred
        
        return unlabeled_X
        
    



In [84]:
# after reviewing the data, I determined that the 'id' column should not be used for training the model
# Include these classifiers in the model selection
classifiers = ['svc', 'linear_svc', 'logistic_regression']
c = EngagementPredictor(train_df=train_df, target_col_name='engagement', columns_to_drop=['id'], classifiers=classifiers)

# This preprocessing took care of scaling of the features 
X_scaled, y = c.preprocess_data()

# Use train /test split to reserve 20% of the holdout data for validation
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the model and select the best model based on roc_auc
c.model_selection(X_train, y_train, X_val, y_val, optimization_metric='roc_auc')





Training with svc
Validation set roc_auc for svc: 0.854
Best hyperparameters: {'C': 10, 'gamma': 0.1}

Training with linear_svc
Validation set roc_auc for linear_svc: 0.847
Best hyperparameters: {'C': 1}

Training with logistic_regression
Validation set roc_auc for logistic_regression: 0.845
Best hyperparameters: {'C': 100}

Best model selected based on roc_auc: SVC(C=10, gamma=0.1, probability=True)


In [85]:
predictions_for_new_data = c.best_model_predict(unlabeled_X=new_data)

# Reviewing only the video id and output probability
predictions_for_new_data[['id', 'engagement probability']].head()

Unnamed: 0,id,engagement probability
0,9240,0.001151
1,9241,0.048772
2,9242,0.025342
3,9243,0.974568
4,9244,0.009322
