In [91]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import f1_score

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [92]:
df = pd.read_csv("./data/clean.csv")

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5121 entries, 0 to 5120
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         5121 non-null   int64  
 1   video_id           5121 non-null   object 
 2   title              5121 non-null   object 
 3   published_at       5121 non-null   object 
 4   channel_id         5121 non-null   object 
 5   channel_title      5121 non-null   object 
 6   category_id        5121 non-null   int64  
 7   trending_date      5121 non-null   object 
 8   view_count         5121 non-null   int64  
 9   likes              5121 non-null   int64  
 10  comment_count      5121 non-null   int64  
 11  comments_disabled  5121 non-null   bool   
 12  description        5121 non-null   object 
 13  notes              5121 non-null   int64  
 14  age                5121 non-null   int64  
 15  temperature        5121 non-null   float64
dtypes: bool(1), float64(1), 

In [94]:
# train-test 70-30 split
df_train, df_test = train_test_split(df, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 100)

# rescale the features
scaler = MinMaxScaler()

# apply scaler() to all the numeric columns 
numeric_vars = ['temperature', 'age', 'comment_count', 'likes', 'view_count']
df_train[numeric_vars] = scaler.fit_transform(df_train[numeric_vars])
df_test[numeric_vars] = scaler.fit_transform(df_test[numeric_vars])

## Metrics for evaluating regression model

In [95]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np

def run_experiment(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Evaluation of model {}:".format(model))
    print("R^2 : ", r2_score(y_test, y_pred))
    print("MAE :", mean_absolute_error(y_test,y_pred))
    print("RMSE:",np.sqrt(mean_squared_error(y_test, y_pred)))
    print("\n")

## Predictive analysis of number of views for YouTube’s trending videos

In [102]:
x_col = ['comment_count', 'likes']
y_col = ['view_count']

In [103]:
# split into X and y for both train and test sets
# reshaping is required since sklearn requires the data to be in shape
# (n, 1), not as a series of shape (n, )
X_train = np.array(df_train[x_col]).reshape(-1, len(x_col))
y_train = np.array(df_train[y_col]).reshape(-1, 1)

X_test = np.array(df_test[x_col]).reshape(-1, len(x_col))
y_test = np.array(df_test[y_col]).reshape(-1, 1)

(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((3584, 3), (3584, 1)), ((1537, 3), (1537, 1)))

In [104]:
run_experiment(LinearRegression(), X_train, y_train, X_test, y_test)
run_experiment(SGDRegressor(), X_train, y_train, X_test, y_test)

Evaluation of model LinearRegression():
R^2 :  0.5207380784273559
MAE : 0.013296422125119639
RMSE: 0.04440173172787917


Evaluation of model SGDRegressor():
R^2 :  -0.040691217701769755
MAE : 0.019389106363992072
RMSE: 0.06542961893773487


