In [5]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
df_yout = pd.read_csv("./data/clean_file.csv")

# **Predictive analysis of number of views for YouTubeâ€™s trending videos**

In [8]:
df_yout.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'video_id', 'title', 'publishedAt',
       'channelId', 'channelTitle', 'categoryId', 'trending_date',
       'view_count', 'likes', 'comment_count', 'comments_disabled',
       'description', 'age', 'rating_disabled', 'likes_log', 'views_log',
       'comment_log', 'publishing_hour', 'category_name'],
      dtype='object')

In [9]:
x_col = ['comment_count', 'likes', 'rating_disabled', 'age', 'publishing_hour']
y_col = ['view_count']

## **Metrics for evaluating regression model**

In [10]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np

def run_experiment(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Evaluation of model {}:".format(model))
    print("R^2 : ", r2_score(y_test, y_pred))
    print("MAE :", mean_absolute_error(y_test,y_pred))
    print("RMSE:",np.sqrt(mean_squared_error(y_test, y_pred)))
    print("\n")

## **Normalizing data**

In [11]:
# train-test 70-30 split
df_train, df_test = train_test_split(df_yout, 
                                     train_size = 0.7, 
                                     test_size = 0.3, 
                                     random_state = 100)

In [12]:
numeric_vars = ['age', 'comment_count', 'likes', 'view_count']

In [13]:
def min_max_scaler(df_train, df_test, numeric_vars):
    
    # rescale the features for Linear Regression
    scaler = MinMaxScaler()

    # apply scaler() to all the numeric columns 
    df_train[numeric_vars] = scaler.fit_transform(df_train[numeric_vars])
    df_test[numeric_vars] = scaler.fit_transform(df_test[numeric_vars])

    return df_train, df_test

In [14]:
def standard_scaler(df_train, df_test, numeric_vars):

    # rescale the features for Linear Regression
    scaler = StandardScaler()

    # apply scaler() to all the numeric columns 
    df_train[numeric_vars] = scaler.fit_transform(df_train[numeric_vars])
    df_test[numeric_vars] = scaler.fit_transform(df_test[numeric_vars])
    
    return df_train, df_test

In [15]:
df_train_min_max, df_test_min_max = min_max_scaler(df_train, df_test, numeric_vars)
df_train_standard, df_test_standard = standard_scaler(df_train, df_test, numeric_vars)

## **Train-Test-Split**

In [16]:
def split_dataset(df_train, df_test, x_col, y_col):

    X_train = np.array(df_train[x_col]).reshape(-1, len(x_col))
    y_train = np.array(df_train[y_col]).reshape(-1, 1)

    X_test = np.array(df_test[x_col]).reshape(-1, len(x_col))
    y_test = np.array(df_test[y_col]).reshape(-1, 1)

    print((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))
    return (X_train, y_train), (X_test, y_test)

In [17]:
(X_train_min_max, y_train_min_max), (X_test_min_max, y_test_min_max) = split_dataset(df_train_min_max, df_test_min_max, x_col, y_col)
(X_train_standard, y_train_standard), (X_test_standard, y_test_standard) = split_dataset(df_train_standard, df_test_standard, x_col, y_col)

((3056, 5), (3056, 1)) ((1310, 5), (1310, 1))
((3056, 5), (3056, 1)) ((1310, 5), (1310, 1))


## **Running Experiment**

In [29]:
run_experiment(LinearRegression(), X_train_min_max, y_train_min_max, X_test_min_max, y_test_min_max)
run_experiment(SGDRegressor(), X_train_standard, y_train_standard, X_test_standard, y_test_standard)

Evaluation of model LinearRegression():
R^2 :  0.8475076071557255
MAE : 0.19328418720224189
RMSE: 0.3905027437090226


Evaluation of model SGDRegressor():
R^2 :  0.8255901363047129
MAE : 0.1875049050035365
RMSE: 0.417624069822714


