In [16]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

In [3]:
#Step1: Load the data

df=pd.read_csv("XY_train.csv")
# df.head()

In [4]:
#Step 2: Data Preprocessing, train_test_split and Standardisation
print(df.isnull().sum())
print(df.dtypes)

Shares                                66
Comments added                        66
Likes (vs. dislikes) (%)              66
Average view duration                 66
Views                                 66
Subscribers                           66
Impressions click-through rate (%)    66
dtype: int64
Shares                                float64
Comments added                        float64
Likes (vs. dislikes) (%)              float64
Average view duration                  object
Views                                 float64
Subscribers                           float64
Impressions click-through rate (%)    float64
dtype: object


In [5]:

#convert the column, "Average view duration" to seconds using lambda function for standardisation
df["Average view duration"]=df["Average view duration"].apply(lambda x: sum(int(i) * 60**idx for idx, i in enumerate(reversed(str(x).split(":"))))
 if isinstance(x,str) and ":" in x else pd.to_numeric(x, errors="coerce"))

print(df["Average view duration"].head())

0    572.0
1    638.0
2    571.0
3    366.0
4    452.0
Name: Average view duration, dtype: float64


In [6]:
df.head()

Unnamed: 0,Shares,Comments added,Likes (vs. dislikes) (%),Average view duration,Views,Subscribers,Impressions click-through rate (%)
0,2035.0,1578.0,97.95,572.0,4319500.0,3574.0,4.66
1,1070.0,1878.0,97.92,638.0,2522030.0,4845.0,5.14
2,1375.0,4314.0,97.54,571.0,3564849.0,8505.0,7.0
3,420.0,502.0,98.34,366.0,1148850.0,513.0,7.27
4,982.0,977.0,97.43,452.0,1559413.0,3865.0,5.08


In [8]:
df.describe()

Unnamed: 0,Shares,Comments added,Likes (vs. dislikes) (%),Average view duration,Views,Subscribers,Impressions click-through rate (%)
count,495.0,495.0,495.0,495.0,495.0,495.0,495.0
mean,2972.51049,1814.564103,97.701166,549.104895,3463973.0,8346.899767,5.590396
std,6770.974615,1510.17241,0.844893,132.573609,2999865.0,11923.696976,1.035558
min,203.0,187.0,91.33,218.0,1033484.0,341.0,2.62
25%,958.5,814.5,97.445,466.0,1676446.0,2327.0,4.955
50%,1871.0,1486.0,97.75,549.104895,2634122.0,5033.0,5.590396
75%,2972.51049,1968.0,98.2,625.5,3582543.0,8346.899767,6.095
max,106414.0,9109.0,99.12,1088.0,28037360.0,120906.0,10.42


In [7]:
#Handle Missing values
#fill the missing values with mean
df.fillna(df.mean(), inplace=True)

In [9]:
#Seperate the dependent values and independent values
X=df.drop(columns=["Views"]) #independent variable-features
y=df["Views"] #dependent variable - predictions "Views"

In [10]:
print(X.shape)  
print(y.shape) 

(495, 6)
(495,)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42) #train_test_split

In [12]:
#As the features are in different scales, using standard scalar for standardisation
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)


In [15]:
#Step 3: #defining the models with parameters, train and log

import xgboost as xgb
models = [
    (
        "Linear Regression",  
        {"fit_intercept": True},
        LinearRegression(),
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        {"n_estimators": 30, "max_depth": 3},
        RandomForestRegressor(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBRegressor",
        {"use_label_encoder": False, "eval_metric": 'logloss'},
        xgb.XGBRegressor(random_state=42),
        (X_train, y_train),
        (X_test, y_test)
    )
]

In [None]:
reports = []

for model_name, params, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    #create report dictionary
    report = {
        "model": model_name,
        "mean_squared_error": mse,
        "r2_score": r2        
    }
    
    #append the report to reports list
    reports.append(report)

for report in reports:
    print(f"{report['model']} -MSE: {report['mean_squared_error']}, R2: {report['r2_score']}")
    

Linear Regression -MSE: 2578256191028.136, R2: 0.7819823786293159
Random Forest -MSE: 2334613745457.612, R2: 0.8025848100839805
XGBRegressor -MSE: 3059298371068.8853, R2: 0.7413054000434015


In [21]:
#step 4: Log the models and results with mlflow to track the model performance and find the best model
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [24]:
#Initialize the MLflow
mlflow.set_experiment("Youtube_views_prediction")
mlflow.set_tracking_uri("http://localhost:5000")

for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)
        mlflow.log_metrics({
            "mean_squared_error": report["mean_squared_error"],
            "r2_score": report["r2_score"]
        })

        if model_name == "XGBRegressor":
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")



🏃 View run Linear Regression at: http://localhost:5000/#/experiments/162614338943334033/runs/4ea5dd5e9b474f478f1dcb3db762404b
🧪 View experiment at: http://localhost:5000/#/experiments/162614338943334033




🏃 View run Random Forest at: http://localhost:5000/#/experiments/162614338943334033/runs/ef850bbec2354b9abdd913ffb0c18e57
🧪 View experiment at: http://localhost:5000/#/experiments/162614338943334033




🏃 View run XGBRegressor at: http://localhost:5000/#/experiments/162614338943334033/runs/91701424a40f4e34b500ca5e9caf634d
🧪 View experiment at: http://localhost:5000/#/experiments/162614338943334033


In [25]:
#Step 5: Inferencing the X_test with the best model, which is RandomForest iin our case
test_df = pd.read_csv("X_test.csv") #load the X_test
test_df.head()

Unnamed: 0,Shares,Comments added,Likes (vs. dislikes) (%),Average view duration,Subscribers,Impressions click-through rate (%),Views prediction
0,27558,3243,95.66,00:11:10,28895,4.84,
1,575,942,98.1,00:10:59,879,5.99,
2,5362,1903,95.28,00:06:27,10362,5.98,
3,1231,465,93.65,00:10:26,3119,5.35,
4,4115,2126,97.16,00:13:42,10311,3.55,


In [29]:
#convert the column, "Average view duration" to seconds using lambda function for standardisation
test_df["Average view duration"]=test_df["Average view duration"].apply(lambda x: sum(int(i) * 60**idx for idx, i in enumerate(reversed(str(x).split(":"))))
 if isinstance(x,str) and ":" in x else pd.to_numeric(x, errors="coerce"))

print(test_df["Average view duration"].head())

0    670
1    659
2    387
3    626
4    822
Name: Average view duration, dtype: int64


In [None]:
#Seperate the dependent values and independent values
X=test_df.drop(columns=["Views prediction"]) #independent variable-features
y=test_df["Views prediction"] #dependent variable - Views prediction for test set

In [32]:
X=scaler.fit_transform(X) #standardisation

In [33]:
print(X.shape) #print the sizeof the test set

(71, 6)
(71,)


In [36]:
#load the Random Forest model from mlflow artifacts
model_artifact_path = "file:///f:/Be_Amazed_Projects/Task_1/mlruns/162614338943334033/ef850bbec2354b9abdd913ffb0c18e57/artifacts/model"

rf_model = mlflow.sklearn.load_model(model_artifact_path)


In [37]:
# make predictions using the best model
y_pred = rf_model.predict(X)

# Add the predictions to the test data
test_df['Views prediction'] = y_pred

#save and display the predictions
test_df.to_csv("predicted_test_results.csv", index=False)
print(test_df)

    Shares  Comments added  Likes (vs. dislikes) (%)  Average view duration  \
0    27558            3243                     95.66                    670   
1      575             942                     98.10                    659   
2     5362            1903                     95.28                    387   
3     1231             465                     93.65                    626   
4     4115            2126                     97.16                    822   
..     ...             ...                       ...                    ...   
66     587            1895                     98.66                    428   
67     922             824                     98.28                    457   
68    1141            2196                     97.93                    766   
69    1292             576                     98.39                    598   
70    1427            3026                     97.50                    582   

    Subscribers  Impressions click-through rate (%)