In [27]:
# Load libraries
import os
import pandas as pd
import numpy as np
import math
import random
import collections
import timeit
import xgboost as xgb
import sklearn.metrics

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('C:/Users/HP/Downloads/video_data_top10_channels.csv')

In [3]:
df.head()

Unnamed: 0,comp_id,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,0,1KEbiqRWOkA,Alex The Analyst,7 Mistakes to Avoid During Your Data Analyst J...,When I was a Hiring Managers I saw a lot of pe...,"['Data Analyst', 'Data Analyst job', 'Data Ana...",2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,hd,False
1,1,UOBTLzWY1vs,Alex The Analyst,#DataAnalyst #AnalystBuilder #SQL,Link: AnalystBuilder.com,,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,hd,False
2,2,8zOkBTs0yxs,Alex The Analyst,Q/A Livestream | February Livestream | Ask Me ...,This is February's Livestream where you can co...,"['Data Analyst', 'Data Analyst job', 'Data Ana...",2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,hd,False
3,3,7NBt0V8ebGk,Alex The Analyst,Window Functions in MySQL | Intermediate MySQL,Full MySQL Course: https://www.analystbuilder....,"['Data Analyst', 'Data Analyst job', 'Data Ana...",2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,hd,False
4,4,FGC0cCAgGu0,Alex The Analyst,Twitter making me tear up over here 🥹,,,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,hd,False


In [4]:
mapping = {'hd': 1, 'sd': 0}
df['definition'] = df['definition'].replace(mapping)

In [8]:
# Convert categorical variables to numerical format
le = LabelEncoder()
df['title'] = le.fit_transform(df['title'])
df['tags'] = le.fit_transform(df['tags'])
df['description'] = le.fit_transform(df['description'])

In [9]:
df.head(5)

Unnamed: 0,comp_id,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,0,1KEbiqRWOkA,Alex The Analyst,111,3983,257,2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,1,False
1,1,UOBTLzWY1vs,Alex The Analyst,6,2319,3601,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,1,False
2,2,8zOkBTs0yxs,Alex The Analyst,3401,3276,204,2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,1,False
3,3,7NBt0V8ebGk,Alex The Analyst,4774,670,257,2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,1,False
4,4,FGC0cCAgGu0,Alex The Analyst,4447,4330,3601,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,1,False


In [10]:
count_likes_greater_than_1000 = df[df['likeCount'] > 1000]['likeCount'].count()

In [11]:
count_likes_greater_than_1000

1705

In [12]:
threshold_likes = 1000
df['trending'] = (df['likeCount'] > threshold_likes).astype(int)

In [13]:
df.head(5)

Unnamed: 0,comp_id,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption,trending
0,0,1KEbiqRWOkA,Alex The Analyst,111,3983,257,2024-03-05T13:00:01Z,8218,382.0,,40.0,PT11M54S,1,False,0
1,1,UOBTLzWY1vs,Alex The Analyst,6,2319,3601,2024-03-01T13:43:29Z,5703,401.0,,13.0,PT38S,1,False,0
2,2,8zOkBTs0yxs,Alex The Analyst,3401,3276,204,2024-02-29T17:10:18Z,4536,181.0,,6.0,PT1H34M57S,1,False,0
3,3,7NBt0V8ebGk,Alex The Analyst,4774,670,257,2024-02-27T13:00:08Z,5471,183.0,,13.0,PT13M29S,1,False,0
4,4,FGC0cCAgGu0,Alex The Analyst,4447,4330,3601,2024-02-23T13:34:27Z,3988,213.0,,11.0,PT16S,1,False,0


In [14]:
#Train Test Split
# Select features for training the model
features = ['title', 'tags', 'viewCount', 'commentCount']
X = df[features]
y = df['trending']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
     

In [15]:
print(X_train.shape)
print(X_test.shape)


(3946, 4)
(987, 4)


In [16]:
trlabel = y_train
telabel = y_test

dtrain = xgb.DMatrix(X_train, label=trlabel)
dtest  = xgb.DMatrix(X_test, label=telabel)

In [17]:
# Set parameters.
param = {'max_depth': 7, 
         'eta': 0.2,
         'objective': 'reg:squarederror',
         'nthread': 5,
         'eval_metric': 'rmse'
        }

evallist = [(dtest, 'eval'), (dtrain, 'train')]

In [19]:
# Train the model.
num_round = 70
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-rmse:0.41027	train-rmse:0.40304
[1]	eval-rmse:0.36398	train-rmse:0.34915
[2]	eval-rmse:0.33058	train-rmse:0.30783
[3]	eval-rmse:0.30730	train-rmse:0.27668
[4]	eval-rmse:0.29142	train-rmse:0.25351
[5]	eval-rmse:0.28282	train-rmse:0.23455
[6]	eval-rmse:0.27584	train-rmse:0.21897
[7]	eval-rmse:0.27125	train-rmse:0.20765
[8]	eval-rmse:0.26814	train-rmse:0.19798
[9]	eval-rmse:0.26524	train-rmse:0.18943
[10]	eval-rmse:0.26536	train-rmse:0.18177
[11]	eval-rmse:0.26433	train-rmse:0.17664
[12]	eval-rmse:0.26410	train-rmse:0.17327
[13]	eval-rmse:0.26313	train-rmse:0.17033
[14]	eval-rmse:0.26320	train-rmse:0.16565
[15]	eval-rmse:0.26326	train-rmse:0.16348
[16]	eval-rmse:0.26348	train-rmse:0.16136
[17]	eval-rmse:0.26363	train-rmse:0.15988
[18]	eval-rmse:0.26513	train-rmse:0.15738
[19]	eval-rmse:0.26513	train-rmse:0.15494
[20]	eval-rmse:0.26537	train-rmse:0.15324
[21]	eval-rmse:0.26526	train-rmse:0.14972
[22]	eval-rmse:0.26547	train-rmse:0.14688
[23]	eval-rmse:0.26544	train-rmse:0.14543
[2



[37]	eval-rmse:0.26512	train-rmse:0.11916
[38]	eval-rmse:0.26518	train-rmse:0.11760
[39]	eval-rmse:0.26528	train-rmse:0.11519
[40]	eval-rmse:0.26582	train-rmse:0.11321
[41]	eval-rmse:0.26616	train-rmse:0.11151
[42]	eval-rmse:0.26601	train-rmse:0.10980
[43]	eval-rmse:0.26666	train-rmse:0.10700
[44]	eval-rmse:0.26656	train-rmse:0.10667
[45]	eval-rmse:0.26658	train-rmse:0.10636
[46]	eval-rmse:0.26682	train-rmse:0.10484
[47]	eval-rmse:0.26732	train-rmse:0.10271
[48]	eval-rmse:0.26726	train-rmse:0.10246
[49]	eval-rmse:0.26793	train-rmse:0.10087
[50]	eval-rmse:0.26794	train-rmse:0.10049
[51]	eval-rmse:0.26820	train-rmse:0.09945
[52]	eval-rmse:0.26820	train-rmse:0.09865
[53]	eval-rmse:0.26854	train-rmse:0.09748
[54]	eval-rmse:0.26814	train-rmse:0.09597
[55]	eval-rmse:0.26836	train-rmse:0.09517
[56]	eval-rmse:0.26872	train-rmse:0.09456
[57]	eval-rmse:0.26886	train-rmse:0.09290
[58]	eval-rmse:0.26884	train-rmse:0.09193
[59]	eval-rmse:0.26903	train-rmse:0.09042
[60]	eval-rmse:0.26943	train-rmse:

In [24]:
# Make prediction.
y_pred = bst.predict(dtest).round()

# Compute RMSE on test set.
mse_xgboost = mean_squared_error(y_test, ypred)
rmse_xgboost = math.sqrt(mse_xgboost)

print('RMSE with XGBoost', rmse_xgboost)

RMSE with XGBoost 0.31830350703961513


### Fit model to test data

In [30]:
dtest = xgb.DMatrix(df.loc[:, df.columns != 'comp_id'][bst.feature_names])
solution = bst.predict(d_test).round()
solution_df = pd.concat([df[['comp_id']], pd.DataFrame(solution, columns = ['views'])], axis=1)
solution_df.to_csv('solution.csv', index=False)

In [31]:
# Assuming 'dtest' contains XGBoost DMatrix object and 'y_pred' contains predicted labels
y_true = dtest.get_label()  # Extracting true labels from the DMatrix object
y_pred = np.array(y_pred)  # Converting predicted labels to a numpy array

In [34]:
# Evaluate the model
#accuracy = accuracy_score(y_true, y_pred)
#print(f'Accuracy: {accuracy:.2f}')

# Display classification report
#print(classification_report(y_true, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [0, 987]