Loading the Dataset and Exploratory Data Analysis


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Load the raw dataset; `df` is the working DataFrame used by the cells that follow
df = pd.read_csv('/content/TDL_DATASET.csv')

# Print the first five rows (plain-text output) as a quick check that the load worked
print(df.head())



      video_id                                              title  \
0  9TFJ9fSirB8  APARAJITA - Full Episode - 562 | ଅପରାଜିତା | Od...   
1  Sm7fButI6lU        Mahanadhi | 30th Nov & 1st Dec 2023 - Promo   
2  R-sh22bAAA4  अब AARAMBH NEET 2024 का होगा AAGAAZ…… 🔥🚀 #Aara...   
3  4u92ooRjKzc  School Se Lene  Gaya Piyush Kunali Ko 😍 Super ...   
4  tU6O2XBOjro  Girls College Bunk | EMI Rani | ( Check Descri...   

           publishedAt                 channelId        channelTitle  \
0  2023-11-30 13:27:52  UCbBWncD3X_dfXwxmj4KwJnA        Sidharrth TV   
1   2023-11-30 7:50:38  UCvrhwpnp2DHYQ1CbXby9ypQ    Vijay Television   
2   2023-11-30 6:45:01  UCD16eo98AXl-9T61Xd711kQ  Competition Wallah   
3   2023-11-30 2:30:02  UCjvgGbPPn-FgYeguc5nxG4A  Sourav Joshi Vlogs   
4  2023-11-29 12:30:29  UCUKv9os4AZolovN8AS6yyBw            EMI Rani   

   categoryId         trending_date  \
0          24  2023-12-01T00:00:00Z   
1          24  2023-12-01T00:00:00Z   
2          27  2023-12-01T00:00:00Z

Basic pre-processing steps: checking for missing values, dropping incomplete and duplicate rows, and checking the data types of the columns


In [None]:
# Report how many values are missing in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Discard incomplete rows, then exact duplicate rows (both modify df in place)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Report how much data is left after the two filters
print("Shape after removing missing values and duplicates:", df.shape)

# Column dtypes at this point; the two date columns have not been parsed yet
print(df.dtypes)

# Parse both timestamp columns into pandas datetimes
for date_column in ('publishedAt', 'trending_date'):
    df[date_column] = pd.to_datetime(df[date_column])

# Preview the cleaned table
print(df.head())


video_id              0
title                 0
publishedAt           0
channelId             0
channelTitle          0
categoryId            0
trending_date         0
tags                  0
view_count            0
likes                 0
comment_count         0
thumbnail_link        0
comments_disabled     0
ratings_disabled      0
Subscriber_Count     23
Is_Live_Stream        0
Top_Comments          0
dtype: int64
Shape after removing missing values and duplicates: (1421, 17)
video_id              object
title                 object
publishedAt           object
channelId             object
channelTitle          object
categoryId             int64
trending_date         object
tags                  object
view_count             int64
likes                  int64
comment_count          int64
thumbnail_link        object
comments_disabled       bool
ratings_disabled        bool
Subscriber_Count     float64
Is_Live_Stream         int64
Top_Comments          object
dtype: object
      vid

Mapping of category name and ID


In [None]:
# Lookup table from category ID (kept as a string) to category name,
# built from (id, name) pairs in ascending ID order
category_mapping = dict([
    ("1", "Film & Animation"),
    ("2", "Autos & Vehicles"),
    ("10", "Music"),
    ("15", "Pets & Animals"),
    ("17", "Sports"),
    ("18", "Short Movies"),
    ("19", "Travel & Events"),
    ("20", "Gaming"),
    ("21", "Videoblogging"),
    ("22", "People & Blogs"),
    ("23", "Comedy"),
    ("24", "Entertainment"),
    ("25", "News & Politics"),
    ("26", "Howto & Style"),
    ("27", "Education"),
    ("28", "Science & Technology"),
    ("30", "Movies"),
    ("31", "Anime/Animation"),
    ("32", "Action/Adventure"),
    ("33", "Classics"),
    ("34", "Comedy"),
    ("35", "Documentary"),
    ("36", "Drama"),
    ("37", "Family"),
    ("38", "Foreign"),
    ("39", "Horror"),
    ("40", "Sci-Fi/Fantasy"),
    ("41", "Thriller"),
    ("42", "Shorts"),
    ("43", "Shows"),
    ("44", "Trailers"),
    # Append any category IDs that turn out to be missing here
])

# Look up each row's category name; the IDs are cast to str because the
# mapping is keyed by strings
category_ids_as_text = df['categoryId'].astype(str)
df['category_name'] = category_ids_as_text.map(category_mapping)

# Rows left without a name point at category IDs missing from the mapping
unmapped_rows = df.loc[df['category_name'].isna()]
print(unmapped_rows)



Empty DataFrame
Columns: [video_id, title, publishedAt, channelId, channelTitle, categoryId, trending_date, tags, view_count, likes, comment_count, thumbnail_link, comments_disabled, ratings_disabled, Subscriber_Count, Is_Live_Stream, Top_Comments, category_name]
Index: []


In [None]:
# Print the first five rows of df as plain text
print(df.head())

      video_id                                              title  \
0  9TFJ9fSirB8  APARAJITA - Full Episode - 562 | ଅପରାଜିତା | Od...   
1  Sm7fButI6lU        Mahanadhi | 30th Nov & 1st Dec 2023 - Promo   
2  R-sh22bAAA4  अब AARAMBH NEET 2024 का होगा AAGAAZ…… 🔥🚀 #Aara...   
3  4u92ooRjKzc  School Se Lene  Gaya Piyush Kunali Ko 😍 Super ...   
4  tU6O2XBOjro  Girls College Bunk | EMI Rani | ( Check Descri...   

          publishedAt                 channelId        channelTitle  \
0 2023-11-30 13:27:52  UCbBWncD3X_dfXwxmj4KwJnA        Sidharrth TV   
1 2023-11-30 07:50:38  UCvrhwpnp2DHYQ1CbXby9ypQ    Vijay Television   
2 2023-11-30 06:45:01  UCD16eo98AXl-9T61Xd711kQ  Competition Wallah   
3 2023-11-30 02:30:02  UCjvgGbPPn-FgYeguc5nxG4A  Sourav Joshi Vlogs   
4 2023-11-29 12:30:29  UCUKv9os4AZolovN8AS6yyBw            EMI Rani   

   categoryId             trending_date  \
0          24 2023-12-01 00:00:00+00:00   
1          24 2023-12-01 00:00:00+00:00   
2          27 2023-12-01 00:0

Validating the mapping of categories from IN_category_id.json


In [None]:
import json

# Read the reference category listing from disk
with open('/content/IN_category_id.json', 'r') as json_file:
    category_data = json.load(json_file)

# Build an ID -> title lookup from the entries under 'items'
category_mapping_json = {
    entry['id']: entry['snippet']['title'] for entry in category_data['items']
}

# Check every hand-written mapping entry against the JSON reference
for mapped_id, mapped_name in category_mapping.items():
    if mapped_id not in category_mapping_json:
        print(f"Category ID {mapped_id} not found in the JSON file")
        continue

    json_title = category_mapping_json[mapped_id]
    if json_title == mapped_name:
        print(f"Category ID {mapped_id} validated successfully")
    else:
        print(f"Mismatch for Category ID {mapped_id}: JSON title - {json_title}, Dataset title - {mapped_name}")

# Reverse check: IDs the JSON file knows about that the hand-written mapping lacks
json_only_ids = [json_id for json_id in category_mapping_json if json_id not in category_mapping]
for json_id in json_only_ids:
    print(f"Category ID {json_id} - {category_mapping_json[json_id]} present in JSON file but not in the dataset")



Category ID 1 validated successfully
Category ID 2 validated successfully
Category ID 10 validated successfully
Category ID 15 validated successfully
Category ID 17 validated successfully
Category ID 18 validated successfully
Category ID 19 validated successfully
Category ID 20 validated successfully
Category ID 21 validated successfully
Category ID 22 validated successfully
Category ID 23 validated successfully
Category ID 24 validated successfully
Category ID 25 validated successfully
Category ID 26 validated successfully
Category ID 27 validated successfully
Category ID 28 validated successfully
Category ID 30 validated successfully
Category ID 31 validated successfully
Category ID 32 validated successfully
Category ID 33 validated successfully
Category ID 34 validated successfully
Category ID 35 validated successfully
Category ID 36 validated successfully
Category ID 37 validated successfully
Category ID 38 validated successfully
Category ID 39 validated successfully
Category ID 40

One hot encoding categories to convert them to numerical values


In [None]:
# One-hot encode 'category_name'. The empty prefix and separator make each new
# column carry just the category name. The numeric 'categoryId' column is then
# dropped from the encoded copy only (df itself is left untouched).
df_encoded = pd.get_dummies(
    df,
    columns=['category_name'],
    prefix='',
    prefix_sep='',
).drop(columns='categoryId')

# Preview the encoded table
print(df_encoded.head())


      video_id                                              title  \
0  9TFJ9fSirB8  APARAJITA - Full Episode - 562 | ଅପରାଜିତା | Od...   
1  Sm7fButI6lU        Mahanadhi | 30th Nov & 1st Dec 2023 - Promo   
2  R-sh22bAAA4  अब AARAMBH NEET 2024 का होगा AAGAAZ…… 🔥🚀 #Aara...   
3  4u92ooRjKzc  School Se Lene  Gaya Piyush Kunali Ko 😍 Super ...   
4  tU6O2XBOjro  Girls College Bunk | EMI Rani | ( Check Descri...   

          publishedAt                 channelId        channelTitle  \
0 2023-11-30 13:27:52  UCbBWncD3X_dfXwxmj4KwJnA        Sidharrth TV   
1 2023-11-30 07:50:38  UCvrhwpnp2DHYQ1CbXby9ypQ    Vijay Television   
2 2023-11-30 06:45:01  UCD16eo98AXl-9T61Xd711kQ  Competition Wallah   
3 2023-11-30 02:30:02  UCjvgGbPPn-FgYeguc5nxG4A  Sourav Joshi Vlogs   
4 2023-11-29 12:30:29  UCUKv9os4AZolovN8AS6yyBw            EMI Rani   

              trending_date  \
0 2023-12-01 00:00:00+00:00   
1 2023-12-01 00:00:00+00:00   
2 2023-12-01 00:00:00+00:00   
3 2023-12-01 00:00:00+00:00   
4 2

In [None]:
# Export df (categories kept as the text column 'category_name') to CSV, without the index
df.to_csv('/content/dataset_category_name.csv', index=False)

# Export df_encoded (one-hot category columns, 'categoryId' dropped) to CSV, without the index
df_encoded.to_csv('/content/dataset_category_encoded.csv', index=False)


Sentiment analysis using TextBlob on the top 10 comments per video, to analyse the overall audience sentiment and arrive at a mean sentiment score for each video

In [None]:
import pandas as pd
from textblob import TextBlob

# Load the one-hot encoded dataset exported earlier; sentiment scores are added to this frame
existing_df = pd.read_csv('/content/dataset_category_encoded.csv')
# Intended number of comments to analyze for each video.
# NOTE(review): this name is not referenced anywhere else in the visible notebook,
# so no 10-comment cap is applied by the code as written - confirm intent.
comments_per_video = 10
batch_size = 10000  # Adjust as needed. NOTE(review): also not referenced in the visible notebook.

# Score a batch of comments with TextBlob
def analyze_sentiment(comments):
    """Return one sentiment polarity per entry of `comments`.

    Each truthy comment is wrapped in a TextBlob and contributes its
    `sentiment.polarity`; a falsy comment (for example an empty string)
    contributes None instead, so the result always has the same length and
    order as the input.
    """
    return [
        TextBlob(text).sentiment.polarity if text else None
        for text in comments
    ]

# Make sure the output column exists before per-row values are written into it
if 'Sentiment_Scores' not in existing_df.columns:
    existing_df['Sentiment_Scores'] = None

# Score every row that actually has a comments string
for row_label, raw_comments in existing_df['Top_Comments'].items():
    if pd.isna(raw_comments):
        continue  # nothing to score; the existing cell value is left as it is

    # The column packs the comments into a single comma-separated string
    pieces = [piece.strip() for piece in raw_comments.split(',')]

    # Average the per-piece polarities; every piece produced by the split is scored
    polarity_series = pd.Series(analyze_sentiment(pieces))
    existing_df.at[row_label, 'Sentiment_Scores'] = polarity_series.mean()

# Write the updated table back to the encoded-dataset CSV
existing_df.to_csv('/content/dataset_category_encoded.csv', index=False)


Analysing video tags to measure how often the most frequent tag words occur

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset for tag analysis; this rebinds `existing_df` to a freshly read frame.
# NOTE(review): this path differs from '/content/TDL_DATASET.csv' read in the first cell - confirm it is the intended input.
existing_df = pd.read_csv('/content/TDL DATASET - Sheet1.csv')

# Normalise one raw tags value for vectorisation
def preprocess_tags(tags):
    """Return `tags` lower-cased with every '|' turned into a space.

    A missing value (anything `pd.isna` reports as NA) yields an empty string.
    """
    if pd.isna(tags):
        return ''
    # Splitting on '|' and re-joining with spaces swaps each separator one-for-one
    return ' '.join(tags.lower().split('|'))

# Preprocess the tags column
existing_df['Processed_Tags'] = existing_df['tags'].apply(preprocess_tags)

# Initialize CountVectorizer; only the 100 most frequent tokens are kept
vectorizer = CountVectorizer(max_features=100)  # Adjust max_features as needed

# Fit and transform the processed tags to a dense (rows x tokens) count matrix
tag_features = vectorizer.fit_transform(existing_df['Processed_Tags']).toarray()

# Create a DataFrame for the tag features. It is given existing_df's index
# explicitly so the column-wise concat below lines rows up by construction
# rather than by assuming a default 0..n-1 index.
tag_feature_names = vectorizer.get_feature_names_out()
tag_features_df = pd.DataFrame(tag_features, columns=tag_feature_names, index=existing_df.index)

# Sum the tag occurrences per video from the tag matrix itself, before merging.
# Selecting the tag columns back out of the merged frame by name is unsafe: a
# token such as "title", "tags" or "likes" matches an existing dataset column,
# so the selection would also pull in that original column.
tag_score = tag_features_df.sum(axis=1)

# Concatenate the tag features with the existing DataFrame
existing_df = pd.concat([existing_df, tag_features_df], axis=1)

# Attach the per-video tag score
existing_df['Tag_Score'] = tag_score

# Display the updated DataFrame with the tag features and score
print(existing_df.head())


      video_id                                              title  \
0  9TFJ9fSirB8  APARAJITA - Full Episode - 562 | ଅପରାଜିତା | Od...   
1  Sm7fButI6lU        Mahanadhi | 30th Nov & 1st Dec 2023 - Promo   
2  R-sh22bAAA4  अब AARAMBH NEET 2024 का होगा AAGAAZ…… 🔥🚀 #Aara...   
3  4u92ooRjKzc  School Se Lene  Gaya Piyush Kunali Ko 😍 Super ...   
4  tU6O2XBOjro  Girls College Bunk | EMI Rani | ( Check Descri...   

           publishedAt                 channelId        channelTitle  \
0  2023-11-30 13:27:52  UCbBWncD3X_dfXwxmj4KwJnA        Sidharrth TV   
1   2023-11-30 7:50:38  UCvrhwpnp2DHYQ1CbXby9ypQ    Vijay Television   
2   2023-11-30 6:45:01  UCD16eo98AXl-9T61Xd711kQ  Competition Wallah   
3   2023-11-30 2:30:02  UCjvgGbPPn-FgYeguc5nxG4A  Sourav Joshi Vlogs   
4  2023-11-29 12:30:29  UCUKv9os4AZolovN8AS6yyBw            EMI Rani   

   categoryId         trending_date  \
0          24  2023-12-01T00:00:00Z   
1          24  2023-12-01T00:00:00Z   
2          27  2023-12-01T00:00:00Z

Using XGBoost as a regressor to determine the most important features


In [None]:
import pandas as pd
import xgboost as xgb

# Read the dataset used for feature selection
df = pd.read_csv('/content/TDL DATASET - Sheet1.csv')

# Identifier and free-text columns are removed up front
non_numeric_columns = ['video_id', 'title', 'publishedAt', 'channelId', 'channelTitle']
df_numeric = df.drop(columns=non_numeric_columns)

# Columns that are not offered to the model; the target 'likes' is held out as well
exclude_columns = ["trending_date", "tags", "thumbnail_link", "comments_disabled", "ratings_disabled", "Is_Live_Stream", "Top_Comments"]
not_a_feature = set(exclude_columns) | {"likes"}
features_for_selection = [name for name in df_numeric.columns if name not in not_a_feature]

# Feature matrix and target vector
X = df_numeric[features_for_selection]
y = df_numeric["likes"]  # Likes column as target variable

# Small XGBoost regressor used only to rank features (hyperparameters can be tuned)
xgb_params = dict(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xg_reg = xgb.XGBRegressor(**xgb_params)

# Fitting populates feature_importances_
xg_reg.fit(X, y)

# Keep every feature the model gave a non-zero importance
importance_by_feature = pd.Series(xg_reg.feature_importances_, index=X.columns)
selected_features_xgboost = importance_by_feature[importance_by_feature > 0].index.tolist()

# Report the selection
print("Selected Features (XGBoost):", selected_features_xgboost)

Selected Features (XGBoost): ['categoryId', 'view_count', 'comment_count', 'Subscriber_Count']


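The following code converts publishedAt and trending_date to a common date format and computes time_till_trend, the number of days between a video's publication and its trending date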

In [None]:
# Convert 'publishedAt' and 'trending_date' to datetime objects
existing_df = pd.read_csv('/dataset_category_encoded (1).csv')
existing_df['publishedAt'] = pd.to_datetime(existing_df['publishedAt'])
existing_df['trending_date'] = pd.to_datetime(existing_df['trending_date']).dt.date  # Extract date portion

# Calculate time till trend in whole calendar days.
# Both sides are reduced to plain dates (which also discards the time of day and
# any UTC offset), then converted back to datetime64 so the subtraction yields a
# proper timedelta Series. Subtracting two object-dtype columns of datetime.date
# directly leaves an object-dtype result on pandas >= 2.0, and `.dt.days` is not
# available on that.
trend_day = pd.to_datetime(existing_df['trending_date'])
publish_day = pd.to_datetime(existing_df['publishedAt'].dt.date)
existing_df['time_till_trend'] = (trend_day - publish_day).dt.days

# Write the result back over the file that was loaded
existing_df.to_csv('/dataset_category_encoded (1).csv', index=False)

# Display the DataFrame
print(existing_df)

         video_id                                              title  \
0     9TFJ9fSirB8  APARAJITA - Full Episode - 562 | ଅପରାଜିତା | Od...   
1     Sm7fButI6lU        Mahanadhi | 30th Nov & 1st Dec 2023 - Promo   
2     R-sh22bAAA4  अब AARAMBH NEET 2024 का होगा AAGAAZ…… 🔥🚀 #Aara...   
3     4u92ooRjKzc  School Se Lene  Gaya Piyush Kunali Ko 😍 Super ...   
4     tU6O2XBOjro  Girls College Bunk | EMI Rani | ( Check Descri...   
...           ...                                                ...   
1416  6YfvMXjn32c   PAYAL OR GOLU KHOLENGI SAARE RAAZ | Armaan Malik   
1417  bALYalNOuCA  ഇന്നും കളി നടന്നില്ലെങ്കിൽ കിരീടം ആർക്ക് ലഭിക്...   
1418  CTpzqNLjudk        Inside the Great Barrier Reef of Australia!   
1419  zWLe3MDr68k  Bharat Mata Ki Jai' slogans reverberate in new...   
1420  xECqx8V730k  I went to EVERY Unique Cafe in the World!! *दु...   

             publishedAt                 channelId        channelTitle  \
0    2023-11-30 13:27:52  UCbBWncD3X_dfXwxmj4KwJnA        Sid

The snippet below uses min-max normalisation to rescale the selected feature columns to the [0, 1] range

In [None]:
import pandas as pd

# Load the dataset that already carries the sentiment and time-till-trend columns
df = pd.read_csv("dataset_category_encoded_final.csv")

# Columns to rescale
columns_to_normalize = ['view_count', 'comment_count', 'Sentiment_Scores', 'time_till_trend', 'Subscriber_Count']

# Min-max scaling per column: (value - column minimum) / (column maximum - column minimum)
selected = df[columns_to_normalize]
column_min = selected.min()
column_span = selected.max() - column_min
df[columns_to_normalize] = (selected - column_min) / column_span

# Keep only the target ('likes') followed by the rescaled features
columns_to_keep = ['likes', *columns_to_normalize]
df = df[columns_to_keep]

# Persist the reduced table for the modelling cells
df.to_csv("likes_and_normalized_columns.csv", index=False)


The following code snippet fits a random forest regressor after splitting the data into train and test sets. The features used to predict the likes are:
 'view_count', 'comment_count', 'Sentiment_Scores', 'time_till_trend', 'Subscriber_Count'. RMSE, R^2, MAE, MAPE and MSE are used to evaluate the predicted likes.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error

# Read the CSV file
df = pd.read_csv("/content/likes_and_normalized_columns.csv")

# Define features and target variable
X = df[['view_count', 'comment_count', 'Sentiment_Scores', 'time_till_trend', 'Subscriber_Count']]
y = df['likes']

# Split the data into train and test sets (seeded, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest regressor. The forest is seeded too: without a
# random_state every run grows different trees, so the metrics printed below
# would change from run to run even though the split is fixed.
rf_model = RandomForestRegressor(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate MSE once and derive RMSE from it. The `squared=False` flag of
# mean_squared_error is deprecated and removed in newer scikit-learn releases,
# so taking the square root here keeps the cell working across versions.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print("RMSE:", rmse)

# Calculate R-squared value
r_squared = r2_score(y_test, y_pred)
print("R-squared value:", r_squared)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(y_test, y_pred)
print("MAPE:", mape)

# Mean Squared Error (MSE), computed above
print("MSE:", mse)

# Build the result table (features, predicted likes, actual likes) as a new
# frame. Assigning a column directly into X_test writes into a slice of df and
# triggers pandas' SettingWithCopyWarning.
result_df = X_test.assign(predicted_likes=y_pred, likes=y_test)

# Save the dataframe to a new CSV file
result_df.to_csv("predicted_likes_with_original_rf.csv", index=False)


RMSE: 195179.4082651223
R-squared value: 0.8744729310420721
MAE: 37743.68140350877
MAPE: 0.6260529721525804
MSE: 38095001410.72329


The following code snippet fits an ensemble model, XGBoost, after splitting the data into train and test sets. The features used to predict the likes are:
 'view_count', 'comment_count', 'Sentiment_Scores', 'time_till_trend', 'Subscriber_Count'. RMSE, R^2, MAE and MSE are used to evaluate the predicted likes.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error

# Read the CSV file
df = pd.read_csv("/content/likes_and_normalized_columns.csv")

# Define features and target variable
X = df[['view_count', 'comment_count', 'Sentiment_Scores', 'time_till_trend', 'Subscriber_Count']]
y = df['likes']

# Split the data into train and test sets (seeded, so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost regressor with its default hyperparameters
xgb_model = XGBRegressor()

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Calculate MSE once and derive RMSE from it. The `squared=False` flag of
# mean_squared_error is deprecated and removed in newer scikit-learn releases,
# so taking the square root here keeps the cell working across versions.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print("RMSE:", rmse)

# Calculate R-squared value
r_squared = r2_score(y_test, y_pred)
print("R-squared value:", r_squared)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

# Mean Squared Error (MSE), computed above
print("MSE:", mse)

# Build the result table (features, predicted likes, actual likes) as a new
# frame. Assigning a column directly into X_test writes into a slice of df and
# triggers pandas' SettingWithCopyWarning.
result_df = X_test.assign(predicted_likes=y_pred, likes=y_test)

# Save the dataframe to a new CSV file
result_df.to_csv("predicted_likes_with_original.csv", index=False)


RMSE: 110538.93938925382
R-squared value: 0.9597375701889096
MAE: 31216.7363869918
MSE: 12218857121.30113


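The XGBoost regressor provides the best results: compared with the random forest it has a lower RMSE (about 110,539 vs 195,179), a lower MAE (about 31,217 vs 37,744) and a higher R^2 (about 0.960 vs 0.874) on the test set.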