<a href="https://colab.research.google.com/github/SanjulaDeshan/YouTube-Trending-Video-Prediction/blob/main/data%5Ccolab_EDA_and_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Step 1: Load the datasets
df_trending = pd.read_csv("../data/trending.csv")
df_non_trending = pd.read_excel("../data/non_trending.xlsx")

# Step 2: Add label columns
df_trending['label'] = 1  # trending videos
df_non_trending['label'] = 0  # non-trending videos

# Step 3: Keep only needed columns and standardize names
df_trending = df_trending[['video_title', 'video_description', 'label']]
df_non_trending = df_non_trending[['title', 'description', 'label']]
df_non_trending.rename(columns={'title': 'video_title', 'description': 'video_description'}, inplace=True)

# Step 4: Combine the two DataFrames
df_combined = pd.concat([df_trending, df_non_trending], ignore_index=True)

# Step 5: Drop rows with missing title/description
df_combined.dropna(subset=['video_title', 'video_description'], inplace=True)

# Step 6: Create a combined 'text' column for model input
df_combined['text'] = df_combined['video_title'] + " " + df_combined['video_description']

# Step 7: Preview the result
print("Combined Dataset Preview:")
print(df_combined.head())

# Optional: Check how many samples you have
print("Total samples:", len(df_combined))
print("Trending videos:", df_combined['label'].sum())
print("Non-trending videos:", len(df_combined) - df_combined['label'].sum())

Combined Dataset Preview:
                                         video_title  \
0                     BTS: Boy with Luv (Live) - SNL   
1          Star Wars: The Rise of Skywalker – Teaser   
2  Gordon Ramsay Enters An Indian Cooking Competi...   
3                         We Got Married...(Pt. 2/4)   
4             BTS Eat Churros on The Morning Mash Up   

                                   video_description  label  \
0  Musical guest BTS performs "Boy with Luv" on S...      1   
1  Every generation has a legend. Watch the brand...      1   
2  As Gordon's trip in Malaysia comes towards an ...      1   
3  The Day i Committed To My Bestfriend!!\n\n\nFO...      1   
4  The Morning Mash Up crew gifted BTS with their...      1   

                                                text  
0  BTS: Boy with Luv (Live) - SNL Musical guest B...  
1  Star Wars: The Rise of Skywalker – Teaser Ever...  
2  Gordon Ramsay Enters An Indian Cooking Competi...  
3  We Got Married...(Pt. 2/4) The Day 

In [None]:
# Save the combined DataFrame to a CSV file
df_combined.to_csv("../data/combined_entertainment_data.csv", index=False)

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the combined CSV (if not already loaded)
df_combined = pd.read_csv("../data/combined_entertainment_data.csv")

# Step 2: Define text cleaning function
def clean_text(text):
    text = str(text).lower()  # lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove URLs
    text = re.sub(r'@\w+|#', '', text)  # remove @mentions and hashtags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Step 3: Clean the text column
df_combined['clean_text'] = df_combined['text'].apply(clean_text)

# Step 4: Prepare feature and label variables
X_text = df_combined['clean_text']
y = df_combined['label']

# Step 5: Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',       # remove common English stopwords
    max_features=5000,          # limit to top 5000 features
    ngram_range=(1, 2)          # use unigrams and bigrams
)

# Step 6: Fit and transform text into vector format
X = vectorizer.fit_transform(X_text)

# Step 7: Output shapes
print("✅ TF-IDF matrix shape:", X.shape)
print("✅ Labels shape:", y.shape)
# Save the cleaned dataset with clean_text column
df_combined.to_csv("../data/cleaned_entertainment_data.csv", index=False)

✅ TF-IDF matrix shape: (405511, 5000)
✅ Labels shape: (405511,)


In [None]:
print(df_combined['label'].value_counts())

label
1    404464
0      1047
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.utils import resample

# Step 1: Load the cleaned dataset
df = pd.read_csv("../content/cleaned_entertainment_data.csv")  # Replace with your actual file name if different

# Step 2: Separate majority and minority classes
df_majority = df[df['label'] == 1]
df_minority = df[df['label'] == 0]

# Step 3: Downsample the majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,                   # no duplicates
    n_samples=len(df_minority),     # match minority count (1047)
    random_state=42                 # for reproducibility
)

# Step 4: Combine minority and downsampled majority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Step 5: Shuffle the rows
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 6: Save to new CSV
df_balanced.to_csv("balanced_data.csv", index=False)

# Check balance
print("✅ New label balance:\n", df_balanced['label'].value_counts())

✅ New label balance:
 label
0    1047
1    1047
Name: count, dtype: int64


In [None]:
import pandas as pd
import re

# Load cleaned dataset (after balancing)
df = pd.read_csv("./balanced_data.csv")  # Or the actual path

# Cleaning function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra whitespace
    return text

# Apply separately
df["clean_title"] = df["video_title"].apply(clean_text)
df["clean_description"] = df["video_description"].apply(clean_text)

# Combine for modeling
df["final_text"] = df["clean_title"] + " " + df["clean_description"]

# Save cleaned and combined file
df.to_csv("./final_dataset_for_model.csv", index=False)

print("✅ Cleaned and combined dataset saved as 'final_dataset_for_model.csv'")

✅ Cleaned and combined dataset saved as 'final_dataset_for_model.csv'


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

print("Step 4: Feature Extraction (Vectorization) \n")

# Load the dataset
df = pd.read_csv("./final_dataset_for_model.csv")

# Confirm the column we’ll use
texts = df['final_text']
labels = df['label']

# Create the TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,     # Limit to top 5000 features
    stop_words='english',  # Remove common English stop words
    ngram_range=(1, 2)     # Use unigrams and bigrams
)

# Fit and transform the data
X = tfidf.fit_transform(texts)

# Save vectorizer for future use (important for inference!)
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

from scipy.sparse import save_npz

# Save features (sparse matrix format) and labels/
save_npz("X_features_tfidf.npz", X)
labels.to_csv("y_labels.csv", index=False)


Step 4: Feature Extraction (Vectorization) 



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your cleaned CSV file
df = pd.read_csv("./final_dataset_for_model.csv")  # replace with your actual cleaned file name

# Combine clean_title and clean_description into one final column
df["final_text"] = df["clean_title"].fillna('') + " " + df["clean_description"].fillna('')

# Ensure correct column order (optional)
df = df[[
    "video_title",
    "video_description",
    "label",
    "clean_title",
    "clean_description",
    "final_text"
]]

# Split into training and testing sets (80% train, 20% test)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

# Save to new CSV files
train_df.to_csv("./train_data.csv", index=False)
test_df.to_csv("./test_data.csv", index=False)

print("✅ Dataset split and saved successfully!")

✅ Dataset split and saved successfully!


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Step 1: Load data
df = pd.read_csv("./train_data.csv")  # Make sure this is the latest cleaned and balanced file

# Step 2: Define features and target
X = df['final_text']
y = df['label']

# Step 3: Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Train the classification model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 6: Evaluate the model
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 7: Save the model and vectorizer
joblib.dump(model, "./text_classifier_model.pkl")
joblib.dump(vectorizer, "./tfidf_vectorizer.pkl")
print("✅ successfully do the Step 6: Train the Classification Model")

Accuracy: 0.9343283582089552
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93       166
           1       0.93      0.93      0.93       169

    accuracy                           0.93       335
   macro avg       0.93      0.93      0.93       335
weighted avg       0.93      0.93      0.93       335

✅ successfully do the Step 6: Train the Classification Model


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
import joblib

# Step 1: Load the dataset
df = pd.read_csv("./train_data.csv")

# Step 2: Split features and labels
X = df['final_text']
y = df['label']

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: TF-IDF Vectorization (fit only on train, transform both train & test)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Save vectorizer (used by all models)
joblib.dump(vectorizer, "./tfidf_vectorizer.pkl")

# --- Model 1: Ridge Classifier (Regression model for classification) ---
ridge_model = RidgeClassifier()
ridge_model.fit(X_train_tfidf, y_train)
joblib.dump(ridge_model, "./ridge_classifier_model.pkl")

# --- Model 2: SVM Classifier ---
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train_tfidf, y_train)
joblib.dump(svm_model, "./svm_classifier_model.pkl")

# --- Model 3: XGBoost Classifier ---
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train)
joblib.dump(xgb_model, "./xgb_classifier_model.pkl")

print("✅ All models trained and saved successfully.")

Parameters: { "use_label_encoder" } are not used.



✅ All models trained and saved successfully.


In [None]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load data
df = pd.read_csv("./train_data.csv")
X = df['final_text']
y = df['label']

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load TF-IDF vectorizer and transform text
vectorizer = joblib.load("tfidf_vectorizer.pkl")
X_test_tfidf = vectorizer.transform(X_test)

# Define models and names
model_files = {
    "Logistic Regression": "text_classifier_model.pkl",
    "Ridge Classifier": "ridge_classifier_model.pkl",
    "SVM": "svm_classifier_model.pkl",
    "XGBoost": "xgb_classifier_model.pkl"
}

# Evaluate and compare
results = []

for model_name, file_name in model_files.items():
    model = joblib.load(file_name)
    y_pred = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1
    })

    print(f"----- {model_name} -----")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print()

# Display comparison table
results_df = pd.DataFrame(results)
print("🔍 Model Comparison:\n")
print(results_df.sort_values(by="F1-Score", ascending=False).reset_index(drop=True))

----- Logistic Regression -----
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       166
           1       0.93      0.93      0.93       169

    accuracy                           0.93       335
   macro avg       0.93      0.93      0.93       335
weighted avg       0.93      0.93      0.93       335

Confusion Matrix:
[[155  11]
 [ 11 158]]

----- Ridge Classifier -----
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       166
           1       0.96      0.96      0.96       169

    accuracy                           0.96       335
   macro avg       0.96      0.96      0.96       335
weighted avg       0.96      0.96      0.96       335

Confusion Matrix:
[[159   7]
 [  7 162]]

----- SVM -----
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       166
           1       0.94      0.95      0.95       169

    accuracy         

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import nltk
nltk.download('stopwords')

# Load test data
test_df = pd.read_csv("./test_data.csv")

# Combine title and description into final_text
def clean_text(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

test_df['final_text'] = (test_df['video_title'].astype(str) + ' ' + test_df['video_description'].astype(str)).apply(clean_text)

# Load vectorizer
vectorizer = joblib.load("tfidf_vectorizer.pkl")
X_test_final = vectorizer.transform(test_df['final_text'])

# Labels
y_true = test_df['label']

# Load all models
model_paths = {
    "Logistic Regression": "text_classifier_model.pkl",
    "Ridge Classifier": "ridge_classifier_model.pkl",
    "SVM": "svm_classifier_model.pkl",
    "XGBoost": "xgb_classifier_model.pkl"
}

# Evaluate
print("\n📊 Evaluation on External Test Data:")
for name, path in model_paths.items():
    model = joblib.load(path)
    y_pred = model.predict(X_test_final)

    print(f"\n📌 {name}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



📊 Evaluation on External Test Data:

📌 Logistic Regression
Accuracy: 0.6492
Precision: 0.9844
Recall: 0.3014
F1 Score: 0.4615
Confusion Matrix:
[[209   1]
 [146  63]]
              precision    recall  f1-score   support

           0       0.59      1.00      0.74       210
           1       0.98      0.30      0.46       209

    accuracy                           0.65       419
   macro avg       0.79      0.65      0.60       419
weighted avg       0.79      0.65      0.60       419


📌 Ridge Classifier
Accuracy: 0.6778
Precision: 0.9512
Recall: 0.3732
F1 Score: 0.5361
Confusion Matrix:
[[206   4]
 [131  78]]
              precision    recall  f1-score   support

           0       0.61      0.98      0.75       210
           1       0.95      0.37      0.54       209

    accuracy                           0.68       419
   macro avg       0.78      0.68      0.64       419
weighted avg       0.78      0.68      0.64       419


📌 SVM
Accuracy: 0.6969
Precision: 0.9556
Recall: 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os

save_dir = "/content/drive/My Drive/ML_Model_Files"
os.makedirs(save_dir, exist_ok=True)

import shutil

# Move model and data files to Google Drive
shutil.copy("ridge_classifier_model.pkl", save_dir)
shutil.copy("svm_classifier_model.pkl", save_dir)
shutil.copy("xgb_classifier_model.pkl", save_dir)
shutil.copy("text_classifier_model.pkl", save_dir)
shutil.copy("tfidf_vectorizer.pkl", save_dir)
shutil.copy("balanced_data.csv", save_dir)
shutil.copy("test_data.csv", save_dir)
shutil.copy("train_data.csv", save_dir)

os.listdir(save_dir)

Mounted at /content/drive


['ridge_classifier_model.pkl',
 'svm_classifier_model.pkl',
 'xgb_classifier_model.pkl',
 'text_classifier_model.pkl',
 'tfidf_vectorizer.pkl',
 'balanced_data.csv',
 'test_data.csv',
 'train_data.csv']

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    mean_absolute_error, mean_squared_error, r2_score,
    classification_report, confusion_matrix
)

# === Load and preprocess test data ===
test_df = pd.read_csv("/content/drive/My Drive/ML_Model_Files/test_data.csv")

def clean_text(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

test_df['final_text'] = (test_df['video_title'].astype(str) + ' ' + test_df['video_description'].astype(str)).apply(clean_text)

# Load vectorizer
vectorizer = joblib.load("/content/drive/My Drive/ML_Model_Files/tfidf_vectorizer.pkl")
X_test_final = vectorizer.transform(test_df['final_text'])
y_true = test_df['label']

# Load saved models
model_paths = {
    "Logistic Regression": "/content/drive/My Drive/ML_Model_Files/text_classifier_model.pkl",
    "Ridge Classifier": "/content/drive/My Drive/ML_Model_Files/ridge_classifier_model.pkl",
    "SVM": "/content/drive/My Drive/ML_Model_Files/svm_classifier_model.pkl",
    "XGBoost": "/content/drive/My Drive/ML_Model_Files/xgb_classifier_model.pkl"
}

# === Evaluate all models ===
results = []

for model_name, model_path in model_paths.items():
    model = joblib.load(model_path)
    y_pred = model.predict(X_test_final)

    # Some models don't support probability; handle ROC accordingly
    try:
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test_final)[:, 1]
        elif hasattr(model, "decision_function"):
            from sklearn.preprocessing import MinMaxScaler
            scores = model.decision_function(X_test_final)
            y_proba = MinMaxScaler().fit_transform(scores.reshape(-1, 1)).flatten()
        else:
            y_proba = y_pred  # fallback if no score or proba method
    except:
        y_proba = y_pred

    results.append({
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "ROC-AUC": roc_auc_score(y_true, y_proba),
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R² Score": r2_score(y_true, y_pred)
    })

# Convert to DataFrame
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)

# Display results
print("\n🔍 Model Evaluation Comparison:\n")
print(results_df.to_string(index=False))


🔍 Model Evaluation Comparison:

              Model  Accuracy  F1 Score  ROC-AUC      MAE     RMSE  R² Score
            XGBoost  0.835322  0.804533 0.928070 0.164678 0.405805  0.341285
                SVM  0.696897  0.575251 0.969310 0.303103 0.550548 -0.212417
   Ridge Classifier  0.677804  0.536082 0.969150 0.322196 0.567623 -0.288790
Logistic Regression  0.649165  0.461538 0.971656 0.350835 0.592314 -0.403349


In [5]:
import pandas as pd
import joblib
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive # Import drive

# Mount Google Drive to access saved files
drive.mount('/content/drive')


# Load TF-IDF Vectorizer used in training
tfidf_vectorizer = joblib.load("/content/drive/My Drive/ML_Model_Files/tfidf_vectorizer.pkl")

# Load all trained models
logistic_model = joblib.load("/content/drive/My Drive/ML_Model_Files/text_classifier_model.pkl")
ridge_model = joblib.load("/content/drive/My Drive/ML_Model_Files/ridge_classifier_model.pkl")
svm_model = joblib.load("/content/drive/My Drive/ML_Model_Files/svm_classifier_model.pkl")
xgb_model = joblib.load("/content/drive/My Drive/ML_Model_Files/xgb_classifier_model.pkl")

# === Sample input ===
video_title = "One Piece Ancient Mural Explained"
video_description = "In this video we will see detail comparison of one piece ancient mural. So enjoy the video."

# === Clean text function (used during training) ===
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # remove links
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove symbols
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    return text

# Clean title and description
clean_title = clean_text(video_title)
clean_description = clean_text(video_description)
final_text = f"{clean_title} {clean_description}"

# Vectorize
X_input = tfidf_vectorizer.transform([final_text])

# Predict
logistic_pred_proba = logistic_model.predict_proba(X_input)[0] # Get probabilities
ridge_pred = ridge_model.predict(X_input)[0]
svm_pred_proba = svm_model.predict_proba(X_input)[0] # Get probabilities
xgb_pred_proba = xgb_model.predict_proba(X_input) # Get probabilities


# Print predictions
print("Predictions for sample input:")
# Print probabilities for models that support it
print(f"Logistic Regression Probability (Class 0, Class 1): {logistic_pred_proba}")
print(f"Ridge Classifier Prediction: {ridge_pred}") # Ridge returns the predicted class directly
print(f"SVM Probability (Class 0, Class 1): {svm_pred_proba}")
print(f"XGBoost Probability (Class 0, Class 1): {xgb_pred_proba[0][1]}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Predictions for sample input:
Logistic Regression Probability (Class 0, Class 1): [0.42844524 0.57155476]
Ridge Classifier Prediction: 0
SVM Probability (Class 0, Class 1): [0.67648475 0.32351525]
XGBoost Probability (Class 0, Class 1): 0.0008356526377610862
