In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from google.colab import drive

# --- Classification Models ---
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# --- Regression Models ---
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor





In [None]:
# Mount Google Drive at /content/drive; the dataset path and the model output
# folder used by later cells both live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the pre-processed dataset on the mounted Drive.
file_path = '/content/drive/My Drive/YTML/YTML_Project_Data/V1_new_processed_data.csv'
# Load the CSV into `df`, the DataFrame every later cell reads from.
df = pd.read_csv(file_path)

In [None]:
# Print column names, non-null counts and dtypes (reveals which columns have missing values).
df.info()
# Summary statistics of the numeric columns; rendered as the cell's output
# because it is the last expression.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6143 entries, 0 to 6142
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   video_id                 6143 non-null   object 
 1   title                    6143 non-null   object 
 2   published_at             6143 non-null   object 
 3   channel_id               6143 non-null   object 
 4   category_id              6143 non-null   int64  
 5   view_count               6143 non-null   int64  
 6   like_count               6143 non-null   int64  
 7   comment_count            6143 non-null   int64  
 8   duration                 6143 non-null   object 
 9   description              5262 non-null   object 
 10  tags                     6143 non-null   object 
 11  channel_title            6143 non-null   object 
 12  channel_description      5830 non-null   object 
 13  channel_start_date       6143 non-null   object 
 14  subscriber_count        

Unnamed: 0,category_id,view_count,like_count,comment_count,subscriber_count,channel_view_count,channel_video_count,publish_hour,title_length,description_length,has_banner_image,publish_day_of_week,channel_age_days,duration_seconds,engagement_class,log_view_count,log_subscriber_count,log_channel_view_count,log_channel_video_count
count,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0,6143.0
mean,22.333225,13606220.0,148718.4,3544.994,3162654.0,1798495000.0,8049.599381,12.836562,60.684845,1015.250203,0.931467,2.925118,2560.534104,1240.095719,1.050138,12.908385,12.524611,18.124321,6.351534
std,12.459527,145671900.0,900044.2,30068.03,8550996.0,7227179000.0,36417.702639,5.743314,21.657242,1071.45406,0.252679,1.948689,1978.426804,4608.215193,0.805039,2.707937,2.984617,3.285348,2.008483
min,1.0,5.0,0.0,0.0,3.0,113.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.791759,1.386294,4.736198,0.693147
25%,20.0,84113.5,1442.5,41.0,45800.0,8713752.0,159.5,10.0,44.0,182.0,1.0,1.0,844.0,39.0,0.5,11.339934,10.732059,15.980413,5.078289
50%,23.0,448337.0,8994.0,211.0,390000.0,94187890.0,532.0,14.0,58.0,678.0,1.0,3.0,2245.0,163.0,1.0,13.013303,12.873905,18.360802,6.278521
75%,27.0,2421856.0,48505.5,1007.5,2730000.0,946563800.0,1622.0,17.0,78.0,1465.5,1.0,5.0,3950.0,768.5,1.5,14.700045,14.819813,20.668349,7.392032
max,99.0,6523764000.0,34618240.0,1208429.0,299000000.0,305313500000.0,528327.0,23.0,107.0,5000.0,1.0,6.0,20207.0,80157.0,3.0,22.598717,19.515954,26.444605,13.177473


# **Separating the datasets**

In [None]:
# Model inputs: an explicit whitelist of columns selected from `df`.
# The raw `tags` column is intentionally left out, and the channel-level
# counts are used in their log-transformed form.
features_to_use = [
    'category_id',
    'duration_seconds',
    'publish_hour',
    'publish_day_of_week',
    'channel_age_days',
    'title_length',
    'description_length',
    'log_subscriber_count',
    'log_channel_view_count',
    'log_channel_video_count',
]

X = df[features_to_use]

# Both tasks share the same feature matrix but predict different columns.
y_clf = df['engagement_class']  # classification target
y_reg = df['log_view_count']    # regression target

# Quick sanity check: all three objects must have the same number of rows.
print("Features and Targets are separated.")
for _label, _part in (
    ("X (features)", X),
    ("y_clf (classification target)", y_clf),
    ("y_reg (regression target)", y_reg),
):
    print(f"Shape of {_label}: {_part.shape}")

Features and Targets are separated.
Shape of X (features): (6143, 10)
Shape of y_clf (classification target): (6143,)
Shape of y_reg (regression target): (6143,)


In [None]:
# Only the LAST bare expression of a cell is rendered, so the first two
# results used to be computed and silently discarded. Each view is now passed
# to display() explicitly so all three tables appear in the output.
# `display` is provided as a built-in by the IPython/Colab kernel.
display(X.groupby('category_id').mean())  # per-category mean of every feature
display(X.describe())                     # summary statistics of the features
display(X.head())                         # first five rows


Unnamed: 0,category_id,duration_seconds,publish_hour,publish_day_of_week,channel_age_days,title_length,description_length,log_subscriber_count,log_channel_view_count,log_channel_video_count
0,22,13,0,3,151,68,0.0,10.73859,17.678679,6.003887
1,27,350,16,4,1772,40,1322.0,14.834358,18.873784,6.866933
2,28,53,13,0,733,43,180.0,10.445841,15.511978,6.413459
3,27,42,12,2,135,60,1442.0,9.22039,13.777452,4.584967
4,27,26,11,2,121,74,72.0,5.934894,11.828028,3.401197


In [None]:
# One stratified 80/20 split applied to the features and BOTH targets at once.
# train_test_split applies the same row selection to every array it receives,
# so the regression target stays aligned with X by construction instead of
# being re-aligned afterwards through an index-label lookup (which would
# silently misbehave if the index labels were ever not unique).
(
    X_train, X_test,
    y_clf_train, y_clf_test,
    y_reg_train, y_reg_test,
) = train_test_split(
    X, y_clf, y_reg,
    test_size=0.2,
    random_state=42,   # fixed seed so the split is reproducible
    stratify=y_clf,    # keep the engagement-class proportions in both partitions
)

# Guard against misalignment between the features and either target.
assert X_train.index.equals(y_clf_train.index) and X_train.index.equals(y_reg_train.index)
assert X_test.index.equals(y_clf_test.index) and X_test.index.equals(y_reg_test.index)

print("Data split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Data split into training and testing sets.
Training set size: 4914 samples
Testing set size: 1229 samples


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply that
# same fitted transformation to both partitions so no test-set statistics
# influence the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Training and testing data have been scaled.")


Training and testing data have been scaled.


# **Classification Model Training and Testing**

In [None]:

# Candidate classifiers, all with a fixed seed for reproducibility.
classification_models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    # verbose=-1 silences LightGBM's per-fit [Info] lines, which otherwise
    # bury the score summary under dozens of log lines.
    'LightGBM': LGBMClassifier(random_state=42, verbose=-1),
}

# Loop through each model and evaluate its performance
print("--- Evaluating Classification Models using Cross-Validation ---")
for name, model in classification_models.items():
    # Scale INSIDE the cross-validation loop: the pipeline re-fits the scaler
    # on each fold's training part only. Feeding a matrix that was scaled
    # once on the whole training set would leak validation-fold statistics
    # into every fold.
    fold_pipeline = make_pipeline(StandardScaler(), model)

    # Weighted F1 averages the per-class F1 scores by class support, which
    # suits the imbalanced engagement classes.
    scores = cross_val_score(fold_pipeline, X_train, y_clf_train, cv=5, scoring='f1_weighted')

    # Print the mean and standard deviation of the fold scores
    print(f"{name} F1-Score: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

--- Evaluating Classification Models using Cross-Validation ---
Logistic Regression F1-Score: 0.5913 (+/- 0.0177)
Random Forest F1-Score: 0.6443 (+/- 0.0117)
XGBoost F1-Score: 0.6336 (+/- 0.0178)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000369 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score -1.386040
[LightGBM] [Info] Start training from score -0.693402
[LightGBM] [Info] Start training from score -1.609692
[LightGBM] [Info] Start training from score -2.993445




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score -1.386040
[LightGBM] [Info] Start training from score -0.693402
[LightGBM] [Info] Start training from score -1.609692
[LightGBM] [Info] Start training from score -2.993445




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score -1.386040
[LightGBM] [Info] Start training from score -0.692893
[LightGBM] [Info] Start training from score -1.610965
[LightGBM] [Info] Start training from score -2.993445




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score -1.385023
[LightGBM] [Info] Start training from score -0.692893
[LightGBM] [Info] Start training from score -1.610965
[LightGBM] [Info] Start training from score -2.998534




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3932, number of used features: 10
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -0.693147
[LightGBM] [Info] Start training from score -1.609947
[LightGBM] [Info] Start training from score -2.993700
LightGBM F1-Score: 0.6441 (+/- 0.0084)





**Testing the Best Classification Model**





In [None]:
from sklearn.metrics import classification_report

# Refit the classifier selected above on the whole scaled training set.
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train_scaled, y_clf_train)

# Predict the engagement tier of the held-out rows.
y_pred = best_model.predict(X_test_scaled)

# Display names for the engagement tiers, in the order the report lists the classes.
tier_names = ['Underperforming', 'Average', 'Popular', 'Viral']

# Per-tier precision, recall and F1 on the test set.
print("\n--- Final Evaluation of Random Forest on the Test Set ---")
print(classification_report(y_clf_test, y_pred, target_names=tier_names))


--- Final Evaluation of Random Forest on the Test Set ---
                 precision    recall  f1-score   support

Underperforming       0.69      0.50      0.58       307
        Average       0.62      0.79      0.70       614
        Popular       0.56      0.44      0.49       246
          Viral       0.65      0.39      0.48        62

       accuracy                           0.63      1229
      macro avg       0.63      0.53      0.56      1229
   weighted avg       0.63      0.63      0.62      1229



# **Regression Model Training and Testing**

**Finding the best of the four regression models**

In [None]:
# Candidate regressors, seeded where the estimator supports it.
regression_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    # verbose=-1 silences LightGBM's per-fit [Info] lines so the RMSE summary
    # is not buried in log output.
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1),
}

# Loop through each model and evaluate its performance
print("--- Evaluating Regression Models using Cross-Validation ---")
for name, model in regression_models.items():
    # Scale INSIDE the cross-validation loop: the pipeline re-fits the scaler
    # on each fold's training part only, so no validation-fold statistics
    # leak into the fit (which happens when a pre-scaled matrix is split).
    fold_pipeline = make_pipeline(StandardScaler(), model)

    # scikit-learn scorers are "higher is better", hence the negated MSE.
    scores = cross_val_score(
        fold_pipeline,
        X_train,
        y_reg_train,
        cv=5,
        scoring='neg_mean_squared_error'
    )

    # Flip the sign back and take the root to get a per-fold RMSE.
    rmse_scores = np.sqrt(-scores)

    print(f"{name} RMSE: {np.mean(rmse_scores):.4f} (+/- {np.std(rmse_scores):.4f})")

--- Evaluating Regression Models using Cross-Validation ---
Linear Regression RMSE: 1.8186 (+/- 0.0530)
Random Forest RMSE: 1.6470 (+/- 0.0289)
XGBoost RMSE: 1.7519 (+/- 0.0262)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score 12.899537
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score 12.873809




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000402 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score 12.908679
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3931, number of used features: 10
[LightGBM] [Info] Start training from score 12.879300




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 3932, number of used features: 10
[LightGBM] [Info] Start training from score 12.969612
LightGBM RMSE: 1.6679 (+/- 0.0395)




**Testing the best regression model on the held-out test set**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Refit the regressor selected above on the whole scaled training set.
best_reg_model = RandomForestRegressor(random_state=42)
best_reg_model.fit(X_train_scaled, y_reg_train)

# Predict the regression target for the held-out rows.
y_reg_pred = best_reg_model.predict(X_test_scaled)

# RMSE is the square root of the mean squared error; R² is the share of the
# target's variance explained by the model.
final_rmse = np.sqrt(mean_squared_error(y_reg_test, y_reg_pred))
final_r2 = r2_score(y_reg_test, y_reg_pred)

print(
    "\n--- Final Evaluation of Random Forest Regressor on the Test Set ---",
    f"Final RMSE: {final_rmse:.4f}",
    f"Final R-squared (R²): {final_r2:.4f}",
    sep="\n",
)


--- Final Evaluation of Random Forest Regressor on the Test Set ---
Final RMSE: 1.6900
Final R-squared (R²): 0.5998


# **Save both models and the scaler**

In [None]:
import joblib

# Destination folder on the mounted Drive; created on first use.
model_folder = '/content/drive/My Drive/YTML/prediction_api/models/'
os.makedirs(model_folder, exist_ok=True)

# File name -> fitted object. The scaler is persisted alongside the models
# because both models were fitted on scaled features, so the same
# transformation has to be applied to any input before predicting.
artifacts = {
    'classifier.joblib': best_model,
    'regressor.joblib': best_reg_model,
    'scaler.joblib': scaler,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, os.path.join(model_folder, filename))

print("Models and scaler have been saved successfully.")

Models and scaler have been saved successfully.
