# 📊 Modelling Task – Classification
**Goal**: Predict video popularity using three different models:
- Baseline: Logistic Regression
- Model 1: Random Forest Classifier
- Model 2: Support Vector Machine (SVM)

Each model uses the same features: topic, language, duration, and publication hour.
The target variable is whether the video is trending (above median views).

## 1️⃣ Baseline Model – Logistic Regression

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("processed_data.csv")

# Preprocessing
df['Views'] = pd.to_numeric(df['Views'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Publication Time'] = pd.to_datetime(df['Publication Time'], errors='coerce')
df.dropna(subset=['Views', 'Duration', 'Publication Time'], inplace=True)

# Create target
df['Trending'] = (df['Views'] > df['Views'].median()).astype(int)

# Features
topic_cols = [col for col in df.columns if col.startswith('Topic_')]
lang_cols = [col for col in df.columns if col.startswith('Language_')]
df['Hour'] = df['Publication Time'].dt.hour

X = df[topic_cols + lang_cols + ['Duration', 'Hour']]
y = df['Trending']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print("✅ Baseline Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔍 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Baseline Logistic Regression Accuracy: 0.6317204301075269

🔍 Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.62      0.63       187
           1       0.63      0.64      0.63       185

    accuracy                           0.63       372
   macro avg       0.63      0.63      0.63       372
weighted avg       0.63      0.63      0.63       372


📊 Confusion Matrix:
 [[116  71]
 [ 66 119]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 2️⃣ Model 1 – Random Forest Classifier

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load dataset
df = pd.read_csv("processed_data.csv")

# Preprocessing
df['Views'] = pd.to_numeric(df['Views'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Publication Time'] = pd.to_datetime(df['Publication Time'], errors='coerce')
df.dropna(subset=['Views', 'Duration', 'Publication Time'], inplace=True)

# Create target
df['Trending'] = (df['Views'] > df['Views'].median()).astype(int)

# Features
topic_cols = [col for col in df.columns if col.startswith('Topic_')]
lang_cols = [col for col in df.columns if col.startswith('Language_')]
df['Hour'] = df['Publication Time'].dt.hour

X = df[topic_cols + lang_cols + ['Duration', 'Hour']]
y = df['Trending']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Prediction and evaluation
y_pred_rf = rf_model.predict(X_test)
print("✅ Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\n🔍 Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


✅ Random Forest Accuracy: 0.6290322580645161

🔍 Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.67      0.65       187
           1       0.64      0.58      0.61       185

    accuracy                           0.63       372
   macro avg       0.63      0.63      0.63       372
weighted avg       0.63      0.63      0.63       372


📊 Confusion Matrix:
 [[126  61]
 [ 77 108]]


## 3️⃣ Model 2 – Support Vector Machine (SVM)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("processed_data.csv")

# Preprocessing
df['Views'] = pd.to_numeric(df['Views'], errors='coerce')
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')
df['Publication Time'] = pd.to_datetime(df['Publication Time'], errors='coerce')
df.dropna(subset=['Views', 'Duration', 'Publication Time'], inplace=True)

# Create target
df['Trending'] = (df['Views'] > df['Views'].median()).astype(int)

# Features
topic_cols = [col for col in df.columns if col.startswith('Topic_')]
lang_cols = [col for col in df.columns if col.startswith('Language_')]
df['Hour'] = df['Publication Time'].dt.hour

X = df[topic_cols + lang_cols + ['Duration', 'Hour']]
y = df['Trending']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (important for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM model
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)

# Predict
y_pred_svm = svm_model.predict(X_test_scaled)

# Evaluation
print("✅ SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\n🔍 Classification Report:\n", classification_report(y_test, y_pred_svm))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


✅ SVM Accuracy: 0.6182795698924731

🔍 Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.64      0.63       187
           1       0.62      0.60      0.61       185

    accuracy                           0.62       372
   macro avg       0.62      0.62      0.62       372
weighted avg       0.62      0.62      0.62       372


📊 Confusion Matrix:
 [[119  68]
 [ 74 111]]
