In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier  # <-- Correct import
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
# Load data
df = pd.read_csv("data/processed/tiktok_processed.csv")

# Ensure viral label exists
if 'is_viral_plays' not in df.columns:
    df['is_viral_plays'] = (df['plays'] > df['plays'].quantile(0.98)).astype(int)

# Drop columns not useful for modeling
drop_cols = [
    'video_id', 'create_time', 'description', 'hashtags',
    'plays', 'likes', 'comments', 'shares', 'fetch_time'
]
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

# Separate features and target
X = df.drop(columns=['is_viral_plays'])
y = df['is_viral_plays']

# One-hot encode categoricals
X = pd.get_dummies(X, drop_first=True)

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [4]:
# Define models (including Decision Tree)
models = [
    (LogisticRegression(max_iter=1000, class_weight='balanced'), "Logistic Regression"),
    (DecisionTreeClassifier(class_weight='balanced', random_state=42), "Decision Tree"),
    (RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42), "Random Forest"),
    (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), "XGBoost"),
    (SVC(kernel='rbf', probability=True, class_weight='balanced'), "SVM"),
    (MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42), "Neural Network")
]

# Model evaluation function with cross-validation
def evaluate_model(model, model_name):
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else y_pred

    # Stratified 5-fold cross-validation on training set
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_f1 = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='f1').mean()

    metrics = {
        "Model": model_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "ROC AUC": roc_auc_score(y_test, y_prob),
        "CV F1 (train)": cv_f1
    }
    return metrics



In [None]:
# Train and evaluate all models
results = []
for model, name in models:
    print(f"Training {name}...")
    result = evaluate_model(model, name)
    results.append(result)

# Display results
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1 Score", ascending=False)
print("\n📊 Model Performance Comparison:")
print(results_df)



Training Logistic Regression...
Training Decision Tree...
Training Random Forest...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training SVM...


In [None]:
# Plot performance
plt.figure(figsize=(12, 7))
sns.barplot(data=results_df.melt(id_vars="Model"), x="value", y="Model", hue="variable")
plt.title("Model Performance Metrics")
plt.xlabel("Score")
plt.ylabel("Model")
plt.tight_layout()
plt.show()

# Train and show feature importance for best tree-based model
best_tree_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
best_tree_model.fit(X_train_scaled, y_train)
importances = best_tree_model.feature_importances_
feat_names = X.columns
feat_imp_df = pd.DataFrame({'feature': feat_names, 'importance': importances})
feat_imp_df = feat_imp_df.sort_values(by='importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feat_imp_df.head(20), x='importance', y='feature', palette='viridis')
plt.title("Top 20 Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()



In [None]:
# Classification report and confusion matrix for best model
y_pred = best_tree_model.predict(X_test_scaled)
print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Viral', 'Viral'], yticklabels=['Not Viral', 'Viral'])
plt.title("Confusion Matrix (Random Forest)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()