In [None]:
# =======================================================
# 6. AI-Driven Trend Forecasting
# Goal: Predict the next viral trends on TikTok using ML
# =======================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Load Data ---
print("Loading data...")
df = pd.read_csv('data/processed/tiktok_processed.csv')

# Missing descriptions become empty strings so the column is uniformly str.
df['clean_description'] = df['clean_description'].fillna(value='').astype(str)

# Binary target: 1 when engagement per play is strictly above 0.20, else 0.
# The 0.20 cut-off is a tunable choice.
df['is_viral'] = df['engagement_rate_per_play'].gt(0.20).astype(int)

# --- Feature Engineering ---
# Column names grouped by how they are treated downstream.

# Numeric inputs.
numerical_features = [
    'create_hour',
    'viral_coefficient_norm',
    'hashtag_count',
    'description_length',
    'sentiment_polarity',
]

# Categorical inputs.
categorical_features = [
    'day_of_week',
    'is_weekend',
    'time_period',
    'author',
]

# Free-text column name.
text_feature = 'clean_description'

# --- Preprocessing Pipeline ---
# Two branches, concatenated in this order: standardized numeric columns,
# then dense one-hot columns. Categories not seen during fit are ignored
# at transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_features,
        ),
    ]
)

# --- Feature Selection with RFECV ---
# Put rows in chronological order before any splitting. Both the unshuffled
# hold-out split and TimeSeriesSplit treat row position as time, so an
# unsorted frame would let later posts end up in the training folds.
# The sort is stable (ties keep their existing order), so it is a no-op when
# the CSV is already chronological. `df` itself is not modified here.
df_chrono = df.sort_values('create_time', key=pd.to_datetime, kind='stable')

X = df_chrono[numerical_features + categorical_features]
y = df_chrono['is_viral']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False  # For time-based split: last 20% of rows held out
)

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
# End-to-end pipeline object; it is defined here but not used by the steps below.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('rf', rf)
])

# Recursive feature elimination, dropping one feature per step and scoring
# each candidate subset by ROC-AUC on forward-chaining time-series folds.
tscv = TimeSeriesSplit(n_splits=5)
selector = RFECV(
    estimator=rf,
    step=1,
    cv=tscv,
    scoring='roc_auc',
    n_jobs=-1
)
# The preprocessor is fitted on the training rows only.
X_train_processed = preprocessor.fit_transform(X_train)
selector.fit(X_train_processed, y_train)

# Rebuild column names in the preprocessor's output order (numeric columns
# first, then the one-hot columns) and keep those RFECV retained.
selected_features = np.array(numerical_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))[selector.support_]
print("Selected features:", selected_features)

# --- Model Training with Selected Features ---
# Push the held-out rows through the already-fitted preprocessor, then reduce
# both matrices to the columns kept by the selector.
X_test_processed = preprocessor.transform(X_test)
X_test_selected = selector.transform(X_test_processed)
X_train_selected = selector.transform(X_train_processed)

# Final forest trained on the reduced training matrix (fit returns the estimator).
rf_best = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train_selected, y_train)

# Column 1 of predict_proba is the second entry of rf_best.classes_.
y_proba = rf_best.predict_proba(X_test_selected)[:, 1]
y_pred = rf_best.predict(X_test_selected)

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))

# --- Timescale Prediction: Next 24h vs Next Week ---
# Parse the timestamps, then order the frame from earliest to latest post.
df['create_time'] = pd.to_datetime(df['create_time'])
df = df.sort_values(by='create_time', ascending=True)

def label_viral_next_period(df, period='1D', threshold=0.20):
    """Label each row by the engagement of posts in the following time window.

    For a row created at time ``t``, ``future_engagement`` is the mean
    ``engagement_rate_per_play`` of all rows whose ``create_time`` falls in
    ``(t, t + period]``. ``is_viral_next`` is 1 when that mean is strictly
    above ``threshold`` and 0 otherwise. Rows with no later post inside the
    window, or with a missing timestamp, get NaN and therefore label 0.

    Args:
        df: Frame with ``create_time`` (anything ``pd.to_datetime`` accepts)
            and numeric ``engagement_rate_per_play`` columns. It is not
            modified and does not need to be sorted.
        period: Look-ahead horizon, anything ``pd.Timedelta`` accepts
            (e.g. ``'1D'``, ``'7D'``).
        threshold: Cut-off applied to ``future_engagement``.

    Returns:
        A copy of ``df`` in the original row order with the two extra columns
        ``future_engagement`` and ``is_viral_next``.

    Raises:
        ValueError: If ``period`` cannot be parsed as a time delta.
    """
    out = df.copy()
    horizon = pd.Timedelta(period).to_timedelta64()

    times = pd.to_datetime(out['create_time']).to_numpy(dtype='datetime64[ns]')
    values = out['engagement_rate_per_play'].to_numpy(dtype=float)

    # Work on a chronologically sorted view of the rows that have a timestamp.
    # `order` maps sorted position back to original row position, so results
    # can be scattered back without reordering the caller's frame.
    timed_rows = np.flatnonzero(~np.isnat(times))
    order = timed_rows[np.argsort(times[timed_rows], kind='stable')]
    t_sorted = times[order]
    v_sorted = values[order]

    # Prefix sums give any window's sum and count in O(1); missing
    # engagement values contribute to neither.
    observed = ~np.isnan(v_sorted)
    prefix_sum = np.concatenate(([0.0], np.cumsum(np.where(observed, v_sorted, 0.0))))
    prefix_cnt = np.concatenate(([0], np.cumsum(observed)))

    # The window is (t, t + period]: strictly after the row's own timestamp,
    # so the row itself and same-instant posts are excluded.
    start = np.searchsorted(t_sorted, t_sorted, side='right')
    stop = np.searchsorted(t_sorted, t_sorted + horizon, side='right')
    window_cnt = prefix_cnt[stop] - prefix_cnt[start]
    window_sum = prefix_sum[stop] - prefix_sum[start]

    window_mean = np.full(len(order), np.nan)
    has_future = window_cnt > 0
    window_mean[has_future] = window_sum[has_future] / window_cnt[has_future]

    future = np.full(len(out), np.nan)
    future[order] = window_mean

    out['future_engagement'] = future
    # NaN > threshold is False, so rows without a future window are labelled 0.
    out['is_viral_next'] = (out['future_engagement'] > threshold).astype(int)
    return out

# Label the same frame for two horizons: next 24 hours ('1D') and next week ('7D').
df_24h, df_7d = (
    label_viral_next_period(df, period=horizon) for horizon in ('1D', '7D')
)

# The modeling steps above can be repeated on df_24h and df_7d, using
# 'is_viral_next' as the target.

print("AI-driven trend forecasting complete.")

Loading data...
