In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# ==============================================================
#  Behaviour Simulation – Likes Prediction (80/20 split)
#  FIXED: NaNs → SimpleImputer before the model
# ==============================================================

import pandas as pd
import numpy as np
import os, joblib, warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from sklearn.base import BaseEstimator, TransformerMixin, clone

# -------------------------------
# 1. CONFIG
# -------------------------------
# Location of the training CSV and the folder that receives one pickled pipeline per model.
CSV_PATH = "behaviour_simulation_train.xlsx - Sheet1.csv"
CHECKPOINT_DIR = "model_checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# -------------------------------
# 2. LOAD DATA
# -------------------------------
print("Loading data...")
# Lines the CSV parser cannot handle are skipped rather than aborting the load.
df = pd.read_csv(CSV_PATH, on_bad_lines='skip')
print(f"Rows: {df.shape[0]:,} | Columns: {df.shape[1]}")
print("Columns:", df.columns.tolist())

# Keep only the columns this notebook selects, as an independent copy of the loaded frame.
used_columns = ['id','date','likes','content','username','media','inferred_company']
df = df.loc[:, used_columns].copy()

# -------------------------------
# 3. TARGET & SPLIT
# -------------------------------
# Rows without a like count cannot serve as training targets.
df = df.dropna(subset=['likes'])

# Features are the raw columns; the target is log(1 + likes).
feature_cols = ['date','content','username','media','inferred_company']
X = df[feature_cols]
y = np.log1p(df['likes'].astype(float))

# Quantile bins of the target (up to 10, duplicate edges dropped) drive a stratified
# 80/20 split so both parts cover the same range of like counts.
y_bins = pd.qcut(y, q=10, duplicates='drop')
split_parts = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y_bins,
)
X_train, X_test, y_train, y_test = split_parts
print(f"Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,}")

# -------------------------------
# 4. CUSTOM TRANSFORMERS
# -------------------------------
class DateFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that expands the ``date`` column into calendar features.

    Values that cannot be parsed as datetimes are coerced to ``NaT``.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a frame with year, month, day, hour, weekday, is_weekend and tod."""
        stamps = pd.to_datetime(X['date'], errors='coerce')
        parts = stamps.dt
        hour = parts.hour
        weekday = parts.weekday

        # Hour buckets are right-closed: (-1, 6] night, (6, 12] morning,
        # (12, 18] afternoon, (18, 24] evening.
        time_of_day = pd.cut(
            hour,
            bins=[-1,6,12,18,24],
            labels=['night','morning','afternoon','evening'],
        )

        features = pd.DataFrame({
            'year'      : parts.year,
            'month'     : parts.month,
            'day'       : parts.day,
            'hour'      : hour,
            'weekday'   : weekday,
            'is_weekend': weekday.isin([5,6]).astype(int),
            'tod'       : time_of_day,
        })
        return features

class TextFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives text statistics from the ``content`` column.

    Missing content is treated as an empty string for every feature, so all
    counts are 0 (not NaN) for rows without text.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the cleaned text plus length, token and marker counts per row."""
        # Fill once and reuse: counting on the un-filled column would yield NaN for
        # missing content while the length/word features report 0.
        raw = X['content'].fillna('')
        # Placeholder tokens are blanked out of the text used for TF-IDF and lengths.
        txt = raw.str.replace(r'<mention>|<hyperlink>', ' ', regex=True)
        return pd.DataFrame({
            'clean_text' : txt,
            'txt_len'    : txt.str.len(),
            'word_cnt'   : txt.str.split().str.len(),
            # Marker counts use the raw text because the placeholders are removed from `txt`.
            'n_mentions' : raw.str.count('<mention>'),
            'n_links'    : raw.str.count('<hyperlink>'),
            'n_hashtags' : raw.str.count(r'#\w+'),
            # Flags any non-ASCII character (a superset of emoji).
            'has_emoji'  : txt.str.contains(r'[^\x00-\x7F]', regex=True).astype(int),
            # Number of whitespace-separated tokens written entirely in upper case.
            'shout_words': txt.apply(lambda s: sum(w.isupper() for w in s.split()))
        })

class MediaFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that summarises the ``media`` column.

    Produces a media type label, a count of media markers and a thumbnail flag.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return media_type, media_cnt and has_thumb for each row."""
        media = X['media'].fillna('')

        # Later entries override earlier ones, so the effective precedence is
        # gif > video > photo > none.
        labels = np.full(len(media), 'none', dtype=object)
        for token, label in (('Photo', 'photo'), ('Video', 'video'), ('Gif', 'gif')):
            labels = np.where(media.str.contains(token), label, labels)

        marker_count = media.str.count(r'Photo|Video|Gif')
        thumb_flag = media.str.contains('thumbnailUrl').astype(int)

        return pd.DataFrame({
            'media_type': labels,
            'media_cnt' : marker_count,
            'has_thumb' : thumb_flag,
        })

class ToDataFrame(BaseEstimator, TransformerMixin):
    """Wrap a 2-D array-like in a DataFrame with the given column names."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame, reusing its index when it has one."""
        if hasattr(X, 'index'):
            row_labels = X.index
        else:
            # Inputs without an index get positional row labels.
            row_labels = pd.RangeIndex(X.shape[0])
        return pd.DataFrame(X, columns=self.columns, index=row_labels)

class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Replace missing values in the chosen columns and cast those columns to ``str``."""

    def __init__(self, columns, fill_value='missing'):
        self.columns = columns
        self.fill_value = fill_value

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose listed columns are filled and stringified."""
        filled = X.copy()  # leave the caller's frame untouched
        for col in self.columns:
            column = filled[col]
            filled[col] = column.fillna(self.fill_value).astype(str)
        return filled

# -------------------------------
# 5. FEATURE PIPELINE
# -------------------------------
# Stage 1: each custom transformer is given only its own source column; every other
# input column is passed through unchanged.
raw_features = ColumnTransformer(
    transformers=[
        ('date',  DateFeatures(),  ['date']),
        ('text',  TextFeatures(),  ['content']),
        ('media', MediaFeatures(), ['media']),
    ],
    remainder='passthrough',
)

# Names for the stage-1 output columns, grouped by the transformer that emits them.
# NOTE(review): this relies on the stage-1 output order being date, text, media,
# then the passthrough columns -- confirm if the transformers above are reordered.
date_cols        = ['year','month','day','hour','weekday','is_weekend','tod']
text_cols        = ['clean_text','txt_len','word_cnt','n_mentions','n_links',
                    'n_hashtags','has_emoji','shout_words']
media_cols       = ['media_type','media_cnt','has_thumb']
passthrough_cols = ['username','inferred_company']
raw_cols = date_cols + text_cols + media_cols + passthrough_cols

# Stage 2: one-hot the categoricals, standardise the numerics, TF-IDF the cleaned text.
categorical_cols = ['username','inferred_company','media_type','tod']
numeric_cols = [
    'txt_len','word_cnt','n_mentions','n_links','n_hashtags',
    'has_emoji','shout_words','media_cnt','has_thumb',
    'year','month','day','hour','weekday','is_weekend',
]
text_vectorizer = TfidfVectorizer(
    max_features=800,
    stop_words='english',
    ngram_range=(1,2),
)

final_preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_cols),
        ('num', StandardScaler(), numeric_cols),
        # The column is given as a plain string (not a list) so the vectorizer
        # receives a 1-D sequence of documents.
        ('tfidf', text_vectorizer, 'clean_text'),
    ],
    remainder='drop',
)

# -------------------------------
# 6. MODELS
# -------------------------------
# Candidate regressors keyed by display name; the same name is used for the
# checkpoint file and the results table. Dict order is the training order.
models = {
    'Ridge'            : Ridge(alpha=1.0),
    'Lasso'            : Lasso(alpha=1.0, max_iter=3000),
    'ElasticNet'       : ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=3000),
    'DecisionTree'     : DecisionTreeRegressor(max_depth=12, random_state=42),
    'ExtraTrees'       : ExtraTreesRegressor(n_estimators=200, max_depth=12,
                                            random_state=42, n_jobs=-1),
    'RandomForest'     : RandomForestRegressor(n_estimators=300, max_depth=12,
                                              random_state=42, n_jobs=-1),
    'GradientBoosting' : GradientBoostingRegressor(
                            n_estimators=400, learning_rate=0.05,
                            max_depth=6, subsample=0.8, random_state=42),
    'XGBoost'          : xgb.XGBRegressor(
                            n_estimators=500, learning_rate=0.05,
                            max_depth=7, subsample=0.8, colsample_bytree=0.8,
                            objective='reg:squarederror', random_state=42, n_jobs=-1),
    # LightGBM only applies `subsample` when `subsample_freq` is positive; with the
    # default of 0 the 0.8 row-sampling setting is silently ignored.
    'LightGBM'         : lgb.LGBMRegressor(
                            n_estimators=500, learning_rate=0.05,
                            max_depth=7, subsample=0.8, subsample_freq=1,
                            colsample_bytree=0.8,
                            random_state=42, n_jobs=-1, verbose=-1),
    'CatBoost'         : cb.CatBoostRegressor(
                            iterations=500, learning_rate=0.05,
                            depth=7, subsample=0.8,
                            random_seed=42, verbose=False, thread_count=-1)
}

# -------------------------------
# 7. TRAIN / SAVE / EVALUATE
# -------------------------------
def evaluate(y_true, y_pred):
    """Score predictions on the original like-count scale.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted targets in ``log1p`` space.

    Returns
    -------
    dict
        ``{'MAE': ..., 'MAPE': ..., 'R2': ...}`` computed after inverting the
        ``log1p`` transform with ``expm1``.
    """
    y_true_o = np.expm1(y_true)
    # A regressor may predict below 0 in log space, which expm1 maps to a negative
    # like count; clip to 0 so metrics are computed on feasible values.
    y_pred_o = np.clip(np.expm1(y_pred), 0, None)
    # NOTE(review): rows with 0 true likes can dominate MAPE -- confirm this metric
    # is meaningful for the data before relying on it for ranking.
    return {
        'MAE'  : mean_absolute_error(y_true_o, y_pred_o),
        'MAPE' : mean_absolute_percentage_error(y_true_o, y_pred_o),
        'R2'   : r2_score(y_true_o, y_pred_o)
    }

results = []
cat_cols_to_impute = ['username','inferred_company','media_type','tod']

for name, model in models.items():
    print(f"\nTraining {name} ...")
    # Pipeline fits its steps in place without copying them, so the shared
    # preprocessors are cloned to give every model its own independent instances.
    pipe = Pipeline([
        ('raw',          clone(raw_features)),
        ('to_df',        ToDataFrame(columns=raw_cols)),
        ('cat_imputer',  CategoricalImputer(columns=cat_cols_to_impute)),
        ('final_prep',   clone(final_preprocess)),
        # Zero-fill any NaNs that survive preprocessing (e.g. date parts of
        # unparseable dates) so every estimator receives finite input.
        ('imputer',      SimpleImputer(strategy='constant', fill_value=0)),
        ('model',        model)
    ])
    pipe.fit(X_train, y_train)

    # Persist the whole fitted pipeline so it can be reloaded and applied to raw rows.
    ckpt = os.path.join(CHECKPOINT_DIR, f"{name}_likes.pkl")
    joblib.dump(pipe, ckpt)
    print(f"Checkpoint → {ckpt}")

    y_pred = pipe.predict(X_test)
    mets = evaluate(y_test, y_pred)

    # One row per model: the display name followed by MAE, MAPE and R2.
    results.append({'Model': name, **mets})

# -------------------------------
# 8. COMPARISON TABLE
# -------------------------------
# Build the leaderboard in one chain: round for display, then rank by MAE (best first).
res_df = (
    pd.DataFrame(results)
      .round({'MAE':1, 'MAPE':4, 'R2':4})
      .sort_values('MAE')
)

banner = "="*80
print("\n" + banner)
print(" TEST SET PERFORMANCE (80/20 split) – NaNs fixed")
print(banner)
print(res_df.to_string(index=False))
print(banner)

# Persist the same table next to the notebook.
res_df.to_csv("model_comparison_test.csv", index=False)
print("CSV saved → model_comparison_test.csv")

Loading data...
Rows: 17,331 | Columns: 7
Columns: ['id', 'date', 'likes', 'content', 'username', 'media', 'inferred_company']
Train: 13,864 | Test: 3,467

Training Ridge ...
Checkpoint → model_checkpoints/Ridge_likes.pkl

Training Lasso ...
Checkpoint → model_checkpoints/Lasso_likes.pkl

Training ElasticNet ...
Checkpoint → model_checkpoints/ElasticNet_likes.pkl

Training DecisionTree ...
Checkpoint → model_checkpoints/DecisionTree_likes.pkl

Training ExtraTrees ...
Checkpoint → model_checkpoints/ExtraTrees_likes.pkl

Training RandomForest ...
Checkpoint → model_checkpoints/RandomForest_likes.pkl

Training GradientBoosting ...
Checkpoint → model_checkpoints/GradientBoosting_likes.pkl

Training XGBoost ...
Checkpoint → model_checkpoints/XGBoost_likes.pkl

Training LightGBM ...
Checkpoint → model_checkpoints/LightGBM_likes.pkl

Training CatBoost ...
Checkpoint → model_checkpoints/CatBoost_likes.pkl

 TEST SET PERFORMANCE (80/20 split) – NaNs fixed
           Model   MAE         MAPE    