In [1]:
# """
# train_severity_pipeline.py

# End-to-end pipeline:
# - Load CSV (expects columns: 'processed_text' and 'severity_score')
# - Preprocess (lowercase, remove URLs/HTML, remove punctuation/numbers, KEEP stopwords)
# - Extract Sentence-BERT embeddings (all-MiniLM-L6-v2)
# - Extract psycholinguistic features using Empath (open alternative to LIWC)
# - Concatenate features, train LightGBM regressor
# - Evaluate and save models/artifacts
# - Provide infer_severity(text) function for single-text inference

# Requirements (install before running):
# pip install pandas numpy tqdm sentence-transformers empath lightgbm scikit-learn joblib nltk
# """

# import os
# import re
# import json
# import argparse
# from tqdm import tqdm
# import numpy as np
# import pandas as pd
# import joblib
# import logging

# # NLP libs
# import nltk
# from nltk.tokenize import word_tokenize
# nltk.download('punkt', quiet=True)

# # Sentence Embeddings
# from sentence_transformers import SentenceTransformer

# # Empath (psycholinguistic features)
# from empath import Empath

# # ML
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# from sklearn.preprocessing import StandardScaler
# import lightgbm as lgb

# # Setup logging
# logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
# logger = logging.getLogger(__name__)

# # ---------------------------
# # Config / Paths (edit below)
# # ---------------------------
# INPUT_CSV = "reddit_with_post_severity.csv"    # expects columns: processed_text, severity_score
# OUTPUT_DIR = "models_pipeline"
# EMBED_CACHE = os.path.join(OUTPUT_DIR, "embeddings.npy")
# EMPATH_CACHE = os.path.join(OUTPUT_DIR, "empath_feats.npy")
# SYMBOLS_CACHE = os.path.join(OUTPUT_DIR, "empath_cols.json")
# MODEL_PATH = os.path.join(OUTPUT_DIR, "lightgbm_severity.pkl")
# SCALER_PATH = os.path.join(OUTPUT_DIR, "feature_scaler.pkl")
# EMBEDDER_PATH = os.path.join(OUTPUT_DIR, "embedder_all-MiniLM.pkl")  # optionally save embedder
# EMPATHOR_PATH = os.path.join(OUTPUT_DIR, "empath_lexicon.pkl")
# SEED = 42

# # ---------------------------
# # Preprocessing
# # ---------------------------
# def preprocess_keep_stopwords(text):
#     """
#     Lowercase, remove URLs and HTML, remove punctuation/numbers,
#     tokenize (keeps stopwords).
#     Returns cleaned string (tokens joined by spaces).
#     """
#     if not isinstance(text, str):
#         text = str(text) if text is not None else ""
#     text = text.lower()
#     # remove URLs
#     text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
#     # remove HTML tags
#     text = re.sub(r'<[^>]+>', ' ', text)
#     # remove characters that are not letters or spaces (keep spaces)
#     # we intentionally remove numbers/punctuation
#     text = re.sub(r'[^a-z\s]', ' ', text)
#     # collapse whitespace
#     text = re.sub(r'\s+', ' ', text).strip()
#     # tokenize to preserve consistent tokenization (but we return joined string)
#     tokens = word_tokenize(text)
#     return " ".join(tokens)

# # ---------------------------
# # Empath features
# # ---------------------------
# def build_empath_features(texts, empath_analyzer=None, cache_path=None):
#     """
#     texts: list of strings
#     empath_analyzer: Empath() instance (if None, it will be created)
#     Returns:
#         - features_array: np.ndarray shape (n_texts, n_features)
#         - columns: list of feature names
#     Caches results to disk if cache_path provided.
#     """
#     if empath_analyzer is None:
#         empath_analyzer = Empath()
#     if cache_path and os.path.exists(cache_path):
#         logger.info("Loading empath features from cache: %s", cache_path)
#         feats = np.load(cache_path, allow_pickle=True)
#         columns = json.load(open(SYMBOLS_CACHE, 'r'))
#         return feats, columns

#     logger.info("Computing Empath features for %d texts...", len(texts))
#     rows = []
#     # Empath returns a dict of categories → counts or normalized values (we ask normalize=True)
#     for t in tqdm(texts, desc="Empath features"):
#         try:
#             d = empath_analyzer.analyze(t, normalize=True)
#         except Exception:
#             # fallback to zeros for safety
#             d = {}
#         rows.append(d)
#     df = pd.DataFrame(rows).fillna(0.0)
#     columns = list(df.columns)
#     feats = df.values.astype(float)

#     if cache_path:
#         os.makedirs(os.path.dirname(cache_path), exist_ok=True)
#         np.save(cache_path, feats)
#         json.dump(columns, open(SYMBOLS_CACHE, 'w'))
#         logger.info("Saved Empath cache -> %s and columns -> %s", cache_path, SYMBOLS_CACHE)

#     return feats, columns

# # ---------------------------
# # Embeddings extraction
# # ---------------------------
# def compute_or_load_embeddings(texts, model_name="all-MiniLM-L6-v2", cache_path=None, device=None):
#     """
#     Returns numpy array of embeddings (n_texts, dim).
#     If cache exists, loads it.
#     """
#     if cache_path and os.path.exists(cache_path):
#         logger.info("Loading embeddings from cache: %s", cache_path)
#         return np.load(cache_path)

#     logger.info("Loading SentenceTransformer model: %s", model_name)
#     embedder = SentenceTransformer(model_name, device=device)
#     # batch encode
#     logger.info("Encoding %d texts with embedder...", len(texts))
#     embeddings = embedder.encode(texts, batch_size=64, show_progress_bar=True)
#     embeddings = np.array(embeddings, dtype=np.float32)

#     if cache_path:
#         os.makedirs(os.path.dirname(cache_path), exist_ok=True)
#         np.save(cache_path, embeddings)
#         try:
#             joblib.dump(embedder, EMBEDDER_PATH)
#         except Exception:
#             pass
#         logger.info("Saved embeddings cache -> %s", cache_path)

#     return embeddings

# # ---------------------------
# # Combine and train
# # ---------------------------
# def train_model(X, y, output_model_path=MODEL_PATH, output_scaler_path=SCALER_PATH, test_size=0.2):
#     """
#     Trains a LightGBM regressor on X,y. Saves model and scaler.
#     Returns trained model and scaler and (X_test,y_test,y_pred).
#     """
#     logger.info("Splitting data (test_size=%s)...", test_size)
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=SEED)

#     logger.info("Scaling features (StandardScaler)...")
#     scaler = StandardScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)

#     logger.info("Training LightGBM regressor...")
#     params = {
#         'objective': 'regression',
#         'metric': 'rmse',
#         'verbosity': -1,
#         'random_state': SEED
#     }

#     # Use sklearn API for simpler save/load
#     model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=31, random_state=SEED)
#     model.fit(
#         X_train_scaled, y_train,
#         eval_set=[(X_test_scaled, y_test)],
#         eval_metric='rmse',
#         early_stopping_rounds=50,
#         verbose=50
#     )

#     logger.info("Predicting on test set...")
#     y_pred = model.predict(X_test_scaled)

#     logger.info("Saving model and scaler...")
#     os.makedirs(os.path.dirname(output_model_path), exist_ok=True)
#     joblib.dump(model, output_model_path)
#     joblib.dump(scaler, output_scaler_path)
#     logger.info("Model saved to %s", output_model_path)
#     logger.info("Scaler saved to %s", output_scaler_path)

#     return model, scaler, (X_test, y_test, y_pred)

# # ---------------------------
# # Evaluation utilities
# # ---------------------------
# def evaluate_preds(y_true, y_pred):
#     mse = mean_squared_error(y_true, y_pred)
#     rmse = np.sqrt(mse)
#     mae = mean_absolute_error(y_true, y_pred)
#     r2 = r2_score(y_true, y_pred)
#     logger.info("Evaluation -> RMSE: %.4f | MAE: %.4f | R2: %.4f", rmse, mae, r2)
#     return {'rmse': rmse, 'mae': mae, 'r2': r2}

# # ---------------------------
# # Inference
# # ---------------------------
# def load_artifacts(model_path=MODEL_PATH, scaler_path=SCALER_PATH, embed_cache=EMBED_CACHE, empath_cols_path=SYMBOLS_CACHE):
#     """Load trained model and scaler. Also load embedder from disk if available."""
#     model = joblib.load(model_path)
#     scaler = joblib.load(scaler_path)
#     # load columns for empath space
#     if os.path.exists(empath_cols_path):
#         empath_cols = json.load(open(empath_cols_path, 'r'))
#     else:
#         empath_cols = None
#     embedder = None
#     try:
#         if os.path.exists(EMBEDDER_PATH):
#             embedder = joblib.load(EMBEDDER_PATH)
#     except Exception:
#         embedder = None
#     return model, scaler, embedder, empath_cols

# def infer_severity(text, model, scaler, embedder, empath_cols, empath_analyzer=None):
#     """Single-text inference. Returns severity score (0-1)."""
#     # preprocess
#     txt = preprocess_keep_stopwords(text)
#     # embed
#     if embedder is None:
#         embedder = SentenceTransformer('all-MiniLM-L6-v2')  # fallback
#     emb = embedder.encode([txt])
#     emb = np.array(emb, dtype=np.float32)
#     # empath features
#     if empath_analyzer is None:
#         empath_analyzer = Empath()
#     empath_dict = empath_analyzer.analyze(txt, normalize=True)
#     # ensure columns order
#     if empath_cols is None:
#         empath_cols = sorted(list(empath_dict.keys()))
#     empath_row = [empath_dict.get(c, 0.0) for c in empath_cols]
#     empath_row = np.array(empath_row, dtype=np.float32).reshape(1, -1)
#     # combine
#     X_new = np.hstack([emb, empath_row])
#     # scale
#     X_new_scaled = scaler.transform(X_new)
#     # predict
#     pred = model.predict(X_new_scaled)[0]
#     return float(max(0.0, min(1.0, pred)))

# # ---------------------------
# # Main execution
# # ---------------------------
# def main(input_csv=INPUT_CSV, overwrite_cache=False, test_size=0.2, device=None):
#     logger.info("Loading input CSV: %s", input_csv)
#     df = pd.read_csv(input_csv)

#     # Expect columns
#     if 'processed_text' not in df.columns or 'severity_score' not in df.columns:
#         raise ValueError("Input CSV must contain 'processed_text' and 'severity_score' columns")

#     # Preprocess (ensure stopwords are kept)
#     logger.info("Preprocessing texts (keep stopwords)...")
#     df['processed_text_clean'] = df['processed_text'].fillna("").astype(str).apply(preprocess_keep_stopwords)

#     texts = df['processed_text_clean'].tolist()
#     y = df['severity_score'].astype(float).values

#     # embeddings
#     if not overwrite_cache and os.path.exists(EMBED_CACHE):
#         embeddings = np.load(EMBED_CACHE)
#     else:
#         embeddings = compute_or_load_embeddings(texts, cache_path=EMBED_CACHE, device=device)

#     # empath features
#     if not overwrite_cache and os.path.exists(EMPATH_CACHE) and os.path.exists(SYMBOLS_CACHE):
#         empath_feats = np.load(EMPATH_CACHE, allow_pickle=True)
#         empath_cols = json.load(open(SYMBOLS_CACHE, 'r'))
#     else:
#         empath_analyzer = Empath()
#         empath_feats, empath_cols = build_empath_features(texts, empath_analyzer=empath_analyzer, cache_path=EMPATH_CACHE)

#     # Combine features (align dims)
#     logger.info("Combining embeddings (%s) + empath features (%s)...", embeddings.shape, empath_feats.shape)
#     X = np.hstack([embeddings, empath_feats])

#     # Train model
#     model, scaler, test_triplet = train_model(X, y, output_model_path=MODEL_PATH, output_scaler_path=SCALER_PATH, test_size=test_size)
#     X_test, y_test, y_pred = test_triplet

#     # Evaluate
#     eval_stats = evaluate_preds(y_test, y_pred)

#     # Save additional artifacts (empath columns)
#     os.makedirs(OUTPUT_DIR, exist_ok=True)
#     json.dump({'empath_cols': empath_cols, 'embed_dim': embeddings.shape[1]}, open(os.path.join(OUTPUT_DIR, "meta.json"), 'w'))

#     logger.info("Pipeline finished. Model, scaler and artifacts saved to %s", OUTPUT_DIR)
#     return model, scaler, embeddings, empath_feats, empath_cols, eval_stats

# # ---------------------------
# # If run as script
# # ---------------------------
# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Train severity model: embeddings + empath + LightGBM")
#     parser.add_argument("--input", type=str, default=INPUT_CSV, help="Input CSV path (must contain processed_text and severity_score)")
#     parser.add_argument("--outdir", type=str, default=OUTPUT_DIR, help="Output folder for models and caches")
#     parser.add_argument("--overwrite", action="store_true", help="Overwrite caches (recompute embeddings/empath)")
#     parser.add_argument("--test_size", type=float, default=0.2, help="Test split fraction")
#     parser.add_argument("--device", type=str, default=None, help="Device for sentence-transformers (e.g. 'cuda' or 'cpu')")
#     args = parser.parse_args()

#     # update global paths based on outdir
#     OUTPUT_DIR = args.outdir
#     EMBED_CACHE = os.path.join(OUTPUT_DIR, "embeddings.npy")
#     EMPATH_CACHE = os.path.join(OUTPUT_DIR, "empath_feats.npy")
#     SYMBOLS_CACHE = os.path.join(OUTPUT_DIR, "empath_cols.json")
#     MODEL_PATH = os.path.join(OUTPUT_DIR, "lightgbm_severity.pkl")
#     SCALER_PATH = os.path.join(OUTPUT_DIR, "feature_scaler.pkl")
#     EMBEDDER_PATH = os.path.join(OUTPUT_DIR, "embedder_all-MiniLM.pkl")
#     os.makedirs(OUTPUT_DIR, exist_ok=True)

#     main(input_csv=args.input, overwrite_cache=args.overwrite, test_size=args.test_size, device=args.device)


In [14]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# =====================
# 1. Load your dataset
# =====================
def load_data(file_path):
    df = pd.read_csv(file_path)
    if 'processed_text' not in df.columns:
        raise ValueError("CSV must contain a 'processed_text' column")
    return df

# =====================
# 2. BERT/RoBERTa Sentiment & Emotion
# =====================
def get_bert_features(df):
    sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
    emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)

    sentiments = []
    emotions = []
    
    for text in df['processed_text']:
        # Sentiment
        sent_res = sentiment_analyzer(text[:512])[0]  # limit to 512 tokens
        sentiment_score = sent_res['score'] if sent_res['label'] == 'POSITIVE' else -sent_res['score']
        sentiments.append(sentiment_score)

        # Emotion
        emo_res = emotion_classifier(text[:512])
        emo_dict = {e['label']: e['score'] for e in emo_res[0]}
        emotions.append(emo_dict)

    df['bert_sentiment'] = sentiments
    emotion_df = pd.DataFrame(emotions).fillna(0)
    df = pd.concat([df, emotion_df], axis=1)
    return df

# =====================
# 3. LIWC & DLATK Features (placeholder)
# =====================
def get_psycholinguistic_features(df):
    # Placeholder: Replace with LIWC/DLATK output merge
    df['liwc_self_ref'] = np.random.rand(len(df))
    df['liwc_affect'] = np.random.rand(len(df))
    df['dlatk_cognition'] = np.random.rand(len(df))
    return df

# =====================
# 4. Train Ridge Regression Model
# =====================
def train_model(df):
    X_text = df['processed_text']
    y = df['severity_score']

    tfidf = TfidfVectorizer(max_features=5000)
    X_tfidf = tfidf.fit_transform(X_text)

    X_train, X_test, y_train, y_test = train_test_split(
        X_tfidf, y, test_size=0.2, random_state=42
    )

    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"Test R²: {r2_score(y_test, y_pred):.4f}")

    return model, tfidf

# =====================
# 5. Predict severity for new text
# =====================
def predict_severity(text, model, tfidf):
    text_vec = tfidf.transform([text])
    score = model.predict(text_vec)[0]
    return max(0.0, min(1.0, score))

# =====================
# Run everything in Jupyter
# =====================
if __name__ == "__main__":
    df = load_data("output/reddit_with_post_severity.csv")
    
    # Only run BERT & emotion mapping if not already in CSV
    df = get_bert_features(df)
    df = get_psycholinguistic_features(df)

    model, tfidf = train_model(df)

    # Example prediction
    sample_text = "I feel so hopeless and alone."
    print(f"Predicted severity: {predict_severity(sample_text, model, tfidf):.3f}")


Device set to use cpu
Device set to use cpu


KeyboardInterrupt: 

In [3]:
"""
train_severity_pipeline.py

End-to-end pipeline:
- Load CSV (expects columns: 'processed_text' and 'severity_score')
- Preprocess (lowercase, remove URLs/HTML, remove punctuation/numbers, KEEP stopwords)
- Extract Sentence-BERT embeddings (all-MiniLM-L6-v2)
- Extract psycholinguistic features using Empath (open alternative to LIWC)
- Concatenate features, train LightGBM regressor
- Evaluate and save models/artifacts
- Provide infer_severity(text) function for single-text inference

Requirements (install before running):
pip install pandas numpy tqdm sentence-transformers empath lightgbm scikit-learn joblib nltk
"""

import os
import re
import json
import argparse
from tqdm import tqdm
import numpy as np
import pandas as pd
import joblib
import logging

# NLP libs
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt', quiet=True)

# Sentence Embeddings
from sentence_transformers import SentenceTransformer

# Empath (psycholinguistic features)
from empath import Empath

# ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger(__name__)

# ---------------------------
# Config / Paths (edit below)
# ---------------------------
INPUT_CSV = "reddit_with_post_severity.csv"    # expects columns: processed_text, severity_score
OUTPUT_DIR = "models_pipeline"
EMBED_CACHE = os.path.join(OUTPUT_DIR, "embeddings.npy")
EMPATH_CACHE = os.path.join(OUTPUT_DIR, "empath_feats.npy")
SYMBOLS_CACHE = os.path.join(OUTPUT_DIR, "empath_cols.json")
MODEL_PATH = os.path.join(OUTPUT_DIR, "lightgbm_severity.pkl")
SCALER_PATH = os.path.join(OUTPUT_DIR, "feature_scaler.pkl")
EMBEDDER_PATH = os.path.join(OUTPUT_DIR, "embedder_all-MiniLM.pkl")  # optionally save embedder
EMPATHOR_PATH = os.path.join(OUTPUT_DIR, "empath_lexicon.pkl")
SEED = 42

In [4]:
# ---------------------------
# Preprocessing
# ---------------------------
def preprocess_keep_stopwords(text):
    """
    Lowercase, remove URLs and HTML, remove punctuation/numbers,
    tokenize (keeps stopwords).
    Returns cleaned string (tokens joined by spaces).
    """
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    text = text.lower()
    # remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # remove characters that are not letters or spaces (keep spaces)
    # we intentionally remove numbers/punctuation
    text = re.sub(r'[^a-z\s]', ' ', text)
    # collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # tokenize to preserve consistent tokenization (but we return joined string)
    tokens = word_tokenize(text)
    return " ".join(tokens)

In [5]:
# ---------------------------
# Empath features
# ---------------------------
def build_empath_features(texts, empath_analyzer=None, cache_path=None):
    """
    texts: list of strings
    empath_analyzer: Empath() instance (if None, it will be created)
    Returns:
        - features_array: np.ndarray shape (n_texts, n_features)
        - columns: list of feature names
    Caches results to disk if cache_path provided.
    """
    if empath_analyzer is None:
        empath_analyzer = Empath()
    if cache_path and os.path.exists(cache_path):
        logger.info("Loading empath features from cache: %s", cache_path)
        feats = np.load(cache_path, allow_pickle=True)
        columns = json.load(open(SYMBOLS_CACHE, 'r'))
        return feats, columns

    logger.info("Computing Empath features for %d texts...", len(texts))
    rows = []
    # Empath returns a dict of categories → counts or normalized values (we ask normalize=True)
    for t in tqdm(texts, desc="Empath features"):
        try:
            d = empath_analyzer.analyze(t, normalize=True)
        except Exception:
            # fallback to zeros for safety
            d = {}
        rows.append(d)
    df = pd.DataFrame(rows).fillna(0.0)
    columns = list(df.columns)
    feats = df.values.astype(float)

    if cache_path:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        np.save(cache_path, feats)
        json.dump(columns, open(SYMBOLS_CACHE, 'w'))
        logger.info("Saved Empath cache -> %s and columns -> %s", cache_path, SYMBOLS_CACHE)

    return feats, columns

In [6]:
# ---------------------------
# Embeddings extraction
# ---------------------------
def compute_or_load_embeddings(texts, model_name="all-MiniLM-L6-v2", cache_path=None, device=None):
    """
    Returns numpy array of embeddings (n_texts, dim).
    If cache exists, loads it.
    """
    if cache_path and os.path.exists(cache_path):
        logger.info("Loading embeddings from cache: %s", cache_path)
        return np.load(cache_path)

    logger.info("Loading SentenceTransformer model: %s", model_name)
    embedder = SentenceTransformer(model_name, device=device)
    # batch encode
    logger.info("Encoding %d texts with embedder...", len(texts))
    embeddings = embedder.encode(texts, batch_size=64, show_progress_bar=True)
    embeddings = np.array(embeddings, dtype=np.float32)

    if cache_path:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        np.save(cache_path, embeddings)
        try:
            joblib.dump(embedder, EMBEDDER_PATH)
        except Exception:
            pass
        logger.info("Saved embeddings cache -> %s", cache_path)

    return embeddings

In [7]:
# ---------------------------
# Combine and train
# ---------------------------
def train_model(X, y, output_model_path=MODEL_PATH, output_scaler_path=SCALER_PATH, test_size=0.2):
    """
    Trains a LightGBM regressor on X,y. Saves model and scaler.
    Returns trained model and scaler and (X_test,y_test,y_pred).
    """
    logger.info("Splitting data (test_size=%s)...", test_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=SEED)

    logger.info("Scaling features (StandardScaler)...")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    logger.info("Training LightGBM regressor...")
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'random_state': SEED
    }

    # Use sklearn API for simpler save/load
    model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, num_leaves=31, random_state=SEED)
    model.fit(
        X_train_scaled, y_train,
        eval_set=[(X_test_scaled, y_test)],
        eval_metric='rmse',
        early_stopping_rounds=50,
        verbose=50
    )

    logger.info("Predicting on test set...")
    y_pred = model.predict(X_test_scaled)

    logger.info("Saving model and scaler...")
    os.makedirs(os.path.dirname(output_model_path), exist_ok=True)
    joblib.dump(model, output_model_path)
    joblib.dump(scaler, output_scaler_path)
    logger.info("Model saved to %s", output_model_path)
    logger.info("Scaler saved to %s", output_scaler_path)

    return model, scaler, (X_test, y_test, y_pred)


In [8]:
# ---------------------------
# Evaluation utilities
# ---------------------------
def evaluate_preds(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    logger.info("Evaluation -> RMSE: %.4f | MAE: %.4f | R2: %.4f", rmse, mae, r2)
    return {'rmse': rmse, 'mae': mae, 'r2': r2}


In [9]:
# ---------------------------
# Inference
# ---------------------------
def load_artifacts(model_path=MODEL_PATH, scaler_path=SCALER_PATH, embed_cache=EMBED_CACHE, empath_cols_path=SYMBOLS_CACHE):
    """Load trained model and scaler. Also load embedder from disk if available."""
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    # load columns for empath space
    if os.path.exists(empath_cols_path):
        empath_cols = json.load(open(empath_cols_path, 'r'))
    else:
        empath_cols = None
    embedder = None
    try:
        if os.path.exists(EMBEDDER_PATH):
            embedder = joblib.load(EMBEDDER_PATH)
    except Exception:
        embedder = None
    return model, scaler, embedder, empath_cols

def infer_severity(text, model, scaler, embedder, empath_cols, empath_analyzer=None):
    """Single-text inference. Returns severity score (0-1)."""
    # preprocess
    txt = preprocess_keep_stopwords(text)
    # embed
    if embedder is None:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')  # fallback
    emb = embedder.encode([txt])
    emb = np.array(emb, dtype=np.float32)
    # empath features
    if empath_analyzer is None:
        empath_analyzer = Empath()
    empath_dict = empath_analyzer.analyze(txt, normalize=True)
    # ensure columns order
    if empath_cols is None:
        empath_cols = sorted(list(empath_dict.keys()))
    empath_row = [empath_dict.get(c, 0.0) for c in empath_cols]
    empath_row = np.array(empath_row, dtype=np.float32).reshape(1, -1)
    # combine
    X_new = np.hstack([emb, empath_row])
    # scale
    X_new_scaled = scaler.transform(X_new)
    # predict
    pred = model.predict(X_new_scaled)[0]
    return float(max(0.0, min(1.0, pred)))

In [10]:
# ---------------------------
# Main execution
# ---------------------------
def main(input_csv=INPUT_CSV, overwrite_cache=False, test_size=0.2, device=None):
    logger.info("Loading input CSV: %s", input_csv)
    df = pd.read_csv(input_csv)

    # Expect columns
    if 'processed_text' not in df.columns or 'severity_score' not in df.columns:
        raise ValueError("Input CSV must contain 'processed_text' and 'severity_score' columns")

    # Preprocess (ensure stopwords are kept)
    logger.info("Preprocessing texts (keep stopwords)...")
    df['processed_text_clean'] = df['processed_text'].fillna("").astype(str).apply(preprocess_keep_stopwords)

    texts = df['processed_text_clean'].tolist()
    y = df['severity_score'].astype(float).values

    # embeddings
    if not overwrite_cache and os.path.exists(EMBED_CACHE):
        embeddings = np.load(EMBED_CACHE)
    else:
        embeddings = compute_or_load_embeddings(texts, cache_path=EMBED_CACHE, device=device)

    # empath features
    if not overwrite_cache and os.path.exists(EMPATH_CACHE) and os.path.exists(SYMBOLS_CACHE):
        empath_feats = np.load(EMPATH_CACHE, allow_pickle=True)
        empath_cols = json.load(open(SYMBOLS_CACHE, 'r'))
    else:
        empath_analyzer = Empath()
        empath_feats, empath_cols = build_empath_features(texts, empath_analyzer=empath_analyzer, cache_path=EMPATH_CACHE)

    # Combine features (align dims)
    logger.info("Combining embeddings (%s) + empath features (%s)...", embeddings.shape, empath_feats.shape)
    X = np.hstack([embeddings, empath_feats])

    # Train model
    model, scaler, test_triplet = train_model(X, y, output_model_path=MODEL_PATH, output_scaler_path=SCALER_PATH, test_size=test_size)
    X_test, y_test, y_pred = test_triplet

    # Evaluate
    eval_stats = evaluate_preds(y_test, y_pred)

    # Save additional artifacts (empath columns)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    json.dump({'empath_cols': empath_cols, 'embed_dim': embeddings.shape[1]}, open(os.path.join(OUTPUT_DIR, "meta.json"), 'w'))

    logger.info("Pipeline finished. Model, scaler and artifacts saved to %s", OUTPUT_DIR)
    return model, scaler, embeddings, empath_feats, empath_cols, eval_stats


In [11]:
# ---------------------------
# If run as script
# ---------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train severity model: embeddings + empath + LightGBM")
    parser.add_argument("--input", type=str, default=INPUT_CSV, help="Input CSV path (must contain processed_text and severity_score)")
    parser.add_argument("--outdir", type=str, default=OUTPUT_DIR, help="Output folder for models and caches")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite caches (recompute embeddings/empath)")
    parser.add_argument("--test_size", type=float, default=0.2, help="Test split fraction")
    parser.add_argument("--device", type=str, default=None, help="Device for sentence-transformers (e.g. 'cuda' or 'cpu')")
    args = parser.parse_args()

    # update global paths based on outdir
    OUTPUT_DIR = args.outdir
    EMBED_CACHE = os.path.join(OUTPUT_DIR, "embeddings.npy")
    EMPATH_CACHE = os.path.join(OUTPUT_DIR, "empath_feats.npy")
    SYMBOLS_CACHE = os.path.join(OUTPUT_DIR, "empath_cols.json")
    MODEL_PATH = os.path.join(OUTPUT_DIR, "lightgbm_severity.pkl")
    SCALER_PATH = os.path.join(OUTPUT_DIR, "feature_scaler.pkl")
    EMBEDDER_PATH = os.path.join(OUTPUT_DIR, "embedder_all-MiniLM.pkl")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    main(input_csv=args.input, overwrite_cache=args.overwrite, test_size=args.test_size, device=args.device)

usage: ipykernel_launcher.py [-h] [--input INPUT] [--outdir OUTDIR]
                             [--overwrite] [--test_size TEST_SIZE]
                             [--device DEVICE]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\praja\AppData\Roaming\jupyter\runtime\kernel-4b06ddd9-f94e-426f-abff-405b9badcebe.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
