In [None]:
import re
import json
import joblib
import logging
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional
import concurrent.futures
import multiprocessing
import pickle


import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, hamming_loss, f1_score,
    precision_score, recall_score, jaccard_score,
    label_ranking_average_precision_score, label_ranking_loss,
    coverage_error, accuracy_score
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.decomposition import PCA
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
projcect_root = os.path.abspath(os.path.join('../../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
if projcect_root not in sys.path:
    sys.path.append(projcect_root)
from data_engineering.data import GetCleanedData

In [None]:
def set_seed(seed: int):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to ``np.random.seed``, ``torch.manual_seed`` and,
            when CUDA is available, ``torch.cuda.manual_seed_all``.
    """
    cuda_present = torch.cuda.is_available()

    torch.manual_seed(seed)
    np.random.seed(seed)
    # Only touch the CUDA generators when a GPU is actually usable.
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

    logging.info(f"Random seed set to {seed}")


def setup_logging(log_file: str):
    """Configure root logging to write to ``log_file`` and to the console.

    Args:
        log_file: Path of the log file. Its parent directory is created when
            the path contains one; a bare file name is written to the current
            working directory.

    Note:
        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so calling this a second time in the same kernel keeps the
        first configuration.
    """
    log_dir = os.path.dirname(log_file)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    logging.info("Logging is set up.")

def preprocess_dataframe(df: pd.DataFrame, min_samples: int) -> pd.DataFrame:
    """Build the combined text column and drop rare ``subject_area`` values.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Frame with ``title``, ``abstract``, ``keywords`` and
            ``subject_area`` columns.
        min_samples: Minimum number of rows a ``subject_area`` value must have
            to be kept.

    Returns:
        A filtered frame with the added ``clean_title``, ``clean_abstract``,
        ``clean_keywords`` and ``combined_text`` columns.

    Note:
        Counting uses ``value_counts`` on ``subject_area``, so each entry must
        be hashable. NOTE(review): ``encode_labels`` later feeds the same
        column to ``MultiLabelBinarizer``, which iterates over each entry --
        confirm the column's element type suits both uses.
    """
    logging.info("Starting data preprocessing.")

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    df = df.assign(
        clean_title=df['title'],
        clean_abstract=df['abstract'],
        clean_keywords=df['keywords'],
    )

    def _as_text(column: pd.Series) -> pd.Series:
        # Missing values become '' instead of the literal words "nan"/"None",
        # which would otherwise leak into the text and defeat the empty filter.
        return column.fillna('').astype(str)

    df['combined_text'] = (
        _as_text(df['clean_title'])
        + ' ' + _as_text(df['clean_abstract'])
        + ' ' + _as_text(df['clean_keywords'])
    )

    df = df[df['combined_text'].str.strip() != '']

    before_filter = len(df)
    class_counts = df['subject_area'].value_counts()
    # A set gives O(1) membership checks for the row filter below.
    valid_classes = set(class_counts[class_counts >= min_samples].index)
    # Keep only rows whose subject_area is frequent enough. astype(bool) keeps
    # the mask boolean even for an empty frame (otherwise it would be read as
    # a list of column labels).
    keep_mask = df['subject_area'].map(lambda label: label in valid_classes).astype(bool)
    df = df[keep_mask]
    logging.info(f"Removed {before_filter - len(df)} samples with no valid labels after filtering.")

    class_counts = df['subject_area'].value_counts()
    logging.info(f"Class Distribution After Handling Imbalance:\n{class_counts}")

    return df


def encode_labels(df: pd.DataFrame) -> Tuple[np.ndarray, MultiLabelBinarizer]:
    """Turn the ``subject_area`` column into a binary label matrix.

    Args:
        df: Frame containing a ``subject_area`` column.

    Returns:
        A ``(label_matrix, binarizer)`` pair: the output of
        ``MultiLabelBinarizer.fit_transform`` and the fitted binarizer.

    Note:
        NOTE(review): ``MultiLabelBinarizer`` iterates over every entry, so a
        plain string would be split into characters -- confirm that
        ``subject_area`` holds collections of labels.
    """
    binarizer = MultiLabelBinarizer()
    label_matrix = binarizer.fit_transform(df['subject_area'])

    n_classes = len(binarizer.classes_)
    logging.info(f"Number of classes: {n_classes}")

    return label_matrix, binarizer


def split_data_iterative_stratification(X: pd.Series, y: np.ndarray, test_size: float, random_state: int):
    """Split features and labels once with ``MultilabelStratifiedShuffleSplit``.

    Args:
        X: Feature series, indexed positionally via ``iloc``.
        y: Label array, indexed by row.
        test_size: Passed to the splitter as ``test_size``.
        random_state: Passed to the splitter as ``random_state``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    logging.info("Splitting data using iterative stratification.")

    splitter = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    # n_splits=1, so the splitter yields exactly one (train, test) index pair.
    train_rows, test_rows = next(splitter.split(X, y))

    X_train = X.iloc[train_rows]
    X_test = X.iloc[test_rows]
    y_train = y[train_rows]
    y_test = y[test_rows]

    logging.info(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")
    return X_train, X_test, y_train, y_test


def initialize_embedding_generator(model_name: str, device: str = None) -> Tuple[AutoTokenizer, AutoModel, str]:
    """Load a pretrained tokenizer and model and put the model in eval mode.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        device: Target device; when falsy, ``'cuda'`` is used if available,
            otherwise ``'cpu'``.

    Returns:
        ``(tokenizer, model, device)`` with the model already moved to
        ``device``.
    """
    logging.info(f"Loading tokenizer and model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    if not device:
        # No explicit choice: prefer the GPU when one is usable.
        if torch.cuda.is_available():
            device = 'cuda'
        else:
            device = 'cpu'

    model.to(device)
    model.eval()

    logging.info(f"Model loaded on device: {device}")
    return tokenizer, model, device


def get_cls_embeddings(text_list: List[str], tokenizer: AutoTokenizer, model: AutoModel, device: str, batch_size: int = 16) -> np.ndarray:
    """Embed texts with the model's first-token (``[:, 0, :]``) hidden state.

    Texts are tokenized in batches with padding and truncation to 512 tokens,
    run through ``model`` without gradient tracking, and the first position of
    ``last_hidden_state`` is collected for every text.

    Args:
        text_list: Texts to embed.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        model: Model whose output exposes ``last_hidden_state``.
        device: Device the encoded tensors are moved to.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        Array with one row per input text. For an empty ``text_list`` an empty
        ``(0, hidden_size)`` float32 array is returned, where ``hidden_size``
        is read from ``model.config`` when available (0 otherwise).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if len(text_list) == 0:
        # np.vstack([]) raises, so return an explicitly empty matrix instead.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(text_list), batch_size), desc="Generating Embeddings"):
            batch_text = text_list[start:start + batch_size]
            # Calling the tokenizer directly replaces the deprecated
            # batch_encode_plus and accepts the same arguments.
            encoded_input = tokenizer(
                batch_text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )
            encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
            outputs = model(**encoded_input)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            # Move each batch to host memory right away.
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)


def find_optimal_thresholds(y_true, y_scores) -> np.ndarray:
    """Pick, per label column, the cut-off in [0, 1] that maximises F1.

    101 evenly spaced candidates are tried for every column. The first
    candidate reaching the highest F1 wins; when no candidate scores above
    zero the threshold stays at 0.5.

    Args:
        y_true: 2-D binary indicator array (rows are samples, columns labels).
        y_scores: Scores with the same layout as ``y_true``.

    Returns:
        1-D array holding one threshold per label column.
    """
    candidates = np.linspace(0, 1, 101)
    n_labels = y_true.shape[1]
    chosen = np.zeros(n_labels)

    for label_idx in range(n_labels):
        truth = y_true[:, label_idx]
        scores = y_scores[:, label_idx]
        f1_per_cut = [
            f1_score(truth, (scores >= cut).astype(int), zero_division=0)
            for cut in candidates
        ]
        # argmax returns the first maximum, matching a strict ">" update rule.
        top = int(np.argmax(f1_per_cut))
        if f1_per_cut[top] > 0.0:
            chosen[label_idx] = candidates[top]
        else:
            chosen[label_idx] = 0.5

    return chosen


def apply_thresholds(y_scores, thresholds) -> np.ndarray:
    """Binarize scores: 1 where a score reaches its threshold, else 0.

    Args:
        y_scores: Score array.
        thresholds: Threshold(s) compared against ``y_scores`` with ``>=``
            (NumPy broadcasting applies).

    Returns:
        Integer array of 0/1 decisions.
    """
    meets_threshold = y_scores >= thresholds
    return meets_threshold.astype(int)


def evaluate_model(y_true, y_pred, y_scores, target_names: list) -> dict:
    """Compute classification and ranking metrics for multi-label output.

    Args:
        y_true: Ground-truth indicator matrix.
        y_pred: Thresholded 0/1 predictions.
        y_scores: Continuous scores used by the ranking metrics.
        target_names: Label names for the classification report.

    Returns:
        Dict with the text classification report, Hamming loss, micro/macro
        precision, recall and F1, samples-averaged Jaccard score, label
        ranking average precision, label ranking loss, coverage error and
        ``accuracy_score`` (stored as ``subset_accuracy``).
    """
    logging.info("Evaluating the model.")

    def _averaged(scorer, mode):
        # Score the hard predictions with the requested averaging mode.
        return scorer(y_true, y_pred, average=mode, zero_division=0)

    return {
        # Metrics computed from the thresholded predictions.
        'classification_report': classification_report(
            y_true, y_pred, target_names=target_names, zero_division=0
        ),
        'hamming_loss': hamming_loss(y_true, y_pred),
        'micro_precision': _averaged(precision_score, 'micro'),
        'micro_recall': _averaged(recall_score, 'micro'),
        'micro_f1': _averaged(f1_score, 'micro'),
        'macro_precision': _averaged(precision_score, 'macro'),
        'macro_recall': _averaged(recall_score, 'macro'),
        'macro_f1': _averaged(f1_score, 'macro'),
        'jaccard_score': _averaged(jaccard_score, 'samples'),
        # Ranking metrics computed from the continuous scores.
        'label_ranking_average_precision_score': label_ranking_average_precision_score(y_true, y_scores),
        'label_ranking_loss': label_ranking_loss(y_true, y_scores),
        'coverage_error': coverage_error(y_true, y_scores),
        'subset_accuracy': accuracy_score(y_true, y_pred),
    }


def print_evaluation_metrics(metrics):
    """Print the scalar metrics followed by the classification report.

    Args:
        metrics: Mapping with the keys produced by ``evaluate_model``.
    """
    # (display label, key in ``metrics``) in the order they are printed.
    scalar_rows = (
        ("Micro Precision", 'micro_precision'),
        ("Micro Recall", 'micro_recall'),
        ("Micro F1 Score", 'micro_f1'),
        ("Macro Precision", 'macro_precision'),
        ("Macro Recall", 'macro_recall'),
        ("Macro F1 Score", 'macro_f1'),
        ("Hamming Loss", 'hamming_loss'),
        ("Jaccard Score", 'jaccard_score'),
        ("Label Ranking Average Precision Score", 'label_ranking_average_precision_score'),
        ("Label Ranking Loss", 'label_ranking_loss'),
        ("Coverage Error", 'coverage_error'),
        ("Subset Accuracy", 'subset_accuracy'),
    )

    print("Evaluation metrics:")
    for label, key in scalar_rows:
        print(f"{label}: {metrics[key]}")
    print("\nClassification Report:")
    print(metrics['classification_report'])


def save_configuration(config: dict, save_path: str):
    """Write ``config`` to ``save_path`` as indented JSON.

    Args:
        config: JSON-serializable configuration mapping.
        save_path: Destination file. Its parent directory is created when the
            path contains one; a bare file name is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(save_path)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_path, 'w') as f:
        json.dump(config, f, indent=4)
    logging.info(f"Configuration saved to {save_path}")


def save_preprocessed_data(df: pd.DataFrame, save_path: str):
    """Write ``df`` to ``save_path`` as CSV without the index.

    Args:
        df: Frame to persist.
        save_path: Destination file. Its parent directory is created when the
            path contains one; a bare file name is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(save_path)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(save_path, index=False)
    logging.info(f"Preprocessed data saved to {save_path}")


def save_tokenizer_and_model(tokenizer: AutoTokenizer, model: AutoModel, save_dir: str):
    """Save the tokenizer and the model into the same directory.

    Args:
        tokenizer: Object exposing ``save_pretrained``.
        model: Object exposing ``save_pretrained``.
        save_dir: Target directory; created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Tokenizer first, then model -- both land in ``save_dir``.
    for artifact in (tokenizer, model):
        artifact.save_pretrained(save_dir)
    logging.info(f"Tokenizer and model saved to {save_dir}")


def save_thresholds(thresholds: np.ndarray, save_path: str):
    """Pickle the per-class threshold array to ``save_path``.

    Args:
        thresholds: Array of thresholds to persist.
        save_path: Destination file. Its parent directory is created when the
            path contains one; a bare file name is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(save_path)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(save_path, 'wb') as f:
        pickle.dump(thresholds, f)
    logging.info(f"Thresholds saved to {save_path}")


In [None]:
###################################
# Configuration
###################################
# Every path and tunable value used by the cells below lives here.
# NOTE(review): the log and model paths start at the filesystem root
# ("/logs", "/models") while data_dirs is relative -- confirm this is intended.
config = {
    "data_dirs": [
        "../data/2018",
        "../data/2019",
        "../data/2020",
        "../data/2021",
        "../data/2022",
        "../data/2023",
    ],
    "log_file": "/logs/training.log",
    "embedding_model": "allenai/scibert_scivocab_uncased",
    "test_size": 0.2,
    "random_state": 42,
    "min_samples_per_class": 100,
    "batch_size": 32,
    "model_save_path": "/models/LogisticRegression_hyper_01/2018_2023/multi_label_classifier.pkl",
    "mlb_save_path": "/models/LogisticRegression_hyper_01/2018_2023/mlb.pkl",
    "tokenizer_model_save_dir": "/models/LogisticRegression_hyper_01/2018_2023/tokenizer_model/",
    "thresholds_save_path": "/models/LogisticRegression_hyper_01/2018_2023/thresholds.pkl",
    "config_save_path": "/models/LogisticRegression_hyper_01/2018_2023/config.json",
    "preprocessed_data_save_path": "/models/LogisticRegression_hyper_01/2018_2023/preprocessed_data.csv",
    # The trailing comma and the colon below matter: without them Python joins
    # the adjacent string literals into one bogus metrics path and the
    # best-params key is never defined.
    "metrics_save_path": "/models/LogisticRegression_hyper_01/2018_2023/evaluation_metrics.json",
    "best_params_save_path": "/models/LogisticRegression_hyper_01/2018_2023/best_params.json",
}


In [None]:
###################################
# Setup Logging
###################################
# Configure logging (file + console handlers) using the path from the config.
setup_logging(config['log_file'])



In [None]:
###################################
# Save Configuration
###################################
# Write the config dict as JSON to its configured path so the settings used
# for this run are stored on disk.
save_configuration(config, config['config_save_path'])


In [None]:
###################################
# Set Seed
###################################
# Seed NumPy and PyTorch with the configured random_state.
set_seed(config['random_state'])


In [None]:
###################################
# Load Data
###################################
# Load the dataset via the project's data_engineering module.
# NOTE(review): presumably returns a DataFrame with the title, abstract,
# keywords and subject_area columns that preprocess_dataframe reads -- confirm.
df = GetCleanedData()

In [None]:
###################################
# Preprocess Data
###################################
# Build combined_text and drop subject_area values with fewer rows than the
# configured minimum. Rebinds df, so re-running this cell filters the already
# filtered frame again.
df = preprocess_dataframe(df, min_samples=config['min_samples_per_class'])

In [None]:
###################################
# Encode Labels
###################################
# y is the binary label matrix; mlb is the fitted MultiLabelBinarizer, kept
# for its class names and saved later.
y, mlb = encode_labels(df)


In [None]:
###################################
# Split Data
###################################
# Hold out the test set with a multilabel-stratified shuffle split, using the
# configured test_size and random_state.
X_train_text, X_test_text, y_train, y_test = split_data_iterative_stratification(
    df['combined_text'], y, test_size=config['test_size'], random_state=config['random_state']
)



In [None]:
# Further split training set for validation (threshold tuning)
# 10% of the training data is held out. This uses sklearn's plain
# train_test_split (no stratify argument), unlike the test split.
# Rebinds X_train_text / y_train, so re-running this cell shrinks the
# training set again.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_train_text, y_train, test_size=0.1, random_state=config['random_state']
)
logging.info(f"After validation split - Training samples: {len(X_train_text)}, Validation samples: {len(X_val_text)}")


In [None]:
###################################
# Generate Embeddings
###################################
# Load the configured embedding model once, then embed each split with the
# same tokenizer, model, device and batch size.
tokenizer, model, device = initialize_embedding_generator(config['embedding_model'])
X_train_embeddings = get_cls_embeddings(X_train_text.tolist(), tokenizer, model, device, batch_size=config['batch_size'])
X_val_embeddings = get_cls_embeddings(X_val_text.tolist(), tokenizer, model, device, batch_size=config['batch_size'])
X_test_embeddings = get_cls_embeddings(X_test_text.tolist(), tokenizer, model, device, batch_size=config['batch_size'])



In [None]:
###################################
# Save Tokenizer and Embedding Model
###################################
# Store the tokenizer and embedding model in the configured directory.
save_tokenizer_and_model(tokenizer, model, config['tokenizer_model_save_dir'])


In [None]:
###################################
# Hyperparameter Search: Scaler -> PCA -> One-vs-Rest Logistic Regression
###################################
# Starting values for the pipeline. GridSearchCV overrides them with every
# combination listed in param_grid below.
pca_n_components = 256
logistic_regression_C = 0.01
logistic_regression_class_weight = 'balanced'

base_lr = LogisticRegression(
    solver='saga',
    C=logistic_regression_C,
    max_iter=5000,
    n_jobs=-1,
    random_state=config['random_state'],
    class_weight=logistic_regression_class_weight,
    verbose=1
)
ovr = OneVsRestClassifier(base_lr)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=pca_n_components)),
    ('classifier', ovr)
])

# Define parameter grid (step__param names address the pipeline steps;
# "estimator" is the LogisticRegression wrapped by OneVsRestClassifier).
param_grid = {
    'pca__n_components': [128, 256, 512],
    'classifier__estimator__C': [0.01, 0.1, 1, 10],
    'classifier__estimator__class_weight': [None, 'balanced']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=2
)

# Train the model with GridSearchCV
logging.info("Starting model training with GridSearchCV.")
grid_search.fit(X_train_embeddings, y_train)
logging.info("Model training with GridSearchCV completed.")

# Replace the original pipeline with the best estimator
pipeline = grid_search.best_estimator_

# Save the best parameters from GridSearchCV next to the other artifacts of
# this run. Falls back to the model's directory when the config has no
# explicit 'best_params_save_path' entry.
best_params = grid_search.best_params_
best_params_save_path = config.get(
    'best_params_save_path',
    os.path.join(os.path.dirname(config['model_save_path']), 'best_params.json'),
)
best_params_dir = os.path.dirname(best_params_save_path)
if best_params_dir:
    os.makedirs(best_params_dir, exist_ok=True)
with open(best_params_save_path, 'w') as f:
    json.dump(best_params, f, indent=4)
logging.info(f"Best parameters saved to {best_params_save_path}")


In [None]:
# Re-save the best parameters without refitting. This repeats the save done at
# the end of the training cell and writes to the same run directory as the
# other artifacts. Falls back to the model's directory when the config has no
# explicit 'best_params_save_path' entry.
best_params = grid_search.best_params_
best_params_save_path = config.get(
    'best_params_save_path',
    os.path.join(os.path.dirname(config['model_save_path']), 'best_params.json'),
)
best_params_dir = os.path.dirname(best_params_save_path)
if best_params_dir:
    os.makedirs(best_params_dir, exist_ok=True)
with open(best_params_save_path, 'w') as f:
    json.dump(best_params, f, indent=4)
logging.info(f"Best parameters saved to {best_params_save_path}")

In [None]:
###################################
# Threshold Tuning on Validation Set
###################################
logging.info("Starting threshold tuning on validation set.")
# Empty container for the per-class validation scores.
y_val_scores = []
# Pull the fitted steps out of the best pipeline so features can be
# transformed once and then scored by each per-class estimator.
scaler = pipeline.named_steps['scaler']
pca = pipeline.named_steps['pca']
classifier = pipeline.named_steps['classifier']


In [None]:
# Transform validation data
val_features = scaler.transform(X_val_embeddings)
val_features = pca.transform(val_features)

# Get the positive-class probability from each per-class estimator.
# The score matrix is rebuilt from scratch here, so this cell can be re-run
# safely (appending to a variable that a previous run already turned into an
# ndarray would fail).
y_val_scores = np.array([
    estimator.predict_proba(val_features)[:, 1]
    for estimator in classifier.estimators_
]).T

thresholds = find_optimal_thresholds(y_val, y_val_scores)
logging.info(f"Optimal thresholds per class: {thresholds}")



In [None]:
###################################
# Save Thresholds
###################################
# Pickle the tuned per-class thresholds to the configured path.
save_thresholds(thresholds, config['thresholds_save_path'])


In [None]:
###################################
# Predict on Test Set
###################################
logging.info("Generating predictions on test set.")
# Apply the same fitted scaler and PCA used for the validation features.
test_features = pca.transform(scaler.transform(X_test_embeddings))

# One column of positive-class probabilities per per-class estimator.
y_test_scores = np.array([
    per_class_model.predict_proba(test_features)[:, 1]
    for per_class_model in classifier.estimators_
]).T

# Binarize the scores with the thresholds tuned on the validation set.
y_pred = apply_thresholds(y_test_scores, thresholds)


In [None]:
###################################
# Evaluate
###################################
# Compute the metrics on the held-out test set and print them; the metrics
# dict is also saved to disk in a later cell.
metrics = evaluate_model(y_test, y_pred, y_test_scores, target_names=mlb.classes_)
print_evaluation_metrics(metrics)


In [None]:
###################################
# Save Model and Label Binarizer
###################################
# Create the parent directory of each configured save path (rather than an
# unrelated relative 'models' folder) before dumping the artifact there.
for artifact, path_key in ((pipeline, 'model_save_path'), (mlb, 'mlb_save_path')):
    artifact_path = config[path_key]
    artifact_dir = os.path.dirname(artifact_path)
    if artifact_dir:
        os.makedirs(artifact_dir, exist_ok=True)
    joblib.dump(artifact, artifact_path)
logging.info("Pipeline and label binarizer saved.")


In [None]:
###################################
# Save Evaluation Metrics
###################################

# Write the metrics dict (including the text classification report) as
# indented JSON to the configured path, creating its directory first.
os.makedirs(os.path.dirname(config['metrics_save_path']), exist_ok=True)
with open(config['metrics_save_path'], 'w') as f:
    json.dump(metrics, f, indent=4)
logging.info(f"Evaluation metrics saved to {config['metrics_save_path']}")


