In [None]:
import concurrent.futures
import json
import logging
import multiprocessing
import os
import pickle
import random
import re
from typing import List, Tuple, Optional

import joblib
import numpy as np
import pandas as pd

import nltk
# Fetch the NLTK resources used by clean_text()/preprocess_dataframe().
# 'punkt_tab' is the tokenizer resource that nltk.word_tokenize looks up on recent
# NLTK releases; 'punkt' is kept so that older releases keep working as before.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from scipy.stats import loguniform

import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


In [None]:
def set_seed(seed: int):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU, plus all CUDA devices when CUDA is available), then logs
    the seed.

    Args:
        seed: Integer seed applied to all generators.
    """
    # The stdlib generator was previously left unseeded, so any library code that
    # relies on `random` produced different values on every run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    logging.info(f"Random seed set to {seed}")

def setup_logging(log_file: str):
    """Configure root logging to write to ``log_file`` and to the console.

    The parent directory of ``log_file`` is created when it does not exist.
    A bare filename (no directory component) is accepted and is created in
    the current working directory.

    Note: ``logging.basicConfig`` does nothing when the root logger already
    has handlers, so calling this a second time in the same kernel does not
    reconfigure logging.

    Args:
        log_file: Path of the log file to write to.
    """
    log_dir = os.path.dirname(log_file)
    # os.path.dirname returns '' for a bare filename and os.makedirs('') raises,
    # so only create a directory when there is one to create.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    logging.info("Logging is set up.")

def load_file(filename: str) -> Optional[dict]:
    """Read one UTF-8 JSON file and return its parsed content.

    Any failure (malformed JSON, missing file, permission problem, ...) is
    logged as an error and reported to the caller as ``None`` instead of
    being raised.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file could not be loaded.
    """
    parsed = None
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
    except json.JSONDecodeError as e:
        logging.error(f"JSONDecodeError for file {filename}: {e}")
    except Exception as e:
        logging.error(f"Unexpected error loading file {filename}: {e}")
    return parsed

def load_files_from_dir(directory: str, executor: concurrent.futures.ThreadPoolExecutor) -> List[dict]:
    """Load every JSON record stored directly inside ``directory``.

    Regular files are picked up when their name ends in ``.json`` (case
    insensitive) or contains no dot at all. Each file is parsed by
    ``load_file`` on ``executor``; files that fail to load are logged and
    skipped.

    Records are returned in sorted-filename order. The previous
    ``os.listdir`` + ``as_completed`` combination produced a run-dependent
    order, which in turn changed any later seeded shuffle or split of the data.

    This call blocks until all of its file tasks have finished. Invoking it
    from a task that itself runs on ``executor`` can therefore deadlock once
    every worker of the pool is busy waiting.

    Args:
        directory: Directory to scan (not recursive).
        executor: Thread pool used to parse the files in parallel.

    Returns:
        The parsed records, possibly empty.
    """
    if not os.path.isdir(directory):
        logging.warning(f"Directory does not exist: {directory}")
        return []
    filenames = [
        os.path.join(directory, f)
        for f in sorted(os.listdir(directory))
        if os.path.isfile(os.path.join(directory, f)) and (f.lower().endswith('.json') or '.' not in f)
    ]
    if not filenames:
        logging.warning(f"No JSON files found in directory: {directory}")
        return []

    # Submit everything first so the files are parsed concurrently, then collect
    # the results in submission order to keep the output deterministic.
    submitted = [(filename, executor.submit(load_file, filename)) for filename in filenames]
    loaded_data = []
    for filename, future in submitted:
        try:
            data = future.result()
        except Exception as e:
            logging.error(f"Error processing file {filename}: {e}")
            continue
        if data is not None:
            loaded_data.append(data)
        else:
            logging.warning(f"Data is None for file: {filename}")
    return loaded_data

def load_data(data_dirs: List[str]) -> List[dict]:
    """Load all JSON records found in the given directories.

    Directories are processed one after another in the order given, while the
    files inside each directory are parsed in parallel on a shared thread
    pool. A directory that fails is logged and skipped.

    Args:
        data_dirs: Directories to read records from.

    Returns:
        All successfully loaded records, grouped by directory in input order.
    """
    logging.info("Starting data loading process.")
    all_data = []
    # os.cpu_count() returns None when the count is unknown, hence the fallback.
    num_workers = min(32, (os.cpu_count() or 1) + 4)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # load_files_from_dir blocks on futures it submits to this same pool.
        # Running it as a pool task (as before) parks one worker per directory;
        # with as many directories as workers no thread is left to parse files
        # and the pool deadlocks. Calling it from this thread keeps every
        # worker available and also makes the record order deterministic.
        for directory in data_dirs:
            try:
                data = load_files_from_dir(directory, executor)
                all_data.extend(data)
                logging.info(f"Total data loaded so far: {len(all_data)}")
            except Exception as e:
                logging.error(f"Error loading data from directory {directory}: {e}")

    logging.info(f"Completed data loading. Total records loaded: {len(all_data)}")
    return all_data

def _as_mapping(node) -> dict:
    """Return ``node`` when it is a dict, otherwise an empty dict (e.g. for JSON null)."""
    return node if isinstance(node, dict) else {}


def _dollar_values(node) -> List[str]:
    """Collect the ``'$'`` values from a list of dicts or from one single dict."""
    if isinstance(node, dict):
        node = [node]
    if not isinstance(node, list):
        return []
    return [entry.get('$', '') for entry in node if isinstance(entry, dict)]


def extract_fields(data: List[dict]) -> pd.DataFrame:
    """Flatten raw records into a DataFrame of title, abstract, keywords and subject areas.

    For every record the fields are read from its
    ``'abstracts-retrieval-response'`` entry:

    * ``title``         <- ``coredata['dc:title']`` (``''`` when absent)
    * ``abstract``      <- ``coredata['dc:description']`` (``''`` when absent)
    * ``keywords``      <- ``'$'`` values under ``authkeywords['author-keyword']``
    * ``subject_areas`` <- ``'$'`` values under ``subject-areas['subject-area']``

    Keywords and subject areas are handled identically: the container may be
    missing or null, and the inner value may be a list of dicts or one single
    dict. Previously only the keyword branch tolerated these shapes, so a
    record with a null or single-entry ``subject-areas`` aborted the whole
    extraction with an ``AttributeError``.

    Args:
        data: Parsed JSON records.

    Returns:
        DataFrame with columns ``title``, ``abstract``, ``keywords``,
        ``subject_areas`` and one row per input record.
    """
    logging.info("Extracting fields from data.")
    titles = []
    abstracts = []
    keywords = []
    subject_areas = []

    for item in data:
        # Look the response up once; every level may be missing or null.
        response = _as_mapping(_as_mapping(item).get('abstracts-retrieval-response'))

        coredata = _as_mapping(response.get('coredata'))
        titles.append(coredata.get('dc:title', ''))
        abstracts.append(coredata.get('dc:description', ''))

        authkeywords = _as_mapping(response.get('authkeywords'))
        keywords.append(_dollar_values(authkeywords.get('author-keyword')))

        areas = _as_mapping(response.get('subject-areas'))
        subject_areas.append(_dollar_values(areas.get('subject-area')))

    df = pd.DataFrame({
        'title': titles,
        'abstract': abstracts,
        'keywords': keywords,
        'subject_areas': subject_areas
    })
    logging.info(f"Extracted fields into DataFrame with shape: {df.shape}")
    return df

def clean_text(text: str, lemmatizer: WordNetLemmatizer, stop_words: set) -> str:
    """Normalise one piece of text for embedding.

    Every character that is neither an ASCII letter nor whitespace is removed,
    the text is lower-cased and tokenised with NLTK, tokens found in
    ``stop_words`` are discarded and the rest are lemmatised.

    Args:
        text: Raw text; any non-string value yields ``''``.
        lemmatizer: Lemmatizer applied to each kept token.
        stop_words: Lower-case tokens to drop.

    Returns:
        The kept, lemmatised tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    kept = (token for token in nltk.word_tokenize(letters_only) if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

def preprocess_dataframe(df: pd.DataFrame, min_samples: int) -> pd.DataFrame:
    """Clean the text columns and reduce every record to one subject-area label.

    Steps:
      1. Clean ``title``, ``abstract`` and ``keywords`` with ``clean_text`` and
         concatenate them into ``combined_text``.
      2. Drop rows with empty combined text or without any subject area.
      3. Keep only subject areas whose name ends with ``"(all)"`` and drop rows
         left without one.
      4. When several ``"(all)"`` areas remain on a row, keep the one that is
         most frequent across the data set (ties: the first listed).
      5. Drop classes with fewer than ``min_samples`` rows.

    The input frame is not modified; all work happens on copies. Previously
    the ``clean_*``/``combined_text`` columns were written into the caller's
    frame and later columns were assigned on filtered slices, which triggers
    pandas' ``SettingWithCopyWarning``.

    Args:
        df: Frame with ``title``, ``abstract``, ``keywords`` (list of str) and
            ``subject_areas`` (list of str) columns.
        min_samples: Minimum number of rows a class needs to be kept.

    Returns:
        A new frame with the cleaned text columns and a single
        ``subject_area`` column replacing ``subject_areas``.
    """
    logging.info("Starting data preprocessing.")
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Private copy: the caller's frame must stay untouched.
    df = df.copy()

    df['clean_title'] = df['title'].apply(lambda x: clean_text(x, lemmatizer, stop_words))
    df['clean_abstract'] = df['abstract'].apply(lambda x: clean_text(x, lemmatizer, stop_words))
    df['clean_keywords'] = df['keywords'].apply(
        lambda kws: clean_text(' '.join(kws), lemmatizer, stop_words) if isinstance(kws, list) else ''
    )

    df['combined_text'] = df['clean_title'].astype(str) + ' ' + df['clean_abstract'].astype(str) + ' ' + df['clean_keywords'].astype(str)

    initial_count = len(df)
    df = df[df['combined_text'].str.strip() != '']
    # .copy() after filtering so that the column assignment below targets a real frame.
    df = df[df['subject_areas'].map(len) > 0].copy()
    logging.info(f"Dropped {initial_count - len(df)} rows due to empty text or missing subject areas.")

    # Filter to keep only subject areas ending with "(all)"
    df['subject_areas'] = df['subject_areas'].apply(lambda x: [cls for cls in x if cls.endswith("(all)")])

    before_filter = len(df)
    df = df[df['subject_areas'].map(len) > 0].copy()
    logging.info(f"Removed {before_filter - len(df)} samples without subject areas ending with '(all)'.")

    # Global frequency of each "(all)" subject area, used to pick one label per row.
    subject_counts = pd.Series([cls for classes in df['subject_areas'] for cls in classes]).value_counts()
    logging.info(f"Subject counts:\n{subject_counts}")

    def keep_most_frequent_subject(subjects):
        """Return the globally most frequent subject of the row (first one on ties)."""
        # max() returns the first maximal element, matching a stable descending sort.
        return max(subjects, key=lambda x: subject_counts.get(x, 0))

    df['subject_area'] = df['subject_areas'].apply(keep_most_frequent_subject)

    # The list column is no longer needed once the single label exists.
    df = df.drop(columns=['subject_areas'])
    logging.info(f"DataFrame shape after processing subject areas: {df.shape}")

    # Handle class imbalance by removing classes with fewer than min_samples
    class_counts = df['subject_area'].value_counts()
    valid_classes = class_counts[class_counts >= min_samples].index.tolist()
    logging.info(f"Valid classes after applying min_samples_per_class={min_samples}:\n{class_counts[class_counts >= min_samples]}")

    df = df[df['subject_area'].isin(valid_classes)]
    logging.info(f"Final DataFrame shape after filtering classes: {df.shape}")

    return df

def encode_labels(df: pd.DataFrame) -> Tuple[np.ndarray, LabelEncoder]:
    """Turn the ``subject_area`` column into integer class ids.

    Args:
        df: Frame containing a ``subject_area`` column.

    Returns:
        ``(encoded_labels, fitted_encoder)``; the encoder is returned so that
        predictions can be mapped back to the original label names.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(df['subject_area'])
    logging.info(f"Number of classes: {len(encoder.classes_)}")
    return encoded, encoder

def split_data(X: pd.Series, y: np.ndarray, test_size: float, random_state: int) -> Tuple:
    """Make a stratified train/test split of the texts and their labels.

    Args:
        X: Input texts.
        y: Labels aligned with ``X``; also used for stratification.
        test_size: Share of the samples assigned to the test set.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    logging.info("Splitting data into train and test sets.")
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    n_train, n_test = len(parts[0]), len(parts[1])
    logging.info(f"Training samples: {n_train}, Testing samples: {n_test}")
    return tuple(parts)

def initialize_embedding_generator(model_name: str, device: str = None) -> Tuple[AutoTokenizer, AutoModel, str]:
    """Load a pretrained tokenizer/model pair and prepare the model for inference.

    Args:
        model_name: Name or path handed to ``from_pretrained``.
        device: Target device. When empty or ``None``, ``'cuda'`` is chosen if
            available and ``'cpu'`` otherwise.

    Returns:
        ``(tokenizer, model, device)`` with the model moved to ``device`` and
        switched to evaluation mode.
    """
    logging.info(f"Loading tokenizer and model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    if not device:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.to(device)
    model.eval()
    logging.info(f"Model loaded on device: {device}")
    return tokenizer, model, device

def get_cls_embeddings(text_list: List[str], tokenizer: AutoTokenizer, model: AutoModel, device: str, batch_size: int = 32) -> np.ndarray:
    """Embed texts with the hidden state of the first token of each sequence.

    Texts are tokenised in batches (padded, truncated to 512 tokens), run
    through ``model`` without gradient tracking, and
    ``last_hidden_state[:, 0, :]`` of every batch is collected. For BERT-style
    models the first position is presumably the ``[CLS]`` token, as the
    function name suggests.

    Args:
        text_list: Texts to embed.
        tokenizer: Tokenizer matching ``model``.
        model: Encoder that returns an object with ``last_hidden_state``.
        device: Device the encoded batches are moved to; it must be the device
            the model lives on.
        batch_size: Number of texts per forward pass.

    Returns:
        Array with one row per input text. For an empty ``text_list`` an empty
        array of shape ``(0, hidden_size)`` is returned (``hidden_size`` read
        from ``model.config`` when available, else 0) instead of failing
        inside ``np.vstack``.
    """
    if len(text_list) == 0:
        # np.vstack([]) raises ValueError, so answer the empty case explicitly.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(text_list), batch_size), desc="Generating Embeddings"):
            batch_text = text_list[i:i+batch_size]
            # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
            encoded_input = tokenizer(
                batch_text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            outputs = model(**encoded_input)
            # Position 0 along the sequence axis, for every item of the batch.
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            # Move to host memory right away so device memory is not accumulated.
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)

def evaluate_model(y_true, y_pred, target_names: list) -> dict:
    """Compute the text report and the weighted summary scores of a prediction.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        target_names: Display names of the classes for the report.

    Returns:
        Dict with the keys ``classification_report`` (text), ``accuracy``,
        ``precision_weighted``, ``recall_weighted`` and ``f1_weighted``.
        Undefined precision/recall values are reported as 0.
    """
    logging.info("Evaluating the model.")

    # Shared options of the three weighted scores.
    weighted = {'average': 'weighted', 'zero_division': 0}

    return {
        'classification_report': classification_report(
            y_true, y_pred, target_names=target_names, zero_division=0
        ),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision_weighted': precision_score(y_true, y_pred, **weighted),
        'recall_weighted': recall_score(y_true, y_pred, **weighted),
        'f1_weighted': f1_score(y_true, y_pred, **weighted),
    }

def print_evaluation_metrics(metrics):
    """Print the summary scores followed by the full classification report.

    Args:
        metrics: Dict with the keys ``accuracy``, ``precision_weighted``,
            ``recall_weighted``, ``f1_weighted`` and ``classification_report``.
    """
    summary_rows = (
        ("Accuracy", 'accuracy'),
        ("Weighted Precision", 'precision_weighted'),
        ("Weighted Recall", 'recall_weighted'),
        ("Weighted F1 Score", 'f1_weighted'),
    )

    print("Evaluation metrics:")
    for label, key in summary_rows:
        print(f"{label}: {metrics[key]}")
    print("\nClassification Report:")
    print(metrics['classification_report'])

def save_configuration(config: dict, save_path: str):
    """Write the run configuration to ``save_path`` as indented JSON.

    The ``'param_dist'`` entry of ``config['hyperparameter_tuning']`` is left
    out of the saved file; ``config`` itself is not modified.

    Args:
        config: Configuration dictionary.
        save_path: Target JSON file. Missing parent directories are created;
            a bare filename is written to the current working directory.
    """
    # Shallow copies are enough: only the nested tuning dict is altered.
    config_to_save = config.copy()

    if 'hyperparameter_tuning' in config_to_save:
        tuning = config_to_save['hyperparameter_tuning'].copy()
        tuning.pop('param_dist', None)
        config_to_save['hyperparameter_tuning'] = tuning

    directory = os.path.dirname(save_path)
    # dirname is '' for a bare filename and os.makedirs('') raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(save_path, 'w') as f:
        json.dump(config_to_save, f, indent=4)
    logging.info(f"Configuration saved to {save_path}")

def save_preprocessed_data(df: pd.DataFrame, save_path: str):
    """Write ``df`` to ``save_path`` as CSV without the index column.

    Args:
        df: Frame to store.
        save_path: Target CSV file. Missing parent directories are created;
            a bare filename is written to the current working directory.
    """
    directory = os.path.dirname(save_path)
    # dirname is '' for a bare filename and os.makedirs('') raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)
    df.to_csv(save_path, index=False)
    logging.info(f"Preprocessed data saved to {save_path}")

def save_tokenizer_and_model(tokenizer: AutoTokenizer, model: AutoModel, save_dir: str):
    """Store the tokenizer and the embedding model side by side in ``save_dir``.

    Args:
        tokenizer: Tokenizer to persist via ``save_pretrained``.
        model: Model to persist via ``save_pretrained``.
        save_dir: Target directory; created when missing.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Tokenizer first, then the model: both expose the same save_pretrained API.
    for artefact in (tokenizer, model):
        artefact.save_pretrained(save_dir)
    logging.info(f"Tokenizer and model saved to {save_dir}")

def save_model(model, save_path: str):
    """Serialise ``model`` to ``save_path`` with joblib.

    Args:
        model: Object to persist (here: the fitted pipeline).
        save_path: Target file. Missing parent directories are created;
            a bare filename is written to the current working directory.
    """
    directory = os.path.dirname(save_path)
    # dirname is '' for a bare filename and os.makedirs('') raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)
    joblib.dump(model, save_path)
    logging.info(f"Model saved to {save_path}")

def save_label_encoder(le: LabelEncoder, save_path: str):
    """Serialise the fitted label encoder to ``save_path`` with joblib.

    Args:
        le: Label encoder to persist.
        save_path: Target file. Missing parent directories are created;
            a bare filename is written to the current working directory.
    """
    directory = os.path.dirname(save_path)
    # dirname is '' for a bare filename and os.makedirs('') raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)
    joblib.dump(le, save_path)
    logging.info(f"Label Encoder saved to {save_path}")

def save_evaluation_metrics(metrics: dict, save_path: str):
    """Write the evaluation metrics to ``save_path`` as indented JSON.

    NumPy scalars and arrays among the values are converted to plain Python
    numbers and lists. The JSON text is produced before the file is opened,
    so a value that cannot be serialised no longer leaves a truncated file
    behind.

    Args:
        metrics: Metric name -> value mapping.
        save_path: Target JSON file. Missing parent directories are created;
            a bare filename is written to the current working directory.

    Raises:
        TypeError: If a value is neither JSON serialisable nor a NumPy
            scalar/array.
    """
    def _to_builtin(value):
        # Called by json only for objects it cannot encode itself.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    payload = json.dumps(metrics, indent=4, default=_to_builtin)

    directory = os.path.dirname(save_path)
    # dirname is '' for a bare filename and os.makedirs('') raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(save_path, 'w') as f:
        f.write(payload)
    logging.info(f"Evaluation metrics saved to {save_path}")

def save_best_parameters(best_params: dict, save_path: str):
    """
    Saves the best hyperparameters to a JSON file.

    This is best-effort: any failure is logged as an error and not raised.
    NumPy scalars and arrays among the values are converted to plain Python
    numbers and lists, and the JSON text is produced before the file is
    opened so that a failure cannot leave a truncated file behind.

    Args:
        best_params (dict): The best hyperparameters found by the search.
        save_path (str): The file path where the best parameters will be saved.
            Missing parent directories are created; a bare filename is written
            to the current working directory.
    """
    def _to_builtin(value):
        # Called by json only for objects it cannot encode itself.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    try:
        payload = json.dumps(best_params, indent=4, default=_to_builtin)
        directory = os.path.dirname(save_path)
        # dirname is '' for a bare filename and os.makedirs('') raises, which
        # used to end up as a logged error with nothing written.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(save_path, 'w') as f:
            f.write(payload)
        logging.info(f"Best hyperparameters saved to {save_path}")
    except Exception as e:
        logging.error(f"Failed to save best hyperparameters: {e}")


In [None]:
###################################
# Configuration
###################################
# Every artefact of this run (model, encoder, metrics, ...) is written below one directory.
# NOTE(review): this is an absolute path at the filesystem root, whereas the data and
# log locations are relative to the notebook - confirm that this is intended.
MODEL_DIR = "/models/LogisticRegression_single_hyper_01/2018_2023"

# Candidate values shared by the grid and the randomized search space.
PCA_COMPONENT_CHOICES = [256, 512, 768]
PENALTY_CHOICES = ['l1', 'l2']

config = {
    # One input directory per publication year, 2018 through 2023.
    "data_dirs": [f"../../data/{year}" for year in range(2018, 2024)],
    "log_file": "../logs/training.log",
    "embedding_model": "allenai/scibert_scivocab_uncased",
    "test_size": 0.2,
    "random_state": 42,
    "min_samples_per_class": 50,
    "batch_size": 16,
    "model_save_path": f"{MODEL_DIR}/single_label_classifier.pkl",
    "label_encoder_save_path": f"{MODEL_DIR}/label_encoder.pkl",
    "tokenizer_model_save_dir": f"{MODEL_DIR}/tokenizer_model/",
    "preprocessed_data_save_path": f"{MODEL_DIR}/preprocessed_data.csv",
    "metrics_save_path": f"{MODEL_DIR}/evaluation_metrics.json",
    "config_save_path": f"{MODEL_DIR}/config.json",
    "best_params_save_path": f"{MODEL_DIR}/best_params.json",
    "hyperparameter_tuning": {
        "enabled": True,
        "method": "random",  # "grid" or "random"
        # Search space for GridSearchCV
        "param_grid": {
            'pca__n_components': list(PCA_COMPONENT_CHOICES),
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__penalty': list(PENALTY_CHOICES),
        },
        # Search space for RandomizedSearchCV (C is sampled log-uniformly)
        "param_dist": {
            'pca__n_components': list(PCA_COMPONENT_CHOICES),
            'classifier__C': loguniform(1e-3, 1e3),
            'classifier__penalty': list(PENALTY_CHOICES),
        },
        "cv": 3,
        "scoring": 'f1_weighted',
        "n_iter": 10,  # number of sampled candidates; RandomizedSearchCV only
    }
}


In [None]:
# ---------------------------------
# Logging: file + console output
# ---------------------------------
setup_logging(log_file=config['log_file'])


In [None]:
# ---------------------------------
# Persist the configuration of this run next to its artefacts
# ---------------------------------
save_configuration(config, save_path=config['config_save_path'])


In [None]:
# ---------------------------------
# Reproducibility: seed the random number generators
# ---------------------------------
set_seed(seed=config['random_state'])


In [None]:
# ---------------------------------
# Load the raw JSON records and flatten them into a DataFrame
# ---------------------------------
raw_data = load_data(data_dirs=config['data_dirs'])
df = extract_fields(data=raw_data)


In [None]:
###################################
# Save Preprocessed Data
###################################
# NOTE(review): when the notebook is run top to bottom, `df` is still the output of
# extract_fields() here - preprocess_dataframe() is only called in a later cell - so
# the file written by this cell holds the extracted, not-yet-cleaned records.
save_preprocessed_data(df, config['preprocessed_data_save_path'])


In [None]:
###################################
# Preprocess Data
###################################
df = preprocess_dataframe(df, min_samples=config['min_samples_per_class'])

# Store the cleaned, class-filtered frame under the "preprocessed" path so that the
# file really contains preprocessed data; anything written to that path earlier in
# the run is overwritten.
save_preprocessed_data(df, config['preprocessed_data_save_path'])


In [None]:
# ---------------------------------
# Labels: subject-area names -> integer class ids
# ---------------------------------
y, le = encode_labels(df=df)


In [None]:
# ---------------------------------
# Stratified train/test split of the combined texts
# ---------------------------------
X_train_text, X_test_text, y_train, y_test = split_data(
    X=df['combined_text'],
    y=y,
    test_size=config['test_size'],
    random_state=config['random_state'],
)


In [None]:
# ---------------------------------
# Embeddings: load the encoder once, then embed the train and the test texts
# ---------------------------------
tokenizer, model, device = initialize_embedding_generator(model_name=config['embedding_model'])
X_train_embeddings = get_cls_embeddings(
    X_train_text.tolist(),
    tokenizer,
    model,
    device,
    batch_size=config['batch_size'],
)
X_test_embeddings = get_cls_embeddings(
    X_test_text.tolist(),
    tokenizer,
    model,
    device,
    batch_size=config['batch_size'],
)


In [None]:
# ---------------------------------
# Persist the tokenizer and the embedding model used above
# ---------------------------------
save_tokenizer_and_model(tokenizer, model, save_dir=config['tokenizer_model_save_dir'])


In [None]:
###################################
# Create a Pipeline
###################################
# Baseline hyperparameters. When hyperparameter tuning is enabled, the search
# overrides the PCA size, C and the penalty with the sampled candidates.
pca_n_components = 512
logistic_regression_C = 0.1
logistic_regression_class_weight = 'balanced'

# Multi-class logistic regression. The 'saga' solver supports both the 'l1' and the
# 'l2' penalty that appear in the configured search spaces.
lr = LogisticRegression(
    solver='saga',
    C=logistic_regression_C,
    max_iter=8000,
    n_jobs=-1,
    random_state=config['random_state'],
    class_weight=logistic_regression_class_weight,
    verbose=1
)

# Scale -> reduce dimensionality -> classify.
# PCA is seeded as well: depending on the data shape it may pick a randomized SVD
# solver, which would otherwise make the fitted components differ between runs.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=pca_n_components, random_state=config['random_state'])),
    ('classifier', lr)
])


In [None]:
# ---------------------------------
# Hyperparameter tuning (grid or randomized search over the pipeline)
# ---------------------------------
if config['hyperparameter_tuning']['enabled']:
    tuning_cfg = config['hyperparameter_tuning']
    method = tuning_cfg['method']
    cv = tuning_cfg['cv']
    scoring = tuning_cfg['scoring']

    # Arguments that both search flavours have in common.
    shared_search_kwargs = dict(
        estimator=pipeline,
        cv=cv,
        scoring=scoring,
        verbose=2,
        n_jobs=-1,
    )

    if method == "grid":
        param_grid = tuning_cfg['param_grid']
        search = GridSearchCV(param_grid=param_grid, **shared_search_kwargs)
    elif method == "random":
        param_dist = tuning_cfg['param_dist']
        n_iter = tuning_cfg['n_iter']
        search = RandomizedSearchCV(
            param_distributions=param_dist,
            n_iter=n_iter,
            random_state=config['random_state'],
            **shared_search_kwargs,
        )
    else:
        raise ValueError("Invalid hyperparameter tuning method. Choose 'grid' or 'random'.")

    # Fit every candidate with cross-validation on the training embeddings.
    logging.info(f"Starting hyperparameter tuning using {method} search.")
    search.fit(X_train_embeddings, y_train)
    logging.info("Hyperparameter tuning completed.")
    logging.info(f"Best parameters found: {search.best_params_}")
    logging.info(f"Best cross-validation {scoring}: {search.best_score_}")

    # The best candidate is what gets evaluated and saved further down.
    best_pipeline = search.best_estimator_

    # Keep a record of the winning hyperparameters.
    save_best_parameters(search.best_params_, config['best_params_save_path'])
else:
    # No search: continue with the baseline pipeline, which is fitted in the next cell.
    best_pipeline = pipeline


In [None]:
# ---------------------------------
# Fit the baseline pipeline - only needed when no search was run
# ---------------------------------
tuning_enabled = config['hyperparameter_tuning']['enabled']
if not tuning_enabled:
    logging.info("Starting model training.")
    best_pipeline.fit(X_train_embeddings, y_train)
    logging.info("Model training completed.")


In [None]:
# ---------------------------------
# Predict the held-out test set with the selected pipeline
# ---------------------------------
logging.info("Generating predictions on test set.")
y_test_pred = best_pipeline.predict(
    X_test_embeddings
)


In [None]:
# ---------------------------------
# Score the test predictions and show the results
# ---------------------------------
metrics = evaluate_model(y_true=y_test, y_pred=y_test_pred, target_names=le.classes_)
print_evaluation_metrics(metrics)


In [None]:
# ---------------------------------
# Persist the fitted pipeline and the label encoder
# ---------------------------------
save_model(best_pipeline, save_path=config['model_save_path'])
save_label_encoder(le, save_path=config['label_encoder_save_path'])


In [None]:
# ---------------------------------
# Persist the evaluation metrics
# ---------------------------------
save_evaluation_metrics(metrics, save_path=config['metrics_save_path'])


In [None]:
###################################
# Save Best Hyperparameters
###################################
# The best parameters have already been saved in the Hyperparameter Tuning section.
# If you prefer to have a separate saving step, you can uncomment the following lines:

# best_params = search.best_params_
# save_best_parameters(best_params, config['best_params_save_path'])
