# Installing dependencies
## Please make a copy of this notebook.

# Huggingface login
You will need your personal Hugging Face access token.

In [None]:
# !huggingface-cli login

# Part 1: Load Data

## Downloading the train and test datasets

In [2]:
from datasets import load_dataset

# Load the "train" and "test" splits of the CISProject/FOX_NBC dataset.
# Later cells read the "title" and "news" columns of these datasets.
dataset_train = load_dataset("CISProject/FOX_NBC", split="train")
dataset_test = load_dataset("CISProject/FOX_NBC", split="test")

In [3]:
# NOTE: An earlier TF-IDF + sinusoidal-positional-encoding draft of
# `preprocess_data` used to sit here inside a bare triple-quoted string.
# It was never executed, referenced an undefined name (`tfidf_vectorizer`),
# and is superseded by the CountVectorizer / Word2Vec implementation defined
# right after this comment, so the dead text was removed. Recover it from
# version control if it is ever needed again.

import numpy as np
import torch
import re
from transformers import BertTokenizer
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_data(data,
                    mode="train",
                    vectorizer=None,
                    w2v_model=None,
                    max_features=4096,
                    max_seq_length=128,
                    num_proc=4):
    """Turn raw headlines into the three model inputs plus labels.

    Args:
        data: Dataset exposing "title" and "news" columns and a batched
            ``map`` method.
        mode: "train" fits a new vectorizer when none is supplied; any other
            value requires an already fitted ``vectorizer``.
        vectorizer: Fitted n-gram vectorizer to reuse, or None to fit one
            (train mode only).
        w2v_model: Word-vector lookup supporting ``token in w2v_model``,
            ``w2v_model[token]`` and ``w2v_model.vector_size``.
        max_features: Vocabulary size cap for a newly fitted vectorizer.
        max_seq_length: Number of token positions kept per headline, for both
            the BERT tokenization and the Word2Vec matrix.
        num_proc: Number of worker processes handed to ``data.map``.

    Returns:
        ``(processed_data, vectorizer)`` where ``processed_data`` gains the
        columns "freq_inputs", "seq_inputs", "pos_inputs" and "labels".

    Raises:
        ValueError: If ``w2v_model`` is missing, or if no ``vectorizer`` is
            given outside train mode.
    """
    # Validate up front so misuse fails immediately instead of deep inside the
    # (possibly multi-process) map call or after loading the tokenizer.
    if w2v_model is None:
        raise ValueError("w2v_model must be provided for Word2Vec embeddings.")
    if vectorizer is None and mode != "train":
        raise ValueError("N-gram vectorizer must be provided in test mode.")

    def clean_text(text):
        # Local import kept from the original nested function so the helper
        # carries its own dependency when shipped to map workers.
        import re
        # Lowercase, drop everything that is not a word character or
        # whitespace, then trim.
        return re.sub(r'[^\w\s]', '', text.lower()).strip()

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # After the validation above, a missing vectorizer implies train mode.
    # The full title column is only cleaned when it is actually needed for
    # fitting.
    if vectorizer is None:
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, 2))
        vectorizer.fit([clean_text(title) for title in data["title"]])
        print("N-gram vectorizer fitted on training data.")

    embedding_dim = w2v_model.vector_size

    def process_batch(batch):
        headlines = [clean_text(h) for h in batch["title"]]
        agencies = batch["news"]

        # N-gram count features.
        freq_inputs = vectorizer.transform(headlines).toarray()

        # BERT tokenization, padded/truncated to max_seq_length.
        tokenized = tokenizer(
            headlines,
            padding="max_length",
            truncation=True,
            max_length=max_seq_length,
            return_tensors="pt"
        )
        # seq_inputs stacks ids and mask on a new axis 1.
        seq_inputs = torch.stack(
            [tokenized["input_ids"], tokenized["attention_mask"]], dim=1
        )

        # Word2Vec lookup on whitespace tokens of the cleaned headline.
        # The buffer starts as zeros, so OOV tokens and padding positions stay
        # zero without building per-token Python lists.
        w2v_buffer = np.zeros(
            (len(headlines), max_seq_length, embedding_dim), dtype=np.float32
        )
        for row, text in enumerate(headlines):
            for col, tk in enumerate(text.split()[:max_seq_length]):
                if tk in w2v_model:
                    w2v_buffer[row, col] = w2v_model[tk]

        # Binary target: 1.0 for "fox", 0.0 for everything else.
        labels = [1.0 if agency == "fox" else 0.0 for agency in agencies]

        return {
            "freq_inputs": torch.tensor(freq_inputs).float(),
            "seq_inputs": seq_inputs,
            # Key name kept as "pos_inputs" because downstream code reads it.
            "pos_inputs": torch.from_numpy(w2v_buffer),
            "labels": torch.tensor(labels),
        }

    # Use `map` with batching and parallelism
    processed_data = data.map(
        process_batch,
        batched=True,
        batch_size=32,
        num_proc=num_proc
    )

    return processed_data, vectorizer



'\nimport numpy as np\nimport torch\nimport re\nfrom transformers import BertTokenizer\nfrom sklearn.feature_extraction.text import CountVectorizer\n\ndef preprocess_data(data,\n                    mode="train",\n                    vectorizer=None,\n                    w2v_model=None,\n                    max_features=4096,\n                    max_seq_length=128,\n                    num_proc=4):\n    # Ensure w2v_model is provided if we are replacing positional encoding\n    if w2v_model is None:\n        raise ValueError("w2v_model must be provided for Word2Vec embeddings.")\n\n    # Preprocessing function for headlines\n    def clean_text(text):\n        import re\n        # Convert to lowercase\n        text = text.lower()\n        # Remove punctuation (you can refine the regex based on your needs)\n        text = re.sub(r\'[^\\w\\s]\', \'\', text)\n        text = text.strip()\n        return text\n\n    # Pre-clean headlines in the entire dataset before fitting vectorizer\n    i

In [2]:
import numpy as np
import torch
import re
from transformers import BertTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors

def preprocess_data(data,
                    mode="train",
                    vectorizer=None,
                    w2v_model=None,
                    max_features=4096,
                    max_seq_length=128,
                    num_proc=4):
    """Build model inputs from raw headlines through a chain of `map` passes.

    The dataset gains, in order, the columns "clean_title", "freq_inputs",
    "input_ids", "attention_mask", "tokens", "pos_inputs", "labels" and
    finally "seq_inputs" (ids and mask stacked on axis 1).

    Args:
        data: Dataset exposing "title" and "news" columns and a batched
            ``map`` method.
        mode: "train" fits a new vectorizer when none is supplied; any other
            value requires an already fitted ``vectorizer``.
        vectorizer: Fitted n-gram vectorizer to reuse, or None to fit one
            (train mode only).
        w2v_model: Word-vector lookup supporting ``token in w2v_model``,
            ``w2v_model[token]`` and ``w2v_model.vector_size``.
        max_features: Vocabulary size cap for a newly fitted vectorizer.
        max_seq_length: Number of token positions kept per headline.
        num_proc: Number of worker processes handed to every ``map`` call.

    Returns:
        ``(processed_data, vectorizer)``.

    Raises:
        ValueError: If ``w2v_model`` is missing, or if no ``vectorizer`` is
            given outside train mode.
    """
    if w2v_model is None:
        raise ValueError("w2v_model must be provided for Word2Vec embeddings.")
    # Fail fast: without this check a missing vectorizer only surfaced as an
    # AttributeError on `None.transform` inside the map workers.
    if vectorizer is None and mode != "train":
        raise ValueError("N-gram vectorizer must be provided in test mode.")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 1. Clean text once
    def clean_text(examples):
        # Local import kept so the mapped function carries its own dependency.
        import re
        cleaned = [
            re.sub(r'[^\w\s]', '', text.lower()).strip()
            for text in examples["title"]
        ]
        return {"clean_title": cleaned}

    data = data.map(clean_text, batched=True, num_proc=num_proc)

    # 2. Fit CountVectorizer on training data if needed
    if mode == "train" and vectorizer is None:
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, 2))
        vectorizer.fit(data["clean_title"])
        print("N-gram vectorizer fitted on training data.")

    # 3. Transform titles with vectorizer once
    def vectorize_batch(examples):
        freq = vectorizer.transform(examples["clean_title"]).toarray()
        return {"freq_inputs": freq}

    data = data.map(vectorize_batch, batched=True, batch_size=32, num_proc=num_proc)

    # 4. Tokenize with BERT once
    def tokenize_batch(examples):
        tokenized = tokenizer(
            examples["clean_title"],
            padding="max_length",
            truncation=True,
            max_length=max_seq_length
        )
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"]
        }

    data = data.map(tokenize_batch, batched=True, batch_size=32, num_proc=num_proc)

    # 5. Convert titles into whitespace tokens for the W2V lookup
    def split_tokens(examples):
        return {"tokens": [t.split() for t in examples["clean_title"]]}

    data = data.map(split_tokens, batched=True, num_proc=num_proc)

    # Resolve every distinct token against the W2V model exactly once, so the
    # per-example pass below is a plain dict lookup.
    unique_tokens = set()
    for tokens in data["tokens"]:
        unique_tokens.update(tokens)

    embedding_dim = w2v_model.vector_size
    # One shared zero vector serves both OOV tokens and padding positions; it
    # is only ever read, never mutated.
    zero_vector = np.zeros(embedding_dim)
    embedding_dict = {
        tk: (w2v_model[tk] if tk in w2v_model else zero_vector)
        for tk in unique_tokens
    }

    # 6. Create W2V embeddings for each example
    def w2v_embedding_batch(examples):
        batch_w2v = []
        for tokens in examples["tokens"]:
            vectors = [embedding_dict[tk] for tk in tokens[:max_seq_length]]
            # Pad up to max_seq_length with zero vectors.
            vectors += [zero_vector] * (max_seq_length - len(vectors))
            batch_w2v.append(vectors)
        return {"pos_inputs": batch_w2v}

    data = data.map(w2v_embedding_batch, batched=True, batch_size=32, num_proc=num_proc)

    # 7. Create labels: 1.0 for "fox", 0.0 for everything else
    def make_labels(examples):
        labels = [1.0 if agency == "fox" else 0.0 for agency in examples["news"]]
        return {"labels": labels}

    data = data.map(make_labels, batched=True, num_proc=num_proc)

    # 8. Final pass: cast every feature to its target dtype and stack ids/mask
    def to_tensors(examples):
        freq_inputs = torch.tensor(examples["freq_inputs"], dtype=torch.float)
        input_ids = torch.tensor(examples["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(examples["attention_mask"], dtype=torch.long)
        pos_inputs = torch.tensor(examples["pos_inputs"], dtype=torch.float)
        labels = torch.tensor(examples["labels"], dtype=torch.float)

        # seq_inputs stacks ids and mask on a new axis 1.
        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)

        return {
            "freq_inputs": freq_inputs,
            "seq_inputs": seq_inputs,
            "pos_inputs": pos_inputs,
            "labels": labels
        }

    # Note: If your dataset is large, ensure enough memory. Otherwise, you can
    # convert to tensors in the dataloader.
    processed_data = data.map(to_tensors, batched=True, batch_size=32, num_proc=num_proc)

    return processed_data, vectorizer


In [3]:
from gensim.models import KeyedVectors
# Load pretrained word vectors from a local file in binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)

# NOTE(review): this cell rebinds `dataset_train` / `dataset_test`, which must
# already exist from the dataset-loading cell; the recorded NameError output
# suggests it was last run on a kernel where that cell had not been executed.
# Train mode: no vectorizer is passed, so preprocess_data fits one and returns it.
dataset_train,vectorizer = preprocess_data(
    data=dataset_train,
    mode="train",
    w2v_model=w2v_model,
    max_features=8192,
    max_seq_length=128
)

# Test mode: reuse the vectorizer returned by the training call above.
dataset_test, _ = preprocess_data(
    data=dataset_test,
    mode="test",
    vectorizer=vectorizer,
    w2v_model=w2v_model,
    max_features=8192,
    max_seq_length=128
)

NameError: name 'dataset_train' is not defined

In [None]:
# Print both processed datasets to inspect the columns added by preprocessing.
print(dataset_train)
print(dataset_test)

# Part 2: Model

## Defining the Custom Model

In [None]:
# TODO: import all packages necessary for your custom model
import pandas as pd
import os
from torch.utils.data import DataLoader
from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel

from model.network import Classifier
from model.frequential import FreqNetwork
from model.sequential import SeqNetwork
from model.positional import PosNetwork

class CustomConfig(PretrainedConfig):
    """Configuration for the headline classifier.

    Args:
        base_exp_dir: Root directory for experiment artifacts (the Trainer
            writes checkpoints under ``<base_exp_dir>/checkpoints``).
        train: Training hyper-parameters (learning rate, schedule bounds,
            batch size, save/validation frequency). ``None`` selects the
            defaults listed in ``__init__``.
        model: Per-branch keyword arguments, keyed "freq", "seq", "pos" and
            "cls", that are unpacked into the corresponding network
            constructors. ``None`` selects the defaults listed in ``__init__``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "headlineclassifier"

    def __init__(
        self,
        base_exp_dir="./exp/fox_nbc/",
        # dataset={"data_dir": "./data/CASE_NAME/data.csv", "transform": True},
        train=None,
        model=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # The defaults are built per call. As dict literals in the signature
        # they were created once and shared by every instance, so mutating
        # `config.train[...]` on one config silently changed all later ones.
        if train is None:
            train = {
                "learning_rate": 5e-4,
                "learning_rate_alpha": 0.05,
                "end_iter": 10,
                "batch_size": 64,
                "warm_up_end": 2,
                "anneal_end": 5,
                "save_freq": 5,
                "val_freq": 1,
            }
        if model is None:
            model = {
                "freq": {
                    # Matches max_features=8192 used in the preprocessing cell.
                    "tfidf_input_dim": 8192,
                    "tfidf_output_dim": 128,
                    "tfidf_hidden_dim": 512,
                    "n_layers": 4,
                    "skip_in": [80],
                    "weight_norm": True,
                },
                "seq": {
                    "input_dim": 768, #512
                    "output_dim": 128,
                    "hidden_dim": 256,
                    "lstm_in": 768,
                    "n_layers": 4,
                    "skip_in": [80],
                    "weight_norm": True,
                    "freeze": True,
                    "use_LSTM": False,
                },
                "pos": {
                    # NOTE(review): the notebook loads 300-d GoogleNews vectors
                    # as "pos_inputs"; confirm PosNetwork expects 512 here.
                    "input_dim": 512,
                    "output_dim": 128,
                    "hidden_dim": 256,
                    "n_layers": 4,
                    "skip_in": [80],
                    "weight_norm": True,
                },
                "cls": {
                    # 3 branches x 128 features each, concatenated.
                    "combined_input": 384,
                    "combined_dim": 128,
                    "num_classes": 1,
                    "n_layers": 4,
                    "skip_in": [80],
                    "weight_norm": True,
                },
            }

        self.base_exp_dir = base_exp_dir
        # self.dataset = dataset
        self.train = train
        self.model = model

# TODO: define all parameters needed for your model, as well as calling the model itself
# TODO: define all parameters needed for your model, as well as calling the model itself
class CustomModel(PreTrainedModel):
    """Three-branch headline classifier with a fused head and one auxiliary head per branch.

    The branch networks and the fused classifier are configured from
    ``config.model``; the three auxiliary heads use fixed settings.
    """

    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)
        self.conf = config

        branch_settings = self.conf.model
        self.freq = FreqNetwork(**branch_settings["freq"])
        self.seq = SeqNetwork(**branch_settings["seq"])
        self.pos = PosNetwork(**branch_settings["pos"])
        self.cls = Classifier(**branch_settings["cls"])

        # The auxiliary heads share one fixed configuration. Attribute names
        # are kept as cls1/cls2/cls3 and created in the same order as before.
        aux_head_settings = dict(
            combined_input=128,
            combined_dim=64,
            num_classes=1,
            n_layers=2,
            skip_in=(4,),
            weight_norm=True,
        )
        self.cls1 = Classifier(**aux_head_settings)
        self.cls2 = Classifier(**aux_head_settings)
        self.cls3 = Classifier(**aux_head_settings)

    def forward(self, x):
        """Run all branches on a dict with "freq_inputs", "seq_inputs" and "pos_inputs".

        Returns a 4-tuple: fused output, then the frequency, sequence and
        positional auxiliary outputs.
        """
        freq_inputs = x["freq_inputs"]
        seq_inputs = x["seq_inputs"]
        pos_inputs = x["pos_inputs"]

        # Index 0 / 1 on axis 1 are the two planes stacked during
        # preprocessing (token ids and attention mask).
        token_ids = seq_inputs[:, 0, :]
        token_mask = seq_inputs[:, 1, :]

        # Per the original annotations each branch yields (batch_size, 128).
        seq_feature = self.seq(token_ids, token_mask)
        freq_feature = self.freq(freq_inputs)
        pos_feature = self.pos(pos_inputs)

        # Concatenate along the feature axis -> (batch_size, 384).
        fused = torch.cat((seq_feature, freq_feature, pos_feature), dim=1)

        return (
            self.cls(fused),
            self.cls1(freq_feature),
            self.cls2(seq_feature),
            self.cls3(pos_feature),
        )

    def save_model(self, save_path):
        """Write the model to ``save_path`` via ``save_pretrained``."""
        self.save_pretrained(save_path)

    def push_model(self, repo_name):
        """Upload the model to ``repo_name`` via ``push_to_hub``."""
        self.push_to_hub(repo_name)

In [None]:
from huggingface_hub import hf_hub_download

# Register the custom config/model pair with the Auto* factories under the
# "headlineclassifier" model type.
AutoConfig.register("headlineclassifier", CustomConfig)
AutoModel.register(CustomConfig, CustomModel)
# Build a fresh model from the default configuration.
config = CustomConfig()
model = CustomModel(config)

# Hub repository name, later passed to `model.push_model(...)`.
REPO_NAME = "CISProject/News-Headline-Classifier-Notebook" # TODO: PROVIDE A STRING TO YOUR REPO ON HUGGINGFACE

In [None]:
import torch
from tqdm import tqdm
import os


class Trainer:
    """Epoch-based training loop with warm-up + cosine learning-rate schedule.

    Args:
        model: Module whose forward takes a dict with "freq_inputs",
            "seq_inputs" and "pos_inputs" and returns a 4-tuple whose first
            element holds the logits used for the loss. It must also provide
            ``save_pretrained``.
        train_loader: Iterable yielding ``(inputs_dict, labels)`` batches.
        val_loader: Same format; used by :meth:`val`.
        config: Object with ``base_exp_dir`` and a ``train`` dict holding
            "end_iter", "save_freq", "val_freq", "batch_size",
            "learning_rate", "learning_rate_alpha", "warm_up_end" and
            "anneal_end".
        device: Target device. A CUDA device silently requested on a machine
            without CUDA falls back to the CPU (with a printed notice).
    """

    # Keys of the model input dict, in the order they are moved to the device.
    _INPUT_KEYS = ("freq_inputs", "seq_inputs", "pos_inputs")

    def __init__(self, model, train_loader, val_loader, config, device="cuda"):
        # The default is "cuda"; do not crash on CPU-only machines.
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            print("CUDA is not available; falling back to CPU.")
            device = "cpu"

        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.conf = config

        self.end_iter = self.conf.train["end_iter"]
        self.save_freq = self.conf.train["save_freq"]
        self.val_freq = self.conf.train["val_freq"]

        self.batch_size = self.conf.train['batch_size']
        self.learning_rate = self.conf.train['learning_rate']
        self.learning_rate_alpha = self.conf.train['learning_rate_alpha']
        self.warm_up_end = self.conf.train['warm_up_end']
        self.anneal_end = self.conf.train['anneal_end']

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = torch.nn.BCEWithLogitsLoss()

        self.save_path = os.path.join(self.conf.base_exp_dir, "checkpoints")
        os.makedirs(self.save_path, exist_ok=True)

        # Counts completed epochs; drives the learning-rate schedule.
        self.iter_step = 0

        # Lowest summed validation loss seen so far (None until first val()).
        self.val_loss = None

    def get_cos_anneal_ratio(self):
        """Return ``iter_step / anneal_end`` clipped to 1.0 (1.0 if anneal_end is 0)."""
        if self.anneal_end == 0.0:
            return 1.0
        return min(1.0, self.iter_step / self.anneal_end)

    def update_learning_rate(self):
        """Set the optimizer learning rate for the current ``iter_step``.

        Linear warm-up for the first ``warm_up_end`` steps, then cosine decay
        from ``learning_rate`` down to ``learning_rate * learning_rate_alpha``
        at ``end_iter``.
        """
        if self.iter_step < self.warm_up_end:
            # `+ 1` keeps the factor strictly positive. The schedule advances
            # once per epoch, so a factor of 0 at step 0 would spend the whole
            # first epoch with a learning rate of zero.
            learning_factor = (self.iter_step + 1) / self.warm_up_end
        else:
            alpha = self.learning_rate_alpha
            decay_span = self.end_iter - self.warm_up_end
            if decay_span <= 0:
                # No room for a decay phase; avoid dividing by zero.
                progress = 1.0
            else:
                progress = min(1.0, (self.iter_step - self.warm_up_end) / decay_span)
            learning_factor = (np.cos(np.pi * progress) + 1.0) * 0.5 * (1 - alpha) + alpha

        for g in self.optimizer.param_groups:
            g['lr'] = self.learning_rate * learning_factor

    def _forward_batch(self, batch_inputs, labels):
        """Move one batch to the device, run the model, return ``(logits, targets)``."""
        model_inputs = {key: batch_inputs[key].to(self.device) for key in self._INPUT_KEYS}
        # Add a trailing axis so targets line up with the logits in the loss.
        targets = labels.to(self.device)[:, None]
        # Only the fused head's logits are used; the three per-branch outputs
        # are ignored by this trainer.
        logits, _freq_out, _seq_out, _pos_out = self.model(model_inputs)
        return logits, targets

    def train(self):
        """Train for ``end_iter`` epochs, validating and checkpointing on schedule."""
        for epoch in range(self.end_iter):
            self.update_learning_rate()
            self.model.train()
            epoch_loss = 0.0
            correct = 0
            total = 0

            for batch_inputs, labels in tqdm(self.train_loader, desc=f"Epoch {epoch + 1}/{self.end_iter}"):
                logits, y_train = self._forward_batch(batch_inputs, labels)
                loss = self.criterion(logits, y_train)

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # Metrics (thresholded at probability 0.5, same as validation).
                preds = (torch.sigmoid(logits.detach()) > 0.5).float()
                epoch_loss += loss.item()
                total += y_train.size(0)
                correct += (preds == y_train).sum().item()

            # Log epoch metrics
            print(f"Train Loss: {epoch_loss / len(self.train_loader):.4f}")
            print(f"Train Accuracy: {correct / total:.4f}")

            # Validation and Save Checkpoints
            if (epoch + 1) % self.val_freq == 0:
                self.val()
            if (epoch + 1) % self.save_freq == 0:
                self.save_checkpoint(epoch + 1)

            # Advance the schedule; the new rate is applied at the top of the
            # next epoch.
            self.iter_step += 1

    def val(self):
        """Evaluate on ``val_loader``; save a "best" checkpoint when the summed loss improves."""
        self.model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch_inputs, labels in tqdm(self.val_loader, desc="Validation", leave=False):
                logits, y_val = self._forward_batch(batch_inputs, labels)
                loss = self.criterion(logits, y_val)

                preds = (torch.sigmoid(logits) > 0.5).float()
                val_loss += loss.item()
                total += y_val.size(0)
                correct += (preds == y_val).sum().item()

        if self.val_loss is None or val_loss < self.val_loss:
            self.val_loss = val_loss
            self.save_checkpoint("best")
        # Log validation metrics
        print(f"Validation Loss: {val_loss / len(self.val_loader):.4f}")
        print(f"Validation Accuracy: {correct / total:.4f}")

    def save_checkpoint(self, epoch):
        """Save the model with ``save_pretrained``.

        ``epoch`` is either an epoch number (saved to
        ``checkpoint_epoch_<epoch>``) or the string "best" (saved to ``best``).
        """
        dir_name = "best" if epoch == "best" else f"checkpoint_epoch_{epoch}"
        checkpoint_dir = os.path.join(self.save_path, dir_name)
        self.model.save_pretrained(checkpoint_dir)
        print(f"Checkpoint saved at {checkpoint_dir}")

In [None]:
from torch.utils.data import DataLoader

# Define a collate function to handle the batched data
def collate_fn(batch):
    """Collate a list of examples into ``(inputs_dict, labels)``.

    Args:
        batch: Sequence of mappings, each with "freq_inputs", "seq_inputs",
            "pos_inputs" and "labels". Values may be nested lists, scalars,
            NumPy arrays or tensors.

    Returns:
        A dict holding the three stacked input tensors (new leading batch
        axis) and a 1-D tensor of labels.
    """
    def _stack(key):
        # `as_tensor` converts lists/scalars like `torch.tensor` does, but
        # passes existing tensors through without the copy-construct warning.
        return torch.stack([torch.as_tensor(item[key]) for item in batch])

    inputs = {key: _stack(key) for key in ("freq_inputs", "seq_inputs", "pos_inputs")}
    labels = _stack("labels")
    return inputs, labels

# Batch size comes from the training config; only the training loader is shuffled.
train_loader = DataLoader(dataset_train, batch_size=config.train["batch_size"], shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(dataset_test, batch_size=config.train["batch_size"], shuffle=False,collate_fn=collate_fn)
# NOTE(review): the test loader is passed as the Trainer's validation loader, so
# the "best" checkpoint is selected on test data — confirm this is intended.
trainer = Trainer(model, train_loader, test_loader, config)

# Train the model
trainer.train()
# Save the final model in Hugging Face format
# (written to the same "checkpoints" directory the Trainer saves into).
final_save_path = os.path.join(config.base_exp_dir, "checkpoints")
model.save_pretrained(final_save_path)
print(f"Final model saved at {final_save_path}")


## Evaluate Model

In [None]:
from transformers import AutoConfig, AutoModel
from sklearn.metrics import accuracy_score, classification_report
def load_last_checkpoint(checkpoint_dir):
    """Load a saved model from ``checkpoint_dir``.

    The ``best`` sub-directory is preferred when it exists; otherwise the
    ``checkpoint_epoch_<N>`` sub-directory with the highest ``N`` is used.

    Args:
        checkpoint_dir: Directory containing the checkpoint sub-directories.

    Returns:
        The model returned by ``AutoModel.from_pretrained``.

    Raises:
        FileNotFoundError: If neither ``best`` nor any numbered epoch
            checkpoint exists.
    """
    best_checkpoint = os.path.join(checkpoint_dir, "best")
    # The previous check (`os.path.join(...) is not None`) was always true, so
    # "best" was selected even when it did not exist on disk.
    if os.path.isdir(best_checkpoint):
        selected = best_checkpoint
    else:
        prefix = "checkpoint_epoch_"
        # Keep only names with a purely numeric suffix so `int()` cannot fail
        # on stray entries.
        numbered = [
            name for name in os.listdir(checkpoint_dir)
            if name.startswith(prefix) and name[len(prefix):].isdecimal()
        ]
        if not numbered:
            raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}!")
        latest = max(numbered, key=lambda name: int(name[len(prefix):]))
        selected = os.path.join(checkpoint_dir, latest)

    print(f"Loading checkpoint from {selected}")
    # Load model and config
    config = AutoConfig.from_pretrained(selected)
    model = AutoModel.from_pretrained(selected, config=config)
    return model

# Step 1: Define paths and setup
checkpoint_dir = os.path.join(config.base_exp_dir, "checkpoints")  # Directory where checkpoints are stored
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Rebinds `model` to the checkpoint loaded from disk.
model = load_last_checkpoint(checkpoint_dir)
model.to(device)

# Same loss class the Trainer uses.
criterion = torch.nn.BCEWithLogitsLoss()

def evaluate_model(model, val_loader, criterion, device=None):
    """Score ``model`` on ``val_loader``.

    Args:
        model: Module whose forward takes a dict with "freq_inputs",
            "seq_inputs" and "pos_inputs" and returns a 4-tuple whose first
            element holds the logits.
        val_loader: Iterable yielding ``(inputs_dict, labels)`` batches.
        criterion: Unused; kept so existing calls keep working.
        device: Device the inputs are moved to. ``None`` (default) uses the
            device of the model's first parameter, so the function works on
            CPU-only machines without an explicit argument.

    Returns:
        ``(accuracy, report)`` from ``accuracy_score`` and
        ``classification_report``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_inputs, labels in tqdm(val_loader, desc="Testing", leave=False):
            model_inputs = {
                key: batch_inputs[key].to(device)
                for key in ("freq_inputs", "seq_inputs", "pos_inputs")
            }
            # Only the fused head's logits are scored; the three per-branch
            # outputs are ignored.
            logits, _freq_out, _seq_out, _pos_out = model(model_inputs)
            preds = (torch.sigmoid(logits) > 0.5).float()

            # Flatten to plain 1-D lists so the metrics receive 1-D targets.
            all_preds.extend(preds.reshape(-1).cpu().tolist())
            all_labels.extend(labels.reshape(-1).cpu().tolist())

    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)


# Pass the device chosen earlier in this cell: evaluate_model otherwise uses its
# own default, which does not match a model that was moved to the CPU.
accuracy, report = evaluate_model(model, test_loader, criterion, device=device)
print(f"Accuracy: {accuracy:.4f}")
print(report)


# Part 3: Pushing the Model to the Hugging Face Hub

In [None]:
# Upload the model to the repository named by REPO_NAME (push_model wraps push_to_hub).
model.push_model(REPO_NAME)

### NOTE: You need to ensure that your Hugging Face token has both read and write access to your repository and Hugging Face organization.

In [None]:
# Load model directly
from transformers import AutoModel, AutoConfig
# Rebinds `config` and `model` to the versions fetched from the repository
# (the same string as REPO_NAME).
# NOTE(review): presumably this needs the AutoConfig/AutoModel registrations
# from the earlier cell to have run in this kernel — confirm.
config = AutoConfig.from_pretrained("CISProject/News-Headline-Classifier-Notebook")
model = AutoModel.from_pretrained("CISProject/News-Headline-Classifier-Notebook",config = config)

In [None]:
from transformers import AutoConfig, AutoModel
from sklearn.metrics import accuracy_score, classification_report

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Same loss class the Trainer uses.
criterion = torch.nn.BCEWithLogitsLoss()

# NOTE(review): `evaluate_model` is also defined in the earlier evaluation cell;
# consider keeping a single definition.
def evaluate_model(model, val_loader, criterion, device=None):
    """Score ``model`` on ``val_loader``.

    Args:
        model: Module whose forward takes a dict with "freq_inputs",
            "seq_inputs" and "pos_inputs" and returns a 4-tuple whose first
            element holds the logits.
        val_loader: Iterable yielding ``(inputs_dict, labels)`` batches.
        criterion: Unused; kept so existing calls keep working.
        device: Device the inputs are moved to. ``None`` (default) uses the
            device of the model's first parameter, so the function works on
            CPU-only machines without an explicit argument.

    Returns:
        ``(accuracy, report)`` from ``accuracy_score`` and
        ``classification_report``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_inputs, labels in tqdm(val_loader, desc="Testing", leave=False):
            model_inputs = {
                key: batch_inputs[key].to(device)
                for key in ("freq_inputs", "seq_inputs", "pos_inputs")
            }
            # Only the fused head's logits are scored; the three per-branch
            # outputs are ignored.
            logits, _freq_out, _seq_out, _pos_out = model(model_inputs)
            preds = (torch.sigmoid(logits) > 0.5).float()

            # Flatten to plain 1-D lists so the metrics receive 1-D targets.
            all_preds.extend(preds.reshape(-1).cpu().tolist())
            all_labels.extend(labels.reshape(-1).cpu().tolist())

    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)


# Pass the device chosen earlier in this cell: evaluate_model otherwise uses its
# own default, which does not match a model that was moved to the CPU.
accuracy, report = evaluate_model(model, test_loader, criterion, device=device)
print(f"Accuracy: {accuracy:.4f}")
print(report)
