In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
class DataFrames:
    def __init__(
        self,
        train_file_name: str,
        test_file_name: str,
        evaluation_file_name: str,
    ) -> None:
        self.content_col_name: str = "content"
        self.content_clean_col_name: str = "content_clean"
        self.content_clean_col_name_no_UNK: str = "content_clean_no_UNK"
        self.content_clean_col_name_truc: str = "content_clean_truc"
        self.label_col_name: str = "label"
        self.train: pd.DataFrame = self.load_data(train_file_name)
        self.test: pd.DataFrame = self.load_data(test_file_name)
        self.evaluation: pd.DataFrame = self.load_data(evaluation_file_name)

    def load_data(self, file_name: str) -> pd.DataFrame:
        df = pd.read_csv(f"dataset/{file_name}.csv", sep=";")
        df.dropna(inplace=True)
        if not self.content_clean_col_name in df.columns:
            df[self.content_col_name] = df["title"] + " " + df["text"]
            df = df.drop(columns=["Unnamed: 0"])
        return df

    def get_datasets(self) -> list[pd.DataFrame]:
        return [self.train, self.test, self.evaluation]

    def get_info(self) -> str:
        test_info = self.test.shape
        train_info = self.train.shape
        evaluation_info = self.evaluation.shape
        return f"DataFrame Shapes:\n\tTrain: {train_info}\n\tTest: {test_info}\n\tEvaluation: {evaluation_info}\n"

    def save_clean(self, token_limit: int = 10000, num_words_trunc: int = 256) -> None:
        # CLEAN
        self._init_clean_content([self.train, self.test, self.evaluation])
        self.train = self.clean_df(self.train)
        self.test = self.clean_df(self.test)
        self.evaluation = self.clean_df(self.evaluation)

        most_common_words = self.get_most_common_words_counter(
            [self.test, self.train],
            token_limit,
            self.content_clean_col_name,
        )
        self.train = DataFrames.set_least_common_UNK(
            self.train, "content_clean", most_common_words
        )
        self.test = DataFrames.set_least_common_UNK(
            self.test, "content_clean", most_common_words
        )
        self.evaluation = DataFrames.set_least_common_UNK(
            self.evaluation, "content_clean", most_common_words
        )
        self.train = DataFrames.drop_least_common(
            self.train,
            self.content_clean_col_name,
            self.content_clean_col_name_no_UNK,
            most_common_words,
        )
        self.test = DataFrames.drop_least_common(
            self.test,
            self.content_clean_col_name,
            self.content_clean_col_name_no_UNK,
            most_common_words,
        )
        self.evaluation = DataFrames.drop_least_common(
            self.evaluation,
            self.content_clean_col_name,
            self.content_clean_col_name_no_UNK,
            most_common_words,
        )
        self.train = DataFrames.trunc_text(
            self.train,
            self.content_clean_col_name_no_UNK,
            self.content_clean_col_name_truc,
            num_words_trunc,
        )
        self.test = DataFrames.trunc_text(
            self.test,
            self.content_clean_col_name_no_UNK,
            self.content_clean_col_name_truc,
            num_words_trunc,
        )
        self.evaluation = DataFrames.trunc_text(
            self.evaluation,
            self.content_clean_col_name_no_UNK,
            self.content_clean_col_name_truc,
            num_words_trunc,
        )
        print(self.test.columns)
        # SAVE
        cols_to_save_clean: list[str] = [
            self.content_clean_col_name,
            self.content_clean_col_name_no_UNK,
            self.content_clean_col_name_truc,
            self.label_col_name,
        ]
        self.train.to_csv(
            f"dataset/train_clean.csv",
            sep=";",
            columns=cols_to_save_clean,
        )
        self.test.to_csv(
            f"dataset/test_clean.csv",
            sep=";",
            columns=cols_to_save_clean,
        )
        self.evaluation.to_csv(
            f"dataset/evaluation_clean.csv",
            sep=";",
            columns=cols_to_save_clean,
        )

    def num_unique_words(self, col_name: str) -> int:
        result: set = set()
        df = pd.concat([self.train, self.test, self.evaluation], ignore_index=True)
        df[col_name].str.lower().str.split().apply(result.update)
        return len(result)

    def get_vocab(self, col_name) -> dict[str, int]:
        vectorizer = CountVectorizer()
        for df in [self.train, self.test, self.evaluation]:
            vectorizer.fit_transform(df[col_name].values)
        return vectorizer.vocabulary_

    def _init_clean_content(self, dfs: list[pd.DataFrame]) -> list[pd.DataFrame]:
        for df in dfs:
            df[self.content_clean_col_name] = df[self.content_col_name]
        return dfs

    def clean_df(self, df: pd.DataFrame) -> pd.DataFrame:
        df = self.to_lower(df, self.content_clean_col_name)
        df = self.remove_punctuation(df, self.content_clean_col_name)
        df = self.remove_stopword(df, self.content_clean_col_name)
        return df

    @staticmethod
    def to_lower(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
        df[col_name] = df[col_name].apply(lambda x: str(x).lower())
        return df

    @staticmethod
    def remove_punctuation(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
        re_punctuation = f'[{re.escape(string.punctuation)}"”“]'
        df[col_name] = df[col_name].apply(
            lambda x: re.sub(re_punctuation, " ", str(x))
            .lower()
            .replace("'s", "")
            .replace("’s", "")
        )
        return df

    @staticmethod
    def remove_stopword(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
        stop_words = set(stopwords.words("english"))
        df[col_name] = df[col_name].apply(
            lambda x: " ".join(
                word for word in str(x).split() if not word in stop_words
            )
        )
        return df

    @staticmethod
    def get_most_common_words_counter(
        dfs: list[pd.DataFrame],
        token_limit: int,
        col_name: str,
    ) -> Counter:
        word_counter: Counter = Counter()
        for df in dfs:
            if col_name not in df.columns:
                raise ValueError("Each DataFrame must have a 'clean_content' column")
            tokens = " ".join(df[col_name].astype(str)).split()
            word_counter.update(tokens)
        return Counter(dict(word_counter.most_common(token_limit)))

    @staticmethod
    def set_least_common_UNK(
        df: pd.DataFrame,
        col_name: str,
        most_common_words: Counter,
    ) -> pd.DataFrame:
        df[col_name] = df[col_name].apply(
            lambda x: " ".join(
                [
                    word if word in most_common_words else "<UNK>"
                    for word in str(x).split()
                ]
            )
        )
        return df

    @staticmethod
    def drop_least_common(
        df: pd.DataFrame,
        col_name: str,
        col_name_no_unk: str,
        most_common_words: Counter,
    ) -> pd.DataFrame:
        df[col_name_no_unk] = df[col_name].apply(
            lambda x: " ".join(
                [word for word in str(x).split() if word in most_common_words]
            )
        )
        return df

    @staticmethod
    def trunc_text(
        df: pd.DataFrame,
        col_name: str,
        col_name_trunc: str,
        trunc_num: int,
    ) -> pd.DataFrame:
        df[col_name_trunc] = df[col_name].apply(
            lambda x: " ".join(str(x).split()[:trunc_num])
        )
        return df

    @staticmethod
    def label_to_str(label: int) -> str:
        return "Fake" if label == 1 else "Not Fake"

In [4]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import (
    pack_padded_sequence,
    pad_sequence,
    pad_packed_sequence,
    PackedSequence,
)
from sklearn.metrics import classification_report

In [5]:
class GRUClassifier(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int,
        hidden_dim: int,
        num_layers: int,
        dropout: float,
        lr: float,
        *kwargs,
        **args,
    ) -> None:
        super().__init__(*kwargs, **args)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.embedding = nn.Embedding(vocab_size, embedding_dim).to(self.device)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        ).to(self.device)
        self.fc = nn.Linear(hidden_dim, 1).to(self.device)
        self.sigmoid = nn.Sigmoid()
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = optim.Adam(
            list(self.embedding.parameters())
            + list(self.gru.parameters())
            + list(self.fc.parameters()),
            lr=lr,
        )

    def __str__(self) -> str:
        return (
            f"GRUClassifier:\n"
            f"Vocab size: {self.embedding.num_embeddings}, "
            f"Embedding dim: {self.embedding.embedding_dim}, "
            f"Hidden dim: {self.gru.hidden_size}, "
            f"Layers: {self.gru.num_layers}, "
        )

    def print_info(self) -> None:
        print(self)

    def _pass(self, packed_input: PackedSequence) -> torch.Tensor:
        packed_output, _ = self.gru(packed_input)
        gru_output, _ = pad_packed_sequence(packed_output, batch_first=True)
        output = self.fc(gru_output)
        output = self.sigmoid(output)
        return output

    def forward(
        self,
        X: PackedSequence,
        y: torch.Tensor,
        num_epoch: int,
        batch_size: int,
    ) -> float:
        self.embedding.train()
        self.gru.train()
        total_loss: float = 0

        for epoch in range(num_epoch):
            running_loss = 0.0
            for i in range(0, len(X), batch_size):
                batch_X = X[i : i + batch_size]
                batch_y = y[i : i + batch_size]
                if batch_X[0] is None or batch_X[1] is None:
                    continue
                batch_X = PackedSequence(
                    batch_X[0].to(self.device),
                    batch_X[1],
                )
                batch_y = batch_y.to(self.device)
                self.optimizer.zero_grad()
                output = self._pass(batch_X)
                loss = self.criterion(output.squeeze(), batch_y.float())
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            total_loss += running_loss / len(X)
            print(f"Epoch [{epoch+1}/{num_epoch}], Loss: {running_loss/len(X)}")
        return total_loss / num_epoch

    def predict(self, X: PackedSequence):
        self.embedding.eval()
        self.gru.eval()
        self.fc.eval()
        with torch.no_grad():
            X = X.to(self.device)
            gru_output, _ = self.gru(X)
            logits = self.fc(gru_output[:, -1, :]).squeeze(1)
            return torch.sigmoid(logits).cpu().numpy()

    def print_report(self, y_test, y_hat) -> None:
        report = classification_report(y_test, y_hat, target_names=["Not Fake", "Fake"])
        print(f"{self.__class__.__name__} Report:\n{report}")

    def get_embedded(self, X, y, vocab: dict[str, int]):
        self.embedding.eval()
        all_embeddings = []
        valid_indices = []
        valid_labels = []
        for idx, text in enumerate(X):
            if text.strip():
                words = text.split()
                indices = [vocab.get(word, -1) for word in words]
                indices = list(filter(lambda x: x != -1, indices))
                if len(indices) > 0:
                    indices_tensor = torch.tensor(indices, dtype=torch.long).to(
                        self.device
                    )
                    text_embedding = self.embedding(indices_tensor)
                    all_embeddings.append(text_embedding)
                    valid_indices.append(idx)
                    valid_labels.append(y[idx])
        lengths = torch.tensor([seq.size(0) for seq in all_embeddings])
        sorted_lengths, sorted_idx = lengths.sort(0, descending=True)
        sorted_sequences = [all_embeddings[i] for i in sorted_idx]
        padded_sequences = pad_sequence(sorted_sequences, batch_first=True).to(
            self.device
        )
        packed_input = pack_padded_sequence(
            padded_sequences,
            sorted_lengths,
            batch_first=True,
            enforce_sorted=True,
        )
        return packed_input.cpu(), torch.tensor(valid_labels).cpu()


def print_alocated_gpu_mem():
    allocated_memory = torch.cuda.memory_allocated()
    cached_memory = torch.cuda.memory_reserved()
    allocated_memory_gb = allocated_memory / (1024**3)
    cached_memory_gb = cached_memory / (1024**3)
    print(f"Memory allocated: {allocated_memory_gb:.2f} GB")
    print(f"Memory cached: {cached_memory_gb:.2f} GB")


def run_gru(
    data_frames: DataFrames,
    content_col_name: str,
    label_col_name: str,
    vocab_size: int,
    embedding_dim: int = 8,
    hidden_dim: int = 64,
    num_layers: int = 1,
    dropout: float = 0.25,
    lr: float = 0.001,
    epochs: int = 2,
    batch_size: int = 2,
) -> float:
    gru = GRUClassifier(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout,
        lr=lr,
    )
    print_alocated_gpu_mem()

    print("LOADING VOCAB AND PREPARE DATA")
    vocab = data_frames.get_vocab(content_col_name)
    X_embedded, y_tenstor = gru.get_embedded(
        data_frames.train[content_col_name].values,
        data_frames.train[label_col_name].values,
        vocab,
    )
    print("LOADING VOCAB AND PREPARE DATA: FINISHED")
    print_alocated_gpu_mem()

    print("START TRAIN")
    # TRAIN
    start_time_gru = time.time()
    gru.forward(
        X_embedded,
        y_tenstor,
        epochs,
        batch_size,
    )
    end_time_gru = time.time()
    print("END TRAIN")
    del X_embedded, y_tenstor

    # TEST
    X_embedded_test, y_tenstor_test = gru.get_embedded(
        data_frames.test[content_col_name].values,
        data_frames.test[label_col_name].values,
        vocab,
    )
    predictions = gru.predict(X_embedded_test)
    gru.print_report(y_tenstor_test, predictions)

    # EVAL
    rand_sample = data_frames.evaluation.sample(1)
    X_embedded_eval, _ = gru.get_embedded(
        rand_sample[content_col_name].values,
        rand_sample[label_col_name].values,
        vocab,
    )
    pred_sample = gru.predict(X_embedded_eval)
    label_str_true: str = data_frames.label_to_str(pred_sample[0])
    label_str_predicted: str = data_frames.label_to_str(pred_sample[0])
    print(f"is:        {label_str_true}\npredicted: {label_str_predicted}")
    return end_time_gru - start_time_gru

In [7]:
# DATA
data_frames = DataFrames("train", "test", "evaluation")
data_frames_unique = data_frames.num_unique_words("content")
print(f"Unique words dataframes: {data_frames_unique}")
# data_frames.save_clean(token_limit=1000)

data_frames_clean = DataFrames("train_clean", "test_clean", "evaluation_clean")
data_frames_clean_unique = data_frames_clean.num_unique_words("content_clean")
print(f"Unique words dataframes clean: {data_frames_clean_unique}")

Unique words dataframes: 394396
Unique words dataframes clean: 1001


In [8]:
# TESTS WITH TIME
times: dict[str, float] = {}

In [9]:
torch.cuda.empty_cache()
gru_time_clean = run_gru(
    data_frames_clean,
    data_frames_clean.content_clean_col_name_truc,
    data_frames_clean.label_col_name,
    data_frames_clean_unique,
)
times["GRU Clean"] = gru_time_clean
torch.cuda.empty_cache()

for fname, ftime in times.items():
    print(f"{fname}: {ftime:.5f} seconds")

Memory allocated: 0.00 GB
Memory cached: 0.00 GB
LOADING VOCAB AND PREPARE DATA
LOADING VOCAB AND PREPARE DATA: FINISHED
Memory allocated: 0.03 GB
Memory cached: 0.49 GB
START TRAIN


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.32 GiB. GPU 0 has a total capacity of 7.92 GiB of which 3.93 GiB is free. Including non-PyTorch memory, this process has 3.43 GiB memory in use. Of the allocated memory 3.04 GiB is allocated by PyTorch, and 281.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)