In [1]:
# Colab-only: mount Google Drive at /content/drive so files stored there
# (the dataset path used later lives under /content/drive/MyDrive) are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.optim import Adam
import re
import spacy

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (pandas, torch,
# spaCy) are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import logging
# Configure the root logger at DEBUG level; records from third-party loggers
# (e.g. numexpr) are emitted through the same handler.
logging.basicConfig(level=logging.DEBUG)

from tqdm import tqdm
import os
from utils import ModelJob, Attention

# Pick the compute device once: CUDA when it is available, otherwise the CPU,
# and announce which one was selected.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("running on gpu!!!" if device.type == "cuda" else "cpu :(")

running on gpu!!!
running on gpu!!!


In [3]:
class AgNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns news rows into lists of vocabulary indices.

    The input frame must provide (case-insensitively named) ``title``,
    ``description`` and ``class index`` columns. Title and description are
    concatenated, cleaned and tokenised; every token is then replaced by its
    vocabulary index.

    Index layout:
        * ``0``               – reserved for padding (never produced here),
        * ``1 .. vocab_size-1`` – in-vocabulary tokens, most frequent first,
        * ``vocab_size``      – every out-of-vocabulary token.

    Args:
        df: source ``pandas.DataFrame``; it is copied, the caller's frame is
            left untouched.
        vocab_size: largest index emitted; also the index used for unknown tokens.
        min_frequency: minimum corpus count for a token to enter the vocabulary.
        mode: ``"train"`` builds the vocabulary from ``df``; any other value
            re-uses the mapping passed as ``vocab``.
        vocab: token -> index mapping, required when ``mode`` is not ``"train"``.

    Raises:
        ValueError: if ``mode`` is not ``"train"`` and ``vocab`` is ``None``.
    """

    def __init__(self, df,
                 vocab_size, min_frequency,
                 mode="train", vocab=None):
        super().__init__()
        logging.info("reading dataframe")
        # Private copy: preprocessing adds/renames columns and must not leak
        # into the frame owned by the caller.
        self.df = df.copy()
        self.nlp = spacy.load(name="en_core_web_sm")
        self.vocab_size = vocab_size
        self.min_frequency = min_frequency
        logging.info(f"mode: {mode}")
        if mode == "train":
            logging.info("preprocessing dataframe")
            self.preprocess_df()
            logging.info("creating vocabulary")
            self.vocab = self.build_vocab()
        else:
            if vocab is None:
                raise ValueError("vocab must be provided when mode is not 'train'")
            self.vocab = vocab
            logging.info("preprocessing dataframe")
            self.preprocess_df()
        logging.info("converting tokens to index")
        oov_index = self.vocab_size
        self.df["text_idx"] = self.df["processed"].apply(
            lambda tokens: [self.vocab.get(token, oov_index) for token in tokens]
        )
        self.df["len"] = self.df["text_idx"].apply(len)

    def preprocess_df(self):
        """Lower-case the column names and add the ``text`` and ``processed`` columns."""
        self.df.columns = [column.lower() for column in self.df.columns]
        self.df["text"] = (self.df["title"] + " " + self.df["description"]).str.lower()
        self.df["processed"] = self.df["text"].apply(self.preprocess)

    def preprocess(self, text):
        """Clean one document and return its non-stop-word lemmas as a list of strings."""
        text = text.lower()
        # Keep only ASCII letters, digits and whitespace, then collapse whitespace runs.
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        text = " ".join(text.split())
        # NOTE(review): the tagger and attribute_ruler are disabled here, so the
        # lemmatizer may have no POS information to work with — confirm that
        # ``lemma_`` is more than the lower-cased token for this pipeline.
        doc = self.nlp(text, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "ner"])
        return [token.lemma_ for token in doc if not token.is_stop]

    def build_vocab(self):
        """Build the token -> index mapping from the ``processed`` column.

        Tokens seen fewer than ``min_frequency`` times are dropped. The
        remaining tokens are ranked by descending count (ties keep first-seen
        order) and numbered from 1. At most ``vocab_size - 1`` tokens are kept
        so that index ``vocab_size`` stays free for out-of-vocabulary tokens
        instead of colliding with the last vocabulary entry.
        """
        counts = {}
        for tokens in self.df["processed"]:
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
        frequent = [(token, count) for token, count in counts.items()
                    if count >= self.min_frequency]
        # sorted() is stable, so equally frequent tokens keep insertion order.
        frequent.sort(key=lambda pair: pair[1], reverse=True)
        kept = frequent[:max(self.vocab_size - 1, 0)]
        return {token: position + 1 for position, (token, _) in enumerate(kept)}

    def __len__(self):
        """Number of rows in the dataset."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``idx``-th sample (positional) as a dict.

        Keys: ``"X"`` (list of token indices), ``"lengths"`` (token count) and
        ``"y"`` (0-dim tensor holding the zero-based class id, i.e. the
        ``class index`` column minus one).
        """
        row = self.df.iloc[idx]  # single positional lookup instead of three
        return {"X": row["text_idx"],
                "lengths": row["len"],
                "y": torch.tensor(row["class index"] - 1)}

def collate_fn(batch, target_device=None):
    """Collate dataset samples into a padded batch.

    Args:
        batch: list of dicts with keys ``"X"`` (sequence of token indices),
            ``"lengths"`` (number of tokens) and ``"y"`` (class id, int or
            0-dim tensor).
        target_device: device for the padded inputs and labels. Defaults to
            the module-level ``device`` when omitted, so
            ``DataLoader(collate_fn=collate_fn)`` keeps working unchanged.

    Returns:
        Tuple ``(X, lengths, y)``: ``X`` is a ``(batch, max_len)`` long tensor
        right-padded with 0, ``lengths`` is a long tensor left on the CPU (it
        is consumed by ``pack_padded_sequence``), ``y`` is a ``(batch,)``
        label tensor. ``X`` and ``y`` are moved to ``target_device``.
    """
    if target_device is None:
        target_device = device
    # Explicit dtype: an empty token list would otherwise become a float tensor.
    sequences = [torch.as_tensor(row["X"], dtype=torch.long) for row in batch]
    lengths = torch.as_tensor([int(row["lengths"]) for row in batch], dtype=torch.long)
    # as_tensor/stack avoid re-wrapping labels that are already tensors.
    labels = torch.stack([torch.as_tensor(row["y"]) for row in batch])
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded.to(target_device), lengths, labels.to(target_device)
    
class NewsClassifierModel(nn.Module):
    """Bidirectional-LSTM text classifier.

    Pipeline: embedding -> bidirectional LSTM -> dropout on the concatenated
    final hidden states of both directions -> ReLU dense layer -> class logits.

    Args:
        vocab_size: largest token index; the embedding table has
            ``vocab_size + 1`` rows because index 0 is the padding row.
        embedding_dim: width of the token embeddings (and LSTM input size).
        hidden_size: LSTM hidden size per direction.
        fc_units: width of the intermediate dense layer.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, fc_units, num_classes):
        super().__init__()
        # Use the constructor arguments rather than fixed sizes so the
        # embedding width always matches the LSTM input size.
        self.embedding = nn.Embedding(num_embeddings=vocab_size + 1,
                                      embedding_dim=embedding_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(input_size=embedding_dim,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(in_features=2 * hidden_size,
                            out_features=fc_units)
        self.out = nn.Linear(in_features=fc_units,
                             out_features=num_classes)
        self.num_classes = num_classes

    def forward(self, x):
        """Compute class logits.

        Args:
            x: pair ``(sequence, lengths)`` where ``sequence`` is a
                ``(batch, max_len)`` tensor of token indices and ``lengths``
                holds the unpadded length of every row.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        sequence, lengths = x
        emb = self.embedding(sequence)
        # pack_padded_sequence needs CPU lengths that are all >= 1; a row with
        # no tokens is treated as a single padding step instead of crashing.
        lengths = torch.as_tensor(lengths).cpu().clamp(min=1)
        emb_packed = pack_padded_sequence(emb, lengths=lengths,
                                          batch_first=True, enforce_sorted=False)
        _, (h_t, _) = self.rnn(emb_packed)
        # Last two entries of h_t: final forward and backward hidden states.
        hidden_states = torch.cat((h_t[-2], h_t[-1]), dim=1)
        hidden_states_dp = self.dropout(hidden_states)
        fc_out = F.relu(self.fc(hidden_states_dp))
        return self.out(fc_out)

In [4]:
if __name__ == "__main__":
    # ---- Run configuration: values that must agree across dataset, loaders and model ----
    SEED = 9
    N_SAMPLES = 50000        # rows drawn from train.csv to keep the run short
    VOCAB_SIZE = 1000        # shared by both datasets and the embedding table
    MIN_FREQUENCY = 25
    BATCH_SIZE = 256
    EMBEDDING_DIM = 100
    HIDDEN_SIZE = 128
    FC_UNITS = 256
    NUM_CLASSES = 4
    N_EPOCHS = 5
    LEARNING_RATE = 0.001

    # Seed torch so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(SEED)

    data_path = "/content/drive/MyDrive/Colab Notebooks/ag_news"
    df = pd.read_csv(os.path.join(data_path, "train.csv"))
    # Never request more rows than the file holds.
    df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)
    logging.info("Read Dataframe")

    from sklearn.model_selection import train_test_split
    # Stratify on the label so both splits keep the class balance.
    df_train, df_test = train_test_split(df, stratify=df["Class Index"], random_state=SEED)
    logging.info("Train Test Split")

    logging.info("Creating Datasets")
    train_ds = AgNewsDataset(df=df_train,
                             vocab_size=VOCAB_SIZE,
                             min_frequency=MIN_FREQUENCY,
                             mode="train", vocab=None)
    # The held-out split re-uses the vocabulary learned on the training split.
    test_ds = AgNewsDataset(df=df_test,
                            vocab_size=VOCAB_SIZE,
                            min_frequency=MIN_FREQUENCY,
                            mode="test", vocab=train_ds.vocab)
    logging.info(f"Dataset lengths:: train: {len(train_ds)}, test: {len(test_ds)}")

    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    # Evaluation does not need shuffling; a fixed order keeps runs comparable.
    test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

    model = NewsClassifierModel(vocab_size=VOCAB_SIZE,
                                embedding_dim=EMBEDDING_DIM,
                                hidden_size=HIDDEN_SIZE,
                                fc_units=FC_UNITS,
                                num_classes=NUM_CLASSES)
    model = model.to(device)
    loss_func = nn.CrossEntropyLoss()
    optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE)

    # ModelJob comes from the project-local ``utils`` module and drives the
    # train/test phases for the requested number of epochs.
    model_run = ModelJob(model=model,
                         dataloaders={"train": train_dl, "test": test_dl},
                         criterion=loss_func,
                         optimizer=optimizer,
                         n_epochs=N_EPOCHS,
                         phases=["train", "test"],
                         )
    logging.info("Started Training")
    model_run.train_step()
    

INFO:root:Read Dataframe
INFO:root:Train Test Split
INFO:root:Creating Datasets
INFO:root:reading dataframe
INFO:root:mode: train
INFO:root:preprocessing dataframe
INFO:numexpr.utils:NumExpr defaulting to 2 threads.
INFO:root:creating vocabulary
INFO:root:converting tokens to index
INFO:root:reading dataframe
INFO:root:mode: test
INFO:root:preprocessing dataframe
INFO:root:converting tokens to index
INFO:root:Dataset lengths:: train: 37500, test: 12500
INFO:root:Started Training


EPOCH: 1 out of 5
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: train : LOSS: 0.8681720495223999 : ACCURACY: 0.63991779088974
|||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: test : LOSS: 0.5587812066078186 : ACCURACY: 0.8093162775039673
EPOCH: 2 out of 5
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: train : LOSS: 0.4529595971107483 : ACCURACY: 0.839598536491394
|||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: test : LOSS: 0.44855937361717224 : ACCURACY: 0.8596572875976562
EPOCH: 3 out of 5
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
	MODE: train : LOSS: 0.3783169388771057 : ACCURACY: 0.8670237064361572
|||||||||||||||||||||||||||||||||||||||||||||||||
	