# 0. import dependencies

In [164]:
import torch             # pytorch framework for CNN
import numpy as np       # numpy for fast multidimensional array manipulation
import random            
import pandas as pd      # pandas to deal with datasets
import nltk              # natural language toolkit to preprocess the text
import re                # regular expression to preprocess text
import collections
import itertools
import json

## 0.1 set device and default tensor type
**set computing device to GPU if exists (change runtime type from colab)**

In [165]:
cuda = True
device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu")

**set default tensor type to cuda float tensor if availabe**

In [166]:
torch.set_default_tensor_type("torch.FloatTensor")
if device == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
print(device)

cpu


# 1. load data
## 1.1 create dataframe for the dataset using pandas

In [167]:
url = "https://raw.githubusercontent.com/SamirGouda/text_classifier/398183a3f714225e827afe0350550e751e6df33b/news.csv"
df = pd.read_csv(url, header=0)

## 1.2 shuffle data

In [168]:
df = df.sample(frac=1).reset_index(drop=True)
# examine the data
df.head()

Unnamed: 0,title,category
0,LEONARD FRUSTRATED FOR WILKINSON,Sports
1,BEST OF ECT NEWS Biometrics: ThinkPad and Beyond,Sci/Tech
2,Lighter-Hit Fla. Area Gets #36;21.5 From FEMA...,World
3,House Toughens Penalties on P2Ps,Sci/Tech
4,Humans may need fewer genes than thought,Sci/Tech


# 2. preprocess the data

## 2.1 import dependencies

In [169]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # not used yet
import re

### 2.1.1 download stopwords and create stopwords and stemmer instances

In [170]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
porter = PorterStemmer() # not used in current version
# check the first 5 stopwords
print(STOPWORDS[:5])

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SamirGouda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2.2 preprocess data

### 2.2.1 preprocess function
**removes stopwords, words in paranthesis, spaces**

In [171]:
def preprocess(text, stopwords=STOPWORDS):
    """ conditional preprocessing on the given text """
    # transform upper case letters to lower
    text = text.lower()
    # remove stopwords using regular expressions
    pattern = re.compile(r'\b(' + r'|'.join(stopwords)+ r')\b\s*')   # create pattern instance of stopwords
    text = pattern.sub('', text)                                     # replace stopwords with empty char
    # remove words in paranthesis
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)                  # removes punctuation
    text = re.sub('[^A-Za-z0-9]+', ' ', text)                        # remove non alphanumeric chars
    text = re.sub(' +', ' ', text)                                   # remove multiple spaces
    text =text.strip()                                               # remove leading and trailing spaces
    return text

### 2.2.2 apply preprocess func on dataframe

In [172]:
preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)
print(f"{df.title.values[0]}\n\n{preprocessed_df.title.values[0]}")

LEONARD FRUSTRATED FOR WILKINSON

leonard frustrated wilkinson


## 2.3 split data

### 2.3.1 splitting function

In [173]:
from sklearn.model_selection import train_test_split
def train_val_test_split(X, y, train_size):
    X_train, X_, y_train, y_= train_test_split(X, y, train_size= train_size, stratify= y) 
    X_val, X_test, y_val, y_test= train_test_split(X_, y_, train_size= 0.5, stratify= y_)
    return X_train, y_train, X_val, y_val, X_test, y_test 

### 2.3.2 split data

In [174]:
X = preprocessed_df['title'].values
y = preprocessed_df['category'].values
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split(X, y, train_size=0.9)

## 2.4 encode data

### 2.4.1 label encoder class

In [175]:
class LabelEncoder(object):
    def __init__(self, class_to_index = {}):
        self.class_to_index = class_to_index
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        
    def __len__(self):
        return len(self.class_to_index)
    
    def __str__(self):
        return "<Label Encoder (Num_Classes = {})>".format(len(self))
    
    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i 
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self
    
    def encode(self, y):
        encoded = np.zeros(len(y), dtype=int)
        for i, class_ in enumerate(y):
            encoded[i] = self.class_to_index[class_]
        return encoded
    
    def decode(self, y):
        classes = []
        for idx in y:
            classes.append(self.index_to_class[idx])
        return classes
    
    def save(self, fp):
        with open(fp, 'w') as fp:
            contents = {'class_to_index': self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)
            
    @classmethod
    def load(cls, fp):
        with open(fp, 'r') as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

### 2.4.2 encode labels

In [176]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
classes = label_encoder.classes
NUM_CLASSES = len(label_encoder)
print(classes)
# convert labels to tokens
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)

['Business', 'Sci/Tech', 'Sports', 'World']


In [177]:
# class weights
counts = np.bincount(y_train)
class_weights = {i: 1.0/ count for i, count in enumerate(counts)}
print(class_weights)

{0: 3.7037037037037037e-05, 1: 3.7037037037037037e-05, 2: 3.7037037037037037e-05, 3: 3.7037037037037037e-05}


## 2.5 Tokenize the features
### 2.5.1 tokenize class

In [178]:
from more_itertools import take # not used in this version
from collections import Counter

class Tokenizer(object):
    def __init__(self, char_level, num_tokens=None, pad_token='<PAD>', oov_token='<UNK>', token_to_index= None):
        self.char_level = char_level
        self.seperator = '' if self.char_level else ' '
        if num_tokens: num_tokens -=2
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token:0, oov_token:1}
        self.token_to_index = token_to_index
        self.index_to_token = {v:k for k, v in token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return "<Tokenizer (num_tokens= {})>".format(len(self))
  
    def fit_on_texts(self, texts):
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        self.min_token_freq = counts[-1][1]
        for token, count in counts:
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self,  texts):
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(" ")
            sequence = []
            for token in text:
                sequence.append(self.token_to_index.get(token, self.token_to_index[self.oov_token]))
            sequences.append(np.asarray(sequence))
        return sequences
  
    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            text = []
            for index in sequence:
                text.append(self.index_to_token.get(index, self.oov_token))
            texts.append(self.seperator.join([token for token in text]))
        return texts

    def save(self, fp):
        with open(fp, 'w') as fp:
            contents = {'char_level': self.char_level, 'oov_token': self.oov_token, 'token_to_index': self.token_to_index}
            json.dump(contents, fp, indent=2, sort_keys=False)
  
    @classmethod
    def load(cls, fp):
        with open(fp, 'r') as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

### 2.5.2 tokenize the text

In [179]:
tokenizer = Tokenizer(char_level=False, num_tokens=500)
tokenizer.fit_on_texts(texts=X_train)
# sample of token
print(take(5, tokenizer.token_to_index.items()))
print(f"least frequency tokens: {tokenizer.min_token_freq}") # to adjust num_tokens
# convert text to sequences of indices
X_train = tokenizer.texts_to_sequences(texts= X_train)
X_val = tokenizer.texts_to_sequences(texts= X_val)
X_test = tokenizer.texts_to_sequences(texts= X_test)
preprossed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print("text to indices\n"
      f"preprocess {preprossed_text}\n"
      f"tokenized {X_train[0]}"
)

[('<PAD>', 0), ('<UNK>', 1), ('39', 2), ('b', 3), ('gt', 4)]
least frequency tokens: 215
text to indices
preprocess astros 7 <UNK> 5
tokenized [420  98   1  76]


## 2.6 onehot encoding for tokens
### 2.6.1 onehot function

In [180]:
def to_categorical(sequence, num_classes):
    one_hot = np.zeros((len(sequence), num_classes))
    for i, value in enumerate(sequence):
        one_hot[i, value] = 1
    return one_hot

### 2.6.2 convert tokens to one hot encoding

In [181]:
vocab_size = len(tokenizer)
X_train = [to_categorical(seq, num_classes=vocab_size) for seq in X_train]
X_val = [to_categorical(seq, num_classes=vocab_size) for seq in X_val]
X_test = [to_categorical(seq, num_classes=vocab_size) for seq in X_test]

## 2.7 pad the features
**to have same length**
### 2.7.1 pad function

In [182]:
def pad_sequences(sequences, max_seq_len=0):
    max_seq_len = max(max_seq_len, max(len(sequence) for sequence in sequences))
    num_classes = sequences[0].shape[-1]
    padded_sequences = np.zeros((len(sequences), max_seq_len, num_classes))
    for i, sequence in enumerate(sequences):
        padded_sequences[i][:len(sequence)] = sequence
    return padded_sequences

In [183]:
print(X_train[0].shape, X_train[1].shape, X_train[2].shape)
print(pad_sequences(X_train).shape)

(4, 500) (6, 500) (5, 500)
(108000, 19, 500)


## 2.7 create dataset from dataframe
### 2.7.1 dataset class

In [184]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y, max_filter_size):
        self.X = X
        self.y = y
        self.max_filter_size = max_filter_size
    def __len__(self):
        return len(self.y)
  
    def __str__(self):
        return f"<Dataset(N={len(self)}>"

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return [X, y]

    def collate_fn(self, batch):
        """ processing on batch 
        X, y = [], []
        for _data, _label in batch:
          X.append(_data)
          y.append(_label)
        X = pad_sequences(X, max_seq_len=self.max_filter_size)
        return torch.FloatTensor(X), torch.LongTensor(y)

        """
        batch = np.array(batch, dtype=object)
        X = batch[:,0]
        y = np.stack(batch[:,1], axis=0)
        # pad sequences
        X = pad_sequences(X, max_seq_len=self.max_filter_size)

        # cast to tensors
        X = torch.FloatTensor(X.astype(np.int32))
        y = torch.LongTensor(y.astype(np.int32))
        return X, y
    

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        return torch.utils.data.DataLoader(dataset= self, batch_size=batch_size, shuffle=shuffle,
                                       drop_last=drop_last, pin_memory=True, collate_fn=self.collate_fn)

### 2.7.2 create dataset

In [185]:
FILTER_SIZE = 1 # unigram
train_dataset = Dataset(X_train, y_train, max_filter_size=FILTER_SIZE)
val_dataset = Dataset(X_val, y_val, max_filter_size=FILTER_SIZE)
test_dataset = Dataset(X_test, y_test, max_filter_size=FILTER_SIZE)

### 2.7.3 create dataloaders

In [186]:
BATCH_SIZE = 64
train_dataloader = train_dataset.create_dataloader(BATCH_SIZE)
val_dataloader = val_dataset.create_dataloader(BATCH_SIZE)
test_dataloader = test_dataset.create_dataloader(BATCH_SIZE)
batch_X, batch_y = next(iter(test_dataloader))
print(f"X: {batch_X.size()}\n"  f"y: {batch_y[0]}")

X: torch.Size([64, 9, 500])
y: 0


# 3. build cnn model

In [187]:
import torch.nn.functional as F
import torch.nn as nn

## 3.1 CNN Class

In [188]:
class CNN(nn.Module):
    def __init__(self, input_size, num_classes, filter_size, num_filters, hidden_dim, dropout_prop):
        super(CNN, self).__init__()
        # conv layers
        self.conv = nn.Conv1d(in_channels=input_size, out_channels=num_filters, kernel_size=filter_size, padding="same")
        self.bn = nn.BatchNorm1d(num_features=num_filters)
        # fully connected layers
        self.fc1 = nn.Linear(in_features=num_filters, out_features=hidden_dim)
        self.dropout = nn.Dropout(p=dropout_prop)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        
    def init_weights(self):
        nn.init.xavier_normal_(self.conv.weight, gain=1)
        nn.init.xavier_normal_(self.fc1.weight, gain=nn.init.calculate_gain('relu'))
        nn.init.xavier_normal_(self.fc2.weight, gain=1)
        
    def forward(self, inputs, channel_first=False, apply_softmax=False):
        X = inputs
        if not channel_first:
            X = torch.transpose(X, 2, 1)     # rearrange to have channel first (m, c, w, h)
        X = self.conv(X)
        X = F.max_pool1d(X, X.size(2)).squeeze(2)
        # fully connected layers
        X = self.fc1(X)
        X = self.dropout(X)
        X = self.fc2(X)
        if apply_softmax:
            X = F.softmax(X, dim=1)
        return X
    
            

### 3.1.1 initialize model

In [189]:
NUM_FILTERS = 50
HIDDEN_DIM = 100
DROPOUT_P = 0.1
filter_size = 3
num_filters = 32

model = CNN(vocab_size, NUM_CLASSES, filter_size, num_filters, HIDDEN_DIM, DROPOUT_P)
model = model.to(device)
print(model.named_parameters)

<bound method Module.named_parameters of CNN(
  (conv): Conv1d(500, 32, kernel_size=(3,), stride=(1,), padding=same)
  (bn): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=32, out_features=100, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)>


## 3.2 Trainer
### 3.2.1

In [190]:
class Trainer(object):
    def __init__(self, model, device):
        # set params
        self.model = model
        self.device = device

    def init_loss_fn(self, class_weight):
        weight = torch.Tensor(list(class_weight.values())).to(self.device)
        self.loss_fn = nn.CrossEntropyLoss(weight=weight)
 
    def init_optimizer(self, optimizer_type, learning_rate, lambda_):
        self.optimizer = optimizer_type(params=self.model.parameters(), lr=learning_rate, weight_decay=lambda_)

    def init_scheduler(self, mode, factor, patience):
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode=mode, factor=factor, patience=patience)


    def train_step(self, dataloader):
        # set model to train mode
        self.model.train()  
        loss = 0.0
        # iterate over train minibatches in dataloader
        for i, batch in enumerate(dataloader):
            # step
            batch = [item.to(self.device) for item in batch] # set device
            features, targets = batch[0], batch[-1]                     # features = inputs, labels = targets = outputs
            self.optimizer.zero_grad()                                    # reset gradients
            A = self.model.forward(features)                              # forwardprop
            J = self.loss_fn(A, targets)                                  # cost function
            J.backward()                                                  # backprop
            self.optimizer.step()                                         # update params

            # cumulative metrics
            loss += (J.detach().item()- loss)/ (i+1)

        return loss  

    def eval_step(self, dataloader):
        # set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []
        # iterate over val minibatches in dataloader
        for i, batch in enumerate(dataloader):
            # step
            batch = [item.to(self.device) for item in batch] # set device
            features, labels = batch[0], batch[-1]                        # features = inputs, labels = targets = outputs
            A = self.model.forward(features)                              # forwardprop
            J = self.loss_fn(A, labels).item()                            # cost function
            # cumulative metrics
            loss += (J- loss)/ (i+1)
            # store outputs
            y_prob = torch.sigmoid(A).cpu().detach().numpy()
            y_probs.extend(y_prob)
            y_trues.extend(labels.cpu().detach().numpy())
        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        # set model to eval mode
        self.model.eval()
        y_probs = []
        # iterate over val minibatches
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                features, labels = batch[0], batch[-1]                             # features = inputs, labels = targets = outputs
                A = self.model.forward(features, apply_softmax=True)               # forwardprop
                # store outputs
                y_probs.extend(A)
        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        best_val_loss = np.inf
        for epoch in range(num_epochs):
            # steps
            train_loss = self.train_step(dataloader= train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader= val_dataloader)
            self.scheduler.step(val_loss)

          # early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                patience_ = patience   # reset patience
            else:
                patience_ -= 1 
            if not patience_:
                print("early stopping!")
                break

          # logging
            print(
              f"epoch: {epoch},  "
              f"train_loss: {train_loss:.5f},  "
              f"val_loss: {val_loss:.5f},  "
              f"lr: {self.optimizer.param_groups[0]['lr']:.2E},  "
              f"patience: {patience_},  "
            )
        return best_model

### 3.2.2 instanitate trainer instance

In [191]:
from torch import optim
LEARNING_RATE = 1e-3
PATIENCE = 5
NUM_EPOCHS = 10
# initialize trainer
trainer = Trainer(model, device)
trainer.init_loss_fn(class_weight=class_weights)
trainer.init_optimizer(optim.Adam, LEARNING_RATE, lambda_=0)
trainer.init_scheduler(mode='min', factor=0.1, patience=3)

# 4. train model

In [192]:
best_model = trainer.train(NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)

epoch: 0,  train_loss: 0.83996,  val_loss: 0.78428,  lr: 1.00E-03,  patience: 5,  
epoch: 1,  train_loss: 0.74631,  val_loss: 0.78112,  lr: 1.00E-03,  patience: 5,  
epoch: 2,  train_loss: 0.72226,  val_loss: 0.78078,  lr: 1.00E-03,  patience: 5,  
epoch: 3,  train_loss: 0.70441,  val_loss: 0.78397,  lr: 1.00E-03,  patience: 4,  
epoch: 4,  train_loss: 0.68958,  val_loss: 0.78853,  lr: 1.00E-03,  patience: 3,  
epoch: 5,  train_loss: 0.67701,  val_loss: 0.79378,  lr: 1.00E-03,  patience: 2,  
epoch: 6,  train_loss: 0.66626,  val_loss: 0.79943,  lr: 1.00E-04,  patience: 1,  
early stopping!


# 5. evaluate model

In [193]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

def get_metrics(y_true, y_pred, classes):
    performance ={'overall':{}, 'class':{}}
    metrics= precision_recall_fscore_support(y_true, y_pred, average='weighted')
    performance['overall']['precision'] = metrics[0]
    performance['overall']['recall'] = metrics[1]
    performance['overall']['f1'] = metrics[2]
    performance['overall']['num_samples'] = np.float64(len(y_true))

    metrics= precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        performance['class'][classes[i]] = {'precision':metrics[0][i], 'recall':metrics[1][i], 'f1':metrics[2][i], 'num_samples': np.float64(metrics[3][i])}
    return performance

In [194]:
# evaluation
trainer_ = Trainer(best_model, device)
trainer_.init_loss_fn(class_weights)
trainer_.init_optimizer(optim.Adam, learning_rate=LEARNING_RATE, lambda_=0)
trainer_.init_scheduler(mode= 'min', factor=0.1, patience=PATIENCE)

test_loss, y_true, y_prob = trainer_.eval_step(test_dataloader)
y_pred = np.argmax(y_prob, axis=1)
performance = get_metrics(y_test, y_pred, classes= classes)
print(json.dumps(performance, indent= 4))

{
    "overall": {
        "precision": 0.7049331710012828,
        "recall": 0.6896666666666667,
        "f1": 0.68882045165279,
        "num_samples": 6000.0
    },
    "class": {
        "Business": {
            "precision": 0.7178487918939984,
            "recall": 0.614,
            "f1": 0.6618756737333813,
            "num_samples": 1500.0
        },
        "Sci/Tech": {
            "precision": 0.7303727200634417,
            "recall": 0.614,
            "f1": 0.6671495834842448,
            "num_samples": 1500.0
        },
        "Sports": {
            "precision": 0.5946579194001874,
            "recall": 0.846,
            "f1": 0.6984039625756743,
            "num_samples": 1500.0
        },
        "World": {
            "precision": 0.7768532526475038,
            "recall": 0.6846666666666666,
            "f1": 0.7278525868178596,
            "num_samples": 1500.0
        }
    }
}


# 6. test sample

In [195]:
def get_propability_distribution(y_prob, classes):
    results ={}
    for i, class_ in enumerate(classes):
        results[class_] = np.float64(y_prob[0][i])
    sorted_results = {k:v for k, v in sorted(results.items(), key=lambda item: item[1], reverse=True)}
    return sorted_results

In [196]:
# inference on test sample
test_text = "What a day for the new york stock market to go bust!"
sequences = tokenizer.texts_to_sequences([preprocess(test_text)])
print(tokenizer.sequences_to_texts(sequences))
X_infer = [to_categorical(sequence, len(tokenizer)) for sequence in sequences]
y_infer = label_encoder.encode([label_encoder.classes[0]]* len(X_infer))
infer_dataset = Dataset(X_infer, y_infer, max_filter_size= FILTER_SIZE)
infer_dataloader = infer_dataset.create_dataloader(64)

['day new <UNK> stock market go <UNK>']


In [197]:
# prediction
y_infer_prob = trainer_.predict_step(infer_dataloader)
y_infer_pred = np.argmax(y_prob, axis=1)

In [198]:
# class distribution
prob_dist = get_propability_distribution(y_infer_prob, classes)
print(json.dumps(prob_dist, indent=2))

{
  "Business": 0.9245333075523376,
  "Sci/Tech": 0.06664972007274628,
  "World": 0.008696649223566055,
  "Sports": 0.00012020469148410484
}
