<a href="https://colab.research.google.com/github/RachitBansal/RedditFlairDetector/blob/master/3_Modelling_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


**LSTM**

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
import torchtext

In [0]:
torch.backends.cudnn.deterministic = True  

In [0]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True, lower = True, )
LABEL = data.LabelField(dtype = torch.float,batch_first=True, lower = True)

In [0]:
def processURL(words):
    try:
      words = words.split('://')[1]
      words = words.split('/')
      return ' '.join([' '.join(word.split('.')) for word in words])
    except:
      return words

In [0]:
df = pd.read_csv('drive/My Drive/rMIDAS.csv')
# df['url'] = df['url'].apply(processURL)

In [0]:
# df.head()
df.to_csv('drive/My Drive/rMIDAS.csv', index = False)

In [30]:
df.head()

Unnamed: 0,created_utc,id,link_flair_text,num_comments,selftext,title,url
0,1558946000.0,btjiv5,Politics,3,,Remembering Jawaharlal Nehru on his death anni...,i redd it txph3eq1sp031 jpg
1,1501065000.0,6pn4gj,Non-Political,0,,[FRESH VIDEO] Armaan Malik - 'Barfani' (Babumo...,www youtube com watch?v=LzETSqumgCY
2,1574087000.0,dy3t9k,Non-Political,2,Today I had an appointment for my kid's passpo...,Whims and fancies of officers at PSK [NP],www reddit com r india comments dy3t9k whims_a...
3,1410478000.0,2g5gqn,Non-Political,1,,Rann Utsav - A festival in the deserts of Kutc...,www rannutsav net
4,1483609000.0,5m5j3f,Science/Technology,3,,CCMB signs MoUs with four startups,economictimes indiatimes com small-biz startup...


In [31]:
fields = [(None, None), (None, None), ('flair', LABEL), (None, None), (None, None), ('title', TEXT), (None, None)]
training_data=data.TabularDataset(path = 'drive/My Drive/rMIDAS.csv', format = 'csv', fields = fields, skip_header = True)

print(vars(training_data.examples[3]))

{'flair': 'non-political', 'title': ['rann', 'utsav', '-', 'a', 'festival', 'in', 'the', 'deserts', 'of', 'kutch', ',', 'gujarat', '.']}


In [32]:
for i in range(5):
  print(vars(training_data.examples[i]))

{'flair': 'politics', 'title': ['remembering', 'jawaharlal', 'nehru', 'on', 'his', 'death', 'anniversary']}
{'flair': 'non-political', 'title': ['[', 'fresh', 'video', ']', 'armaan', 'malik', '-', "'", 'barfani', "'", '(', 'babumoshai', 'bandookbaaz', ',', 'nawazuddin', 'siddiqui', ',', 'bidita', 'bag', ')']}
{'flair': 'non-political', 'title': ['whims', 'and', 'fancies', 'of', 'officers', 'at', 'psk', '[', 'np', ']']}
{'flair': 'non-political', 'title': ['rann', 'utsav', '-', 'a', 'festival', 'in', 'the', 'deserts', 'of', 'kutch', ',', 'gujarat', '.']}
{'flair': 'science/technology', 'title': ['ccmb', 'signs', 'mous', 'with', 'four', 'startups']}


In [0]:
train_data, valid_data = training_data.split(split_ratio=0.7)

In [0]:
vectors = torchtext.vocab.Vectors('wiki-news-300d-1M.vec', cache = './drive/My Drive/')

In [35]:
TEXT.build_vocab(train_data,min_freq=3, vectors = vectors)  
LABEL.build_vocab(train_data)

print("Size of topic vocab:",len(TEXT.vocab))

print("Size of flair vocab:",len(LABEL.vocab))

print(TEXT.vocab.freqs.most_common(11))  

print(LABEL.vocab.freqs.most_common(14))

print(TEXT.vocab.stoi)

Size of topic vocab: 40014
Size of flair vocab: 24
[(',', 103173), ('to', 90880), ('in', 89765), ('the', 82255), ('of', 69979), ('.', 64231), (':', 59067), ('india', 58084), ('-', 53017), ('?', 52648), ('a', 50147)]
[('non-political', 105894), ('politics', 91019), ('askindia', 32238), ('policy/economy', 18841), ('business/finance', 12816), ('science/technology', 11072), ('[r]eddiquette', 7303), ('coronavirus', 4841), ('sports', 4375), ('not in english.', 3272), ('entertainment', 3093), ('photography', 3027), ('food', 2361), ('not about india.', 1817)]


In [0]:
labels = len(LABEL.vocab)

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

b_sz = 128

train_loader, val_loader = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = b_sz,
    sort_key = lambda x: len(x.title),
    sort_within_batch=True,
    device = device)

In [0]:
import torch.nn as nn

class model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim = 12, n_layers = 2, bidir = True, dropout = 0.2):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidir,
                           dropout=dropout,
                           batch_first=True)
        
        self.dense = nn.Linear(hidden_dim * 2, output_dim)
        self.softmax = nn.Softmax()
        
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        dense_outputs=self.dense(hidden)
        outputs=self.softmax(dense_outputs)
        
        return outputs

In [0]:
vocab_size = len(TEXT.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = labels
dropout = 0.2

model_ = model(vocab_size, embedding_dim, num_hidden_nodes, num_output_nodes, dropout = dropout)

In [40]:
print(model_)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model_):,} trainable parameters')

pretrained_embeddings = TEXT.vocab.vectors
model_.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

model(
  (embedding): Embedding(40014, 300)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dense): Linear(in_features=64, out_features=24, bias=True)
  (softmax): Softmax(dim=None)
)
The model has 12,116,352 trainable parameters
torch.Size([40014, 300])


In [0]:
import torch.optim as optim

optimizer = optim.Adam(model_.parameters(), lr = 0.001)
criterion = nn.CrossEntropyLoss()

def accuracy(preds, y):
    counts = 0
    for i in range(preds.shape[0]):
      counts += (torch.max(preds[i], 0)[1] == y[i]).float()
      
    return counts/preds.shape[0]
    
model_ = model_.to(device)
criterion = criterion.to(device)

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()  
    
    for batch in iterator:
        
        optimizer.zero_grad()

        text, text_lengths = batch.title
        flair = batch.flair

        text = text.to(device)
        flair = flair.type(torch.LongTensor).to(device)

        predictions = model(text, text_lengths).squeeze()

        try:
          loss = criterion(predictions, flair)
          acc = accuracy(predictions, flair)
        except:
          continue

        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()  
        epoch_acc += acc.item()

    return epoch_loss / (len(iterator)-1), epoch_acc / (len(iterator)-1)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
        
            text, text_lengths = batch.title
            flair = batch.flair

            text = text.to(device)
            flair = flair.type(torch.LongTensor).to(device)
            
            predictions = model(text, text_lengths).squeeze()
            
            try:
              loss = criterion(predictions, flair)
              acc = accuracy(predictions, flair)
            except:
              continue
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / (len(iterator)-1), epoch_acc / (len(iterator)-1)

In [45]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    train_loss, train_acc = train(model_, train_loader, optimizer, criterion)
    
    valid_loss, valid_acc = evaluate(model_, val_loader, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc:.3f}')



	Train Loss: 2.599 | Train Acc: 0.650
	 Val. Loss: 2.681 | Val. Acc: 0.569
	Train Loss: 2.595 | Train Acc: 0.654
	 Val. Loss: 2.679 | Val. Acc: 0.571
	Train Loss: 2.589 | Train Acc: 0.659
	 Val. Loss: 2.675 | Val. Acc: 0.575
	Train Loss: 2.583 | Train Acc: 0.665
	 Val. Loss: 2.671 | Val. Acc: 0.579
	Train Loss: 2.578 | Train Acc: 0.670
	 Val. Loss: 2.671 | Val. Acc: 0.579
	Train Loss: 2.575 | Train Acc: 0.674
	 Val. Loss: 2.669 | Val. Acc: 0.581
	Train Loss: 2.571 | Train Acc: 0.678
	 Val. Loss: 2.669 | Val. Acc: 0.581
	Train Loss: 2.568 | Train Acc: 0.681
	 Val. Loss: 2.668 | Val. Acc: 0.582
	Train Loss: 2.565 | Train Acc: 0.684
	 Val. Loss: 2.668 | Val. Acc: 0.582
	Train Loss: 2.562 | Train Acc: 0.686
	 Val. Loss: 2.667 | Val. Acc: 0.583
	Train Loss: 2.560 | Train Acc: 0.688
	 Val. Loss: 2.667 | Val. Acc: 0.584
	Train Loss: 2.558 | Train Acc: 0.691
	 Val. Loss: 2.666 | Val. Acc: 0.584
	Train Loss: 2.556 | Train Acc: 0.693
	 Val. Loss: 2.666 | Val. Acc: 0.584
	Train Loss: 2.554 | Trai

In [0]:
!nvidia-smi

In [0]:
for batch in train_loader:
      optimizer.zero_grad()   
      
      text, text_lengths = batch.title   
      
      predictions = model_(text, text_lengths).squeeze()
      loss = criterion(predictions, batch.flair.type(torch.LongTensor))
      
      loss.backward()

## Transformer Based Models


In [1]:
!pip install transformers
!pip install seqeval
!pip install scipy
!pip install simpletransformers
!pip install pytorch-nlp
!pip install tensorboardx

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 30.0MB/s eta 0:00:01[K     |█▏                              | 20kB 823kB/s eta 0:00:01[K     |█▊                              | 30kB 1.2MB/s eta 0:00:01[K     |██▎                             | 40kB 1.6MB/s eta 0:00:01[K     |███                             | 51kB 1.0MB/s eta 0:00:01[K     |███▌                            | 61kB 1.2MB/s eta 0:00:01[K     |████                            | 71kB 1.4MB/s eta 0:00:01[K     |████▋                           | 81kB 1.6MB/s eta 0:00:01[K     |█████▎                          | 92kB 1.2MB/s eta 0:00:01[K     |█████▉                          | 102kB 1.4MB/s eta 0:00:01[K     |██████▍                         | 112kB 1.4MB/s eta 0:00:01[K     |███████                         | 122kB 1.4M

In [0]:
import torch
import torchnlp
from torchnlp.encoders import LabelEncoder
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count

In [0]:
df = pd.read_csv('./drive/My Drive/rMIDAS.csv')

In [0]:
encoder = LabelEncoder(df['link_flair_text'])

In [0]:
def func(x):
  z = np.zeros(24, dtype = int)
  z[x-1] = 1
  return tuple(z)

In [0]:
def prepare_data(df, flairs = 'link_flair_text', title = ['title'], drop = ['created_utc', 'id','link_flair_text','num_comments','selftext','title','url'], train = False):
  df_s = df
  df_s['text'] = ''
  for tit in title:
    df_s['text'] += df[tit]
  df_s['labels'] = pd.Series(encoder.batch_encode(list(df[flairs]))).apply(func)
  df_s = df_s.drop(drop, 'columns')
  train_df, eval_df = train_test_split(df_s, test_size=0.2)
  
  if(train):
    return train_df, eval_df
  else:
    return eval_df

In [0]:
eval_df = prepare_data(df)

In [0]:
train_df = pd.read_csv('drive/My Drive/rMIDASsupp.csv')

In [16]:
train_df.head(10)

Unnamed: 0,text,labels
0,Election Commission cancels notification for D...,"(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,BABAJI BHOOT,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Incredible India,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Indian Windows Scammer Prank - Ownage Pranks,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"40,000 cloth artisans in Ludhiana stare at job...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
5,Politics Saar,"(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,An interview with a Taliban Trained Suicide Bo...,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,"Chandrababu Naidu falls out of favour, undergo...","(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,"Kamaal R Khan objectifies women, Sonakshi Sinh...","(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,Is this what image we have in this damn world,"(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
train_df.shape

(334421, 2)

In [0]:
def toTuple(x):
  return tuple(map(int, x[1:-1].split(', ')))

In [0]:
train_df_x = train_df
train_df_x['labels'] = train_df['labels'].apply(toTuple)

In [0]:
args = {
    "output_dir": "outputs/",
    "cache_dir": "cache/",
    "best_model_dir": "outputs/best_model/",

    "fp16": False,
    "fp16_opt_level": "O1",
    "max_seq_length": 128,
    "train_batch_size": 128,
    "eval_batch_size": 128,
    "gradient_accumulation_steps": 1,
    "num_train_epochs": 2,
    "weight_decay": 0,
    "learning_rate": 4e-5,
    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,
    "do_lower_case": False,

    "logging_steps": 50,
    "evaluate_during_training": False,
    "evaluate_during_training_steps": 2000,
    "evaluate_during_training_verbose": False,
    "use_cached_eval_features": False,
    "save_eval_checkpoints": True,
    "no_cache": False,
    "save_model_every_epoch": True,
    "tensorboard_dir": None,

    "overwrite_output_dir": True,
    "reprocess_input_data": True,

    "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
    "n_gpu": 1,
    "silent": False,
    "use_multiprocessing": True,

    "wandb_project": None,
    "wandb_kwargs": {},

    "use_early_stopping": True,
    "early_stopping_patience": 3,
    "early_stopping_delta": 0,
    "early_stopping_metric": "eval_loss",
    "early_stopping_metric_minimize": True,

    "manual_seed": None,
    "encoding": None,
    "config": {},
}

In [0]:
eval_df = eval_df.reset_index()
eval_df = eval_df.drop('index', 'columns')

### DistilBERT

In [0]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('distilbert', 'distilbert-base-uncased', num_labels=24, args = args)

#['bert', 'xlnet', 'xlm', 'roberta', 'distilbert']

In [46]:
model.train_model(train_df, show_running_loss = True)

HBox(children=(IntProgress(value=0, max=334421), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=2613, style=ProgressStyle(description…

Running loss: 0.564763



Running loss: 0.103480


HBox(children=(IntProgress(value=0, description='Current iteration', max=2613, style=ProgressStyle(description…

Running loss: 0.098357



In [47]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

HBox(children=(IntProgress(value=0, max=89257), HTML(value='')))




HBox(children=(IntProgress(value=0, max=698), HTML(value='')))




In [50]:
print(result)

{'LRAP': 0.7272260123327972, 'eval_loss': 0.08534275728643453}


In [58]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

Business/Finance
SoftBank to invest $1.4 billion in India's Paytm in bet on demand for mobile money


In [0]:
torch.save(model, './drive/My Drive/distilbert/model_dbert.json')

- result: The evaluation result in the form of a dict. By default, only the Label ranking average precision (LRAP) is reported for multilabel classification.

- model_outputs: A list of model outputs for each item in the evaluation dataset. This is useful if you need probabilities for each class rather than a single prediction. Note that a sigmoid function has been applied to each output to squash the values between 0 and .

- wrong_predictions: A list of InputFeature of each incorrect prediction. The text may be obtained from the InputFeature.text_a attribute. (The InputFeature class can be found in the utils.py file in the repo)

In [0]:
%cp -r /content/outputs/checkpoint-5226-epoch-2 ./drive/My\ Drive/distilbert/

### RoBERTA

In [0]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('roberta', 'distilroberta-base', num_labels=24, args = args)

#['bert', 'xlnet', 'xlm', 'roberta', 'distilbert']

In [0]:
model.train_model(train_df, show_running_loss = True)

HBox(children=(IntProgress(value=0, max=334421), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=2613, style=ProgressStyle(description…

Running loss: 0.564763



Running loss: 0.103480


HBox(children=(IntProgress(value=0, description='Current iteration', max=2613, style=ProgressStyle(description…

Running loss: 0.098357



In [78]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

HBox(children=(IntProgress(value=0, max=89257), HTML(value='')))




HBox(children=(IntProgress(value=0, max=698), HTML(value='')))




In [79]:
print(result)

{'LRAP': 0.7800262406663525, 'eval_loss': 0.07405059098834466}


In [100]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

Sports
World C'ship Archery: Praveen Jadhav bags silver for India


In [0]:
torch.save(model, './drive/My Drive/distilbert/model_robbert.pkl')

In [0]:
%cp -r /content/outputs/checkpoint-2613-epoch-1 ./drive/My\ Drive/distilbert/

## ALBERT

In [0]:
from simpletransformers.classification import MultiLabelClassificationModel

model = MultiLabelClassificationModel('albert', 'albert-large-v1', num_labels=24, use_cuda=False ,args = args)

#['bert', 'xlnet', 'xlm', 'roberta', 'distilbert']

In [0]:
model.train_model(train_df, show_running_loss = True)

HBox(children=(IntProgress(value=0, max=334421), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Current iteration', max=2613, style=ProgressStyle(description…

In [0]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

HBox(children=(IntProgress(value=0, max=89257), HTML(value='')))




HBox(children=(IntProgress(value=0, max=698), HTML(value='')))




In [0]:
print(result)

{'LRAP': 0.7800262406663525, 'eval_loss': 0.07405059098834466}


In [0]:
i = np.random.randint(eval_df.shape[0])
print(encoder.decode(torch.tensor(model_outputs[i].argmax()+1)))
print(eval_df['text'].values[i])

Sports
World C'ship Archery: Praveen Jadhav bags silver for India


In [0]:
torch.save(model, './drive/My Drive/distilbert/model_robbert.pkl')

In [0]:
%cp -r /content/outputs/checkpoint-2613-epoch-1 ./drive/My\ Drive/distilbert/

## ALBERT