# Custom Embeddings

In [27]:
import numpy as np
import pandas as pd
from pathlib import Path
import torch

In [None]:
basepath= '/content/drive/MyDrive/Colab_Notebooks/embeddings_NN'
folder= Path(basepath)
folder

You can find raw.csv here https://drive.google.com/file/d/131qhbc8qCnUf9wUwjy2HVqH9h5g5EC7C/view?usp=sharing

In [29]:
raw = folder / 'raw.csv'

In [30]:
raw_cs = pd.read_csv(raw,encoding='ISO-8859-1',
                      header=0,nrows=200000)
raw_fr = pd.DataFrame(raw_cs)
raw_fr.head()

Unnamed: 0.1,Unnamed: 0,Id,Title,Body
0,0,1,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...
1,3,4,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ..."
2,4,5,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...
3,7,8,How to fetch an XML feed using asp.net,<p>I've decided to convert a Windows Phone 7 a...
4,8,9,.NET library for generating javascript?,<p>Do you know of a .NET library for generatin...


In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
raw_csv = pd.read_csv(raw,encoding='ISO-8859-1',
                      names= ['1','2', 'Title', 'Body'],
                      usecols= ['Title', 'Body'],header=0,nrows=200000)
raw_df = pd.DataFrame(raw_csv)
raw_df.head()

Unnamed: 0,Title,Body
0,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...
1,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ..."
2,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...
3,How to fetch an XML feed using asp.net,<p>I've decided to convert a Windows Phone 7 a...
4,.NET library for generating javascript?,<p>Do you know of a .NET library for generatin...


In [33]:
raw_df.shape

(200000, 2)

In [34]:
raw_s= raw_df
raw_s.reset_index(drop= True, inplace= True)

In [35]:
raw_s.head()

Unnamed: 0,Title,Body
0,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...
1,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ..."
2,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...
3,How to fetch an XML feed using asp.net,<p>I've decided to convert a Windows Phone 7 a...
4,.NET library for generating javascript?,<p>Do you know of a .NET library for generatin...


In [36]:
from bs4 import BeautifulSoup
import random
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
import re
import spacy

In [37]:
# Import random function
import random

# Fix seed value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [38]:
def basic_clean(text):
        
        '''
        This fuction removes HTML tags from text
        '''
        if (bool(BeautifulSoup(text, "html.parser").find())==True):         
            soup = BeautifulSoup(text, "html.parser")
            text = soup.get_text()
        else:
            pass
        return re.sub(r'[\n\r]',' ', text) 
def basic_clean_array(X):
      '''
      This function removes Html Tags from each text in X array
      '''
      return [basic_clean(text) for text in X]


In [39]:
raw_s['Body'] = basic_clean_array(raw_s['Body'])
raw_s['Title'] = basic_clean_array(raw_s['Title'])
raw_s.head()

Unnamed: 0,Title,Body
0,How to check if an uploaded file is an image w...,I'd like to check if an uploaded file is an im...
1,How do I replace special characters in a URL?,"This is probably very simple, but I simply can..."
2,How to modify whois contact details?,function modify(.......) { $mcontact = file_...
3,How to fetch an XML feed using asp.net,I've decided to convert a Windows Phone 7 app ...
4,.NET library for generating javascript?,Do you know of a .NET library for generating j...


In [40]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):
    np.random.seed(0)
    def __init__(self, lammetize=True, lower=True, remove_stop=True, remove_punct=True, remove_num=False):
        self.remove_stop = remove_stop
        self.remove_punct = remove_punct
        self.remove_num = remove_num
        self. lammetize = lammetize
        self.lower = lower

    # helpfer functions for basic cleaning 
    def basic_clean(self,text):
        return [re.sub(r'[\n\r]',' ',sentence) for sentence in text]

    # helper function for pre-processing with spacy
    def spacy_preprocessor(self,texts): 
        nlp=spacy.load(model, disable=['parser','ner'])
        ## Add @ as a prefix so that we can separate the word from its token
        ## Since we are using pretrained vectors - @ mentions will be different in the pre-trained vocab
        
        prefixes = list(nlp.Defaults.prefixes)
        prefixes += ['@']
        prefix_regex = spacy.util.compile_prefix_regex(prefixes)
        nlp.tokenizer.prefix_search = prefix_regex.search
     
        matcher = Matcher(nlp.vocab)
        if self.remove_stop:
            matcher.add("stop_words", [[{"is_stop" : True}]])
        if self.remove_punct:
            matcher.add("punctuation",[ [{"is_punct": True}]])
        if self.remove_num:
            matcher.add("numbers", [[{"like_num": True}]])
        Token.set_extension('is_remove', default=False,force=True)
        cleaned_text=[]

        for doc in nlp.pipe(texts,batch_size=64,disable=['parser','ner']):
            matches = matcher(doc)
            for _, start, end in matches:
                for token in doc[start:end]:
                    token._.is_remove =True
                    
            if self.lammetize:
                text = ' '.join(token.lemma_ for token in doc if (token._.is_remove==False))
            else:
                text = ' '.join(token.text for token in doc if (token._.is_remove==False))
            if self.lower:
                text=text.lower()
            cleaned_text.append(text)
        return cleaned_text

    def fit(self, X,y=None):
        return self

    def transform(self, X,y=None):
        x_clean = self.basic_clean(X)
        x_clean_final = self.spacy_preprocessor(x_clean)
        return x_clean_final

In [41]:
# pre_processor = SpacyPreprocessor()
# raw_s['Body'] = pre_processor.fit_transform(np.array(raw_s['Body']))

In [42]:
!pip install --upgrade gensim




In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X = raw_s['Title']+" "+raw_s['Body']
#X_train, X_test = train_test_split(X, test_size = 8000, random_state=42)

In [45]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.base import BaseEstimator, TransformerMixin

import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [46]:
sentences = [text.split() for text in X]

In [47]:
model_raw_cbow = Word2Vec(sentences,epochs=10, vector_size=150, window=10,
                          min_count=5, workers = 8, sg=0)  #sg=0 means CBOW

save the CBOW embeddings

In [48]:
model_raw_cbow.wv.save('/content/drive/MyDrive/Colab_Notebooks/nlp/embeddings_NN/Word2Vec_models/model_raw_cbow.bin')

In [49]:
model_cbow_vectors = KeyedVectors.load('/content/drive/MyDrive/Colab_Notebooks/nlp/embeddings_NN/Word2Vec_models/model_raw_cbow.bin')

In [50]:
from torchtext.vocab import Vocab, vocab
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter, OrderedDict

In [None]:
X_train

95601     Full screen thumbnail view of images I want to...
89358     Access asp.net class using jquery Ajax I use J...
91058     Print all times, except the ones in the DB I n...
92102     Threading and lambda expressions What is the d...
56061     how to change the visual state of a WPF contro...
                                ...                        
119879    How to store multiple variables from a File In...
103694    Show splash screen until app is done loading M...
131932    Using CSS Pseudo Elements, insert content that...
146867    TTS stops when phone's screen timesout in Andr...
121958    How do I write this query in Django? Suppose I...
Length: 192000, dtype: object

In [51]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
Binarizer= MultiLabelBinarizer()

In [52]:
query_clean_csv = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/embeddings_NN/cleaned_query.csv', 
                              encoding='ISO-8859-1', names= ['1', 'Id','Title',
                                                             'Body','Tags',
                                                             'Tag_Number'],
                        usecols=['Id', 'Title','Body','Tags','Tag_Number'], header=0)
query_df = pd.DataFrame(query_clean_csv)

In [53]:
!pip install swifter

Collecting swifter
  Downloading swifter-1.0.9-py3-none-any.whl (14 kB)
Collecting psutil>=5.6.6
  Downloading psutil-5.8.0-cp37-cp37m-manylinux2010_x86_64.whl (296 kB)
[K     |████████████████████████████████| 296 kB 5.1 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 54.7 MB/s 
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: locket, partd, fsspec, psutil, swifter
  Attempting uninstall: psutil
    Found existing installation: psutil 5.4.8
    Uninstalling psutil-5.4.8:
      Successfully uninstalled psutil-5.4.8
Successfully installed fsspec-2021.11.1 locket-0.2.1 partd-1.2.0 psutil-5.8.0 swifter-1.0.9


In [54]:
import swifter
import ast
import re
query_df.fillna('', inplace=True)
query_df['Tag_Number'] =query_df['Tag_Number'].swifter.apply(lambda tags:ast.literal_eval(tags))

Pandas Apply:   0%|          | 0/47427 [00:00<?, ?it/s]

In [55]:
query_df

Unnamed: 0,Id,Title,Body,Tags,Tag_Number
0,3589945,asp query stre dropdown,webpage menu.aspx follow control relevance ...,c# asp.net,"[0, 9]"
1,5756415,run javascript code server java code,want run javascript code server want manipulat...,java javascript,"[1, 3]"
2,2358597,linq sql throw exception row find change,hi linq sql get error row find change update t...,c# asp.net,"[0, 9]"
3,4332882,run python script php server,run nginx web server php cgi like know possibl...,php python,"[2, 7]"
4,5922133,advice write window.resize function,be try write function resize css width element...,javascript jquery,"[3, 5]"
...,...,...,...,...,...
47422,1580289,take value edittext put decimal point,all- work app user enter bill cost tip rate kn...,java android,"[1, 4]"
47423,5010079,listen phone state application,nee liste phone state phone state listener .i ...,java android,"[1, 4]"
47424,2939535,android ui thread,thread task want access main thread runonuithr...,java android,"[1, 4]"
47425,5147598,dynamic table row creation html javascript,html table 1 row fill job detail position user...,asp.net javascript,"[9, 3]"


In [56]:

X = query_df['Title']+" "+query_df['Body']
y = query_df['Tag_Number']
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, y, test_size = 7427, random_state=42)
X_train , X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, test_size = 0.5, random_state=42)

In [57]:
y_train_bin = Binarizer.fit_transform(y_train)
y_valid_bin = Binarizer.transform(y_valid)
y_test_bin = Binarizer.transform(y_test)


In [None]:
#gensim.downloader.info()['models']
model_cbow_vectors.index_to_key

In [58]:
class GensimVectorizer(BaseEstimator,TransformerMixin):
  np.random.seed(0)
  def __init__(self,pretrained_vectors,unk_norm_init=False):
    # load in pre-trained word vectors
    self.pretrained_vectors= pretrained_vectors
    self.vec_size= self.pretrained_vectors.vector_size
    self.unk_norm_init = unk_norm_init
    self.pretrained_vectors_subset = {}
    self.words_not_in_pretrained = []
    self.count_missing = 0
    self.percent_missing = 0


  def fit(self, X,y=None):
    '''
    Gets the subset of pretrained vectors which are present in vocab
    X :  training sentences
    '''
    counter = Counter()

    for sentence in X:
        counter.update(sentence.split())
    for token in counter:
        try:
            self.pretrained_vectors_subset[token] = self.pretrained_vectors.get_vector(token, norm=True)
        except:
            self.words_not_in_pretrained.append(token)
    
    ### save so that you can access this after you fit the vectorizer
    self.count_missing = len(self.words_not_in_pretrained )
    self.percent_missing = self.count_missing / len(counter)
    return self
    
  def transform(self,X,y=None):
    X_vector = np.zeros((len(X), self.vec_size))
    
    for i, sentence in enumerate(X):
        sentence_vector= np.zeros(self.vec_size)
        n=0
        tokens = sentence.split()
        for word in tokens:
            if word in self.pretrained_vectors_subset.keys():
                word_vector=self.pretrained_vectors_subset[word]
                sentence_vector+= word_vector
                n+= 1
            else:
                if self.unk_norm_init :
                    word_vector = np.random.normal(size=  self.vec_size)
                    sentence_vector+= word_vector
                    n+= 1
        if n>0:
            X_vector[i] = sentence_vector/n
    return X_vector

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import  ClassifierChain
from sklearn.metrics import  fbeta_score

Creating sklearn pipeline and fitting train data:

In [61]:

logit_pipeline = Pipeline([
               ('vectorizer',GensimVectorizer(model_cbow_vectors)),
               ('logistic', ClassifierChain(LogisticRegression()))
                ])

In [62]:
logit_pipeline.fit(X_train, y_train_bin)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('vectorizer',
                 GensimVectorizer(pretrained_vectors=<gensim.models.keyedvectors.KeyedVectors object at 0x7faa48fdac90>)),
                ('logistic',
                 ClassifierChain(base_estimator=LogisticRegression()))])

In [63]:
train_predict = logit_pipeline.predict(X_train)
fbeta_score(y_train_bin,train_predict,beta=2,average=None,zero_division=0).mean()

0.5530651377783654

In [64]:
test_predict = logit_pipeline.predict(X_test)
fbeta_score(y_test_bin,test_predict,beta=2,average=None,zero_division=0).mean()

0.5473313925278738

In [65]:
logit_pipeline.score(X_test,y_test_bin)

0.686818365423455

# Pre-trained Embeddings

I am using pretrained embedddings (Google news) here because my custom embeddings did not give a good result.

In [None]:
gensim.downloader.info()['models']

In [None]:
#Download google news
google_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
google_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x7fe91d4f3ad0>

For each word in dataset’s vocabulary, we check if it is on google_vectors. If it there, we load its pre-trained word vector. Otherwise, we initialize a random vector.

In [None]:
# Create Vocab

counter = Counter()
for line in X_train:
   counter.update(str(line).split())

In [None]:
#Sorting the words based on their frequency and creating OrderedDict from it in descending order
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab_dict = vocab(counter, min_freq=4)

In [None]:
vocab_dict.insert_token('<unk>', 0)
vocab_dict.set_default_index(0)

In [None]:
len(vocab_dict)

21797

## COLLATE.FN for Dataloaders

In [None]:
# Creating a lambda function objects that will be used to get the indices of words from vocab
text_pipeline = lambda x: [vocab_dict[token] for token in str(x).split()]
label_pipeline = lambda x: x

In [None]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(np.array(label_list), dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list, label_list, offsets

## CREATE WEIGHT MATRIX OF PRE-TRAINED WEIGHTS

In [None]:
embedding_dim = 300
pretrained_weights = np.zeros((len(vocab_dict), embedding_dim))
words_found = 0
words_not_found = 0

for i, word in enumerate(vocab_dict.get_itos()):
    try: 
        pretrained_weights[i] = google_vectors.get_vector(word, norm=True)
        words_found += 1
    except KeyError:
        words_not_found  += 1
        pretrained_weights[i] = np.random.normal(size=(embedding_dim, ))

In [None]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.X[idx]
        text = np.array(text)
        labels = self.y[idx]
        labels = np.array(labels)
        sample = (labels,text)
        
        return sample

In [None]:
X_train.reset_index(drop=True,inplace=True)
X_valid.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
y_valid.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

In [None]:
X_train

0        get info user connect tether datum connect use...
1        change input upper case js    < script type="t...
2        receive firebug console response stuff ajax ex...
3        use .size work project help work follow code p...
4        youtube player play ff 7.0 run problem ff 7.0 ...
                               ...                        
19995    add active class main tab list part(2 okay nee...
19996    string recognize valid date time c try possibl...
19997    flip counter jquery android jquery display fli...
19998    work dom ajax function $ body').on('click .kat...
19999    jquery extend access superclass way access sup...
Length: 20000, dtype: object

In [None]:
# Creating a tensors of data
y_train_tensor = torch.tensor(y_train_bin).float()
y_valid_tensor = torch.tensor(y_valid_bin).float()
y_test_tensor = torch.tensor(y_test_bin).float()

In [None]:
trainset = CustomDataset(X_train,y_train_tensor)
validset = CustomDataset(X_valid,y_valid_tensor)
testset = CustomDataset(X_test,y_test_tensor)

## MLPCustom2 MODEL

In [None]:
import torch.nn as nn

In [None]:
# Define custom model using nn.Module()
class MLPCustom2(nn.Module):
  def __init__(self, vocab_size, h_sizes_list, output_dim, non_linearity, pretrained_weights, freezeWeights=True):
        
    super().__init__()

    self.h_sizes_list = h_sizes_list # h_sizes = [emb_dim, hidden_dim1,....hidden_dim2,....hidden_dimn] # n + 1 elements
    
    self.non_linearity = non_linearity
    self.output_dim = output_dim
    self.vocab_size = vocab_size
    self.pretrained_weights = pretrained_weights

    # Initialize hidden layers  

    self.hidden = nn.ModuleList()
    self.dropout = nn.ModuleList()
    self.batchnorm = nn.ModuleList()

    self.embedding = nn.EmbeddingBag(vocab_size, self.h_sizes_list[0]).from_pretrained(pretrained_weights,
                                                                               freeze = freezeWeights)

    for k in range(len(h_sizes_list)-1):
      self.hidden.append(nn.Linear(self.h_sizes_list[k], h_sizes_list[k+1]))
      
    self.output_layer = nn.Linear(self.h_sizes_list[-1], output_dim)
   
    
    ## it is better to use nn.functional.relu in the forward function
    # self.relu = nn.ReLU()

  def forward(self, input, offsets):
    x = self.embedding(input, offsets)
    for  k in range(len(self.h_sizes_list)-1):
      x =  self.non_linearity(self.hidden[k](x))

    x = self.output_layer(x)
    sigmoid = nn.Sigmoid()
    x = sigmoid(x)
    # we are not using softmax function in the forward passs
    # nn.crossentropy loss (which we will use to define our loss) combines  nn.LogSoftmax() and nn.NLLLoss() in one single class
    return x  

## TRAIN FUNCTION

In [None]:
def train(train_loader, model, optimizer, loss_function, log_batch, log_interval, grad_clipping, max_norm):

  # initilalize variables as global
  # these counts will be updated every epoch
  global example_ct_train
  global batch_ct_train

  # Training Loop 
  # Initialize train_loss at the he start of the epoch
  running_train_loss = 0
  running_train_f2_sum = 0
  
  # put the model in training mode
  model.train()

  # Iterate on batches from the dataset using train_loader
  for input, targets, offsets in train_loader:
    # move inputs and outputs to GPUs
    input = input.to(device)
    targets = targets.to(device)
    offsets = offsets.to(device)
    # Forward pass
    output = model(input, offsets)
    loss = loss_function(output.float(), targets.float())
    # Correct prediction
    y_pred = (output > 0.5).float()
    
    example_ct_train +=  len(targets)
    batch_ct_train += 1

    # set gradients to zero 
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Gradient Clipping
    if grad_clipping:
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm, norm_type=2)

    # Update parameters using their gradient
    optimizer.step()
          
    # Add train loss of a batch 
    running_train_loss += loss.item()

    # Add Corect counts of a batch
    running_train_f2_sum += fbeta_score(y_true=targets,y_pred=y_pred,beta=2,
                                        average=None,zero_division=0).mean()

    # log batch loss and accuracy
    if log_batch:
      if ((batch_ct_train + 1) % log_interval) == 0:
        wandb.log({f"Train Batch Loss  :": loss})
        wandb.log({f"Train Batch Acc :": running_train_f2_sum})

  
  # Calculate mean train loss for the whole dataset for a particular epoch
  train_loss = running_train_loss/len(train_loader)

  # Calculate f2 for the whole dataset for a particular epoch
  train_fbeta = running_train_f2_sum/len(train_loader)                                                                 

  return train_loss, train_fbeta

In [None]:
#Valid Functions

def valid(loader, model, optimizer, loss_function, log_batch, log_interval):

  """ 
  Function for training the model and plotting the graph for train & valid loss vs epoch.
  Input: iterator for train dataset, initial weights and bias, epochs, learning rate, batch size.
  Output: final weights, bias and train loss and valid loss for each epoch.
  """

  # initilalize variables as global
  # these counts will be updated every epoch
  global example_ct_valid
  global batch_ct_valid

  # Validation loop
  # Initialize train_loss at the he strat of the epoch
  running_valid_loss = 0
  running_valid_f2_sum = 0

  
  # put the model in evaluation mode
  model.eval()

  with torch.no_grad():
    for input,targets, offsets in loader:

      # move inputs and outputs to GPUs
      input = input.to(device)
      targets = targets.to(device)
      offsets = offsets.to(device)
      # Forward pass
      output = model(input, offsets)
      loss = loss_function(output.float(),targets.float())

      # Correct Predictions
      y_pred = (output > 0.5).float()

      # count of images and batches
      example_ct_valid +=  len(targets)
      batch_ct_valid += 1

      # Add valid loss of a batch 
      running_valid_loss += loss.item()

      # Add  fbeta score for each batch
      running_valid_f2_sum += fbeta_score(y_true= targets, y_pred=y_pred, beta=2, 
                                          average=None,zero_division=0).mean()

      # log batch loss and accuracy
      if log_batch:
        if ((batch_ct_valid + 1) % log_interval) == 0:
          wandb.log({f"Valid Batch Loss  :": loss})
          wandb.log({f"Valid Batch Accuracy :": running_valid_f2_sum})


    # Calculate mean valid loss for the whole dataset for a particular epoch
    valid_loss = running_valid_loss/len(valid_loader)

    # Calculate accuracy for the whole dataset for a particular epoch
    valid_f2_sum = running_valid_f2_sum/len(valid_loader)
    
  return valid_loss, valid_f2_sum

In [None]:
from datetime import datetime

In [None]:
# MODEL TRAINING LOOP

def train_loop(train_loader, valid_loader, model, loss_function, optimizer, epochs,
               device, patience, early_stopping,
               file_model):

  # Create lists to store train and valid loss at each epoch

  train_loss_history = []
  valid_loss_history = []
  train_f2_history = []
  valid_f2_history = []
  delta = 0
  best_score = None
  valid_loss_min = np.Inf
  counter_early_stop=0
  early_stop=False


  # Iterate for the given number of epochs
  for epoch in range(epochs):
    t0 = datetime.now()
    # Get train loss and fbeta_score for one epoch

    train_loss, train_f2_mean = train(train_loader, model, optimizer, loss_function, 
                                  wandb.config.log_batch, wandb.config.log_interval,
                                  wandb.config.grad_clipping, wandb.config.max_norm)
    valid_loss, valid_f2_mean = valid(valid_loader, model, optimizer, loss_function,
                                    wandb.config.log_batch, wandb.config.log_interval)

    dt = datetime.now() - t0

    # Save history of the Losses and accuracy
    train_loss_history.append(train_loss)
    train_f2_history.append(train_f2_mean)
    valid_loss_history.append(valid_loss)
    valid_f2_history.append(valid_f2_mean)

    if early_stopping:
      score = -valid_loss
      if best_score is None:
        best_score=score
        print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving Model...')
        torch.save(model.state_dict(), file_model)
        valid_loss_min = valid_loss

      elif score < best_score + delta:
        counter_early_stop += 1
        print(f'Early stoping counter: {counter_early_stop} out of {patience}')
        if counter_early_stop > patience:
          early_stop = True

      
      else:
        best_score = score
        print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
        torch.save(model.state_dict(), file_model)
        counter_early_stop=0
        valid_loss_min = valid_loss

      if early_stop:
        print('Early Stopping')
        break

    else:

      score = -valid_loss
      if best_score is None:
        best_score=score
        print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving Model...')
        torch.save(model.state_dict(), file_model)
        valid_loss_min = valid_loss

      elif score < best_score + delta:
        print(f'Validation loss has not decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Not Saving Model...')
      
      else:
        best_score = score
        print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
        torch.save(model.state_dict(), file_model)
        valid_loss_min = valid_loss


    # Log the train and valid loss to W&B
    wandb.log({f"Train epoch Loss :": train_loss, f"Valid epoch Loss :": valid_loss })
    wandb.log({f"Train epoch fbeta :": train_f2_mean, f"Valid epoch fbeta :": valid_f2_mean})


    # Print the train loss and accuracy for given number of epochs, batch size and number of samples
    print(f'Epoch : {epoch+1} / {epochs}')
    print(f'Time to complete {epoch+1} is {dt}')
    # print(f'Learning rate: {scheduler._last_lr[0]}')
    print(f'Train Loss: {train_loss : .4f} | Train fbeta: {train_f2_mean * 100 : .4f}%')
    print(f'Valid Loss: {valid_loss : .4f} | Valid fbeta: {valid_f2_mean * 100 : .4f}%')
    print()
    torch.cuda.empty_cache()

  return train_loss_history, train_f2_history, valid_loss_history, valid_f2_history


In [None]:
save_model_folder = Path('/content/drive/MyDrive/Colab_Notebooks/embeddings_NN/Saved_models')

In [None]:
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau, OneCycleLR, StepLR

In [None]:
# META DATA

hyperparameters = dict(
    
    h_sizes_list = [300] + [300] + [300], # 300 = embed_dim
    dprobs_list = [0],
    batchnorm_binary = False,
    vocab_size = len(vocab_dict),
    output_dim = 10,
    epochs = 50,
    batch_size = 128,
    learning_rate = 0.8,
    dataset="Multi_label_query",
    architecture="MLP",
    log_interval = 25,
    log_batch = True,
    file_model = save_model_folder/'Model_1_freeze.pt',
    grad_clipping = False,
    max_norm = 1,
    momentum = 0,
    patience = 10,
    early_stopping = True,
    scheduler_factor = 0,
    scheduler_patience = 0,
    weight_decay = 0
   )

# non_linearity = F.elu 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu
pretrained_weights_tensor = torch.tensor(pretrained_weights).float()

In [None]:
%%capture
!pip install wandb --upgrade
import wandb

# Login to W&B
wandb.login()





True

In [None]:
# INITIALIZE WANDB

wandb.init(name = 'task5', project = 'Dense_Embeddings', config = hyperparameters)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
# wandb.config.non_linearity = non_linearity
wandb.config.device = device
print(wandb.config.device )
wandb.config.non_linearity = non_linearity
print(wandb.config.non_linearity)

cpu
torch.nn.functional.relu


## Specify Dataloader, Loss_function, Model, Optimizer, Weight Initialization

In [None]:
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data Loader
train_loader = torch.utils.data.DataLoader(trainset, batch_size=wandb.config.batch_size, 
                                           shuffle = True, collate_fn=collate_batch, num_workers=2)
valid_loader = torch.utils.data.DataLoader(validset, batch_size=wandb.config.batch_size, 
                                           shuffle = False, collate_fn=collate_batch, num_workers=2)
test_loader = torch.utils.data.DataLoader(testset, batch_size=wandb.config.batch_size,   
                                         shuffle = False, collate_fn=collate_batch, num_workers=2)

# cross entropy loss function
loss_function = nn.BCELoss()

# model 
model_1 = MLPCustom2(wandb.config.vocab_size, wandb.config.h_sizes_list, 
           wandb.config.output_dim, non_linearity, pretrained_weights_tensor, 
           freezeWeights= True)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

  #if type(m) == nn.EmbeddingBag:
  #  torch.nn.init.normal_(m.weight, mean = 0, std = 1)

  

# apply init function recursibvely to all the modules
#model.apply(init_weights)

# put model to GPUs
model_1.to(device)
optimizer = torch.optim.SGD(model_1.parameters(), lr = wandb.config.learning_rate)

wandb.config.optimizer = optimizer

# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor= wandb.config.scheduler_factor, 
#                               patience=wandb.config.scheduler_patience, verbose=True)

# scheduler = StepLR(optimizer, gamma=0.4,step_size=1, verbose=True)

# TRAIN MODEL AND SAVE MODEL

In [None]:
wandb.watch(model_1, log = 'all', log_freq=25, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fe928653dd0>]

In [None]:
# Fix seed value

SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history,train_acc_history,valid_loss_history,valid_acc_history= train_loop(
    train_loader, valid_loader, model_1, loss_function, optimizer, 
    wandb.config.epochs, wandb.config.device,
    wandb.config.patience, wandb.config.early_stopping, wandb.config.file_model)

Validation loss has decreased (inf --> 0.377430). Saving Model...
Epoch : 1 / 50
Time to complete 1 is 0:00:11.962056
Train Loss:  0.4075 | Train fbeta:  16.2321%
Valid Loss:  0.3774 | Valid fbeta:  15.3553%

Validation loss has decreased (0.377430 --> 0.354054). Saving model...
Epoch : 2 / 50
Time to complete 2 is 0:00:12.490189
Train Loss:  0.3525 | Train fbeta:  17.2253%
Valid Loss:  0.3541 | Valid fbeta:  18.7969%

Validation loss has decreased (0.354054 --> 0.323037). Saving model...
Epoch : 3 / 50
Time to complete 3 is 0:00:12.108255
Train Loss:  0.3168 | Train fbeta:  25.1800%
Valid Loss:  0.3230 | Valid fbeta:  29.0231%

Validation loss has decreased (0.323037 --> 0.283107). Saving model...
Epoch : 4 / 50
Time to complete 4 is 0:00:12.585114
Train Loss:  0.2918 | Train fbeta:  35.0654%
Valid Loss:  0.2831 | Valid fbeta:  38.5949%

Validation loss has decreased (0.283107 --> 0.271828). Saving model...
Epoch : 5 / 50
Time to complete 5 is 0:00:12.072402
Train Loss:  0.2748 | Trai

## Get Prediction

In [None]:
# Evaluation Metric used = Fbeta measure where beta = 2.0

def get_fbeta_pred(data_loader, model):
  """ 
  Function to get predictions for a given test set and calculate fbeta.
  """
  with torch.no_grad():
    # Array to store predicted labels
    predictions = torch.Tensor()
    predictions = predictions.to(device)

    # Array to store actual labels
    y = torch.Tensor()
    y = y.to(device)
    model.eval()

    # Iterate over batches from test set
    for input, targets, offsets  in data_loader:
      # move inputs and outputs to GPUs
      input = input.to(device)
      targets = targets.to(device)
      offsets = offsets.to(device)
      # Forward pass
      output = model(input, offsets)
      
      # Convert probabilities into labels 0 or 1
      preds_batch = (output > 0.5).float()

      # Add the predicted labels in this batch to the predictions array
      predictions = torch.cat((predictions, preds_batch)) 

      # Add the actual labels to the y array
      y = torch.cat((y, targets)) 

    # Check for complete dataset if actual and predicted labels are same or not
    # Calculate accuracy
    test_f2_sum = fbeta_score(y_true= y, y_pred=predictions, beta=2, average=None,
                              zero_division=0).mean()

  # Return array containing predictions and accuracy
  return predictions, test_f2_sum

In [None]:
model_load1 = MLPCustom2(wandb.config.vocab_size, wandb.config.h_sizes_list, 
           wandb.config.output_dim, non_linearity, pretrained_weights_tensor, 
           freezeWeights= True)
model_load1.to(device)
model_load1.load_state_dict(torch.load(wandb.config.file_model))

<All keys matched successfully>

In [None]:
predictions1, fbeta_test1 = get_fbeta_pred(test_loader, model_load1)

In [None]:
fbeta_test1

0.5318230048284723

The fbeta score for the test set is 0.53.

In [None]:
hyperparameters_2 = dict(
    
    h_sizes_list = [300] + [300] + [300], # 300 = embed_dim
    dprobs_list = [0],
    batchnorm_binary = False,
    vocab_size = len(vocab_dict),
    output_dim = 10,
    epochs = 50,
    batch_size = 128,
    learning_rate = 0.8,
    dataset="Multi_label_query",
    architecture="MLP",
    log_interval = 25,
    log_batch = True,
    file_model = save_model_folder/'Model_2_No_freeze.pt',
    grad_clipping = False,
    max_norm = 0,
    momentum = 0,
    patience = 10,
    early_stopping = True,
    scheduler_factor = 0,
    scheduler_patience = 0,
    weight_decay = 0
   )

In [None]:
wandb.init(name = 'task6', project = 'Dense_Embeddings', config = hyperparameters_2)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Train Batch Acc :,▂▁▃▄▃▅▆▅▆▂▆▁▃▅▃▄▆▄▆▁▅▇▂▇▂▄▆▃▅▇▄▆▁▆█▃▇▂▄▆
Train Batch Loss :,█▇▅▅▆▅▆▅▄▄▄▃▄▅▄▃▃▄▃▃▄▃▂▂▃▃▂▂▂▂▁▄▃▁▁▁▁▁▁▂
Train epoch Loss :,█▆▆▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
Train epoch fbeta :,▁▁▂▄▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███████
Valid Batch Accuracy :,▂▂▃▆▄▆█▅▇▂▇▂▃▆▃▅▇▄▇▁▆▆▂▆▂▄▆▃▅█▄▇▁▆█▃█▂▄▇
Valid Batch Loss :,▇█▆▄▃▃▆▂▅▃▂▃▃▂▅▃▃▁▃▃▄▆▆▅▃▂▇▃▇▁▇▃▁▁▆▂▂▂▂▄
Valid epoch Loss :,▆▅▄▃▂▂▂▂▃▂▂▂▂▂▁▅▂▂▁▁▁█▄▄▂▂▁▄▂▄▁▇▄▃▁▄▂▂▃▁
Valid epoch fbeta :,▁▂▄▅▆▆▆▆▆▆▇▇▆▆▇▆▇▆▇▇▇▅▆▆▇▇▇▆▇▆█▆▇▇█▇▇███

0,1
Train Batch Acc :,95.67012
Train Batch Loss :,0.1338
Train epoch Loss :,0.15061
Train epoch fbeta :,0.61234
Valid Batch Accuracy :,82.45216
Valid Batch Loss :,0.33979
Valid epoch Loss :,0.24387
Valid epoch fbeta :,0.53122


In [None]:
wandb.config.device = device
print(wandb.config.device )
wandb.config.non_linearity = non_linearity
print(wandb.config.non_linearity)

cpu
torch.nn.functional.relu


In [None]:
# Fix seed value
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# model 
model_2 = MLPCustom2(wandb.config.vocab_size, wandb.config.h_sizes_list,  
           wandb.config.output_dim, non_linearity, pretrained_weights_tensor, 
           freezeWeights= False)

def init_weights(m):
  if type(m) == nn.Linear:
        torch.nn.init.kaiming_normal_(m.weight)
        torch.nn.init.zeros_(m.bias)

  #if type(m) == nn.EmbeddingBag:
  #  torch.nn.init.normal_(m.weight, mean = 0, std = 1)

  

# apply init function recursibvely to all the modules
#model.apply(init_weights)

# put model to GPUs
model_2.to(device)

optimizer = torch.optim.SGD(model_2.parameters(), lr = wandb.config.learning_rate)

wandb.config.optimizer = optimizer


In [None]:
wandb.watch(model_2, log = 'all', log_freq=25, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7fe8fb0e5e10>]

In [None]:
SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(
    train_loader, valid_loader, model_2, loss_function, optimizer, 
    wandb.config.epochs, wandb.config.device, wandb.config.patience, 
    wandb.config.early_stopping, wandb.config.file_model)

Validation loss has decreased (inf --> 0.376770). Saving Model...
Epoch : 1 / 50
Time to complete 1 is 0:00:19.080461
Train Loss:  0.4073 | Train fbeta:  16.2375%
Valid Loss:  0.3768 | Valid fbeta:  15.4196%

Validation loss has decreased (0.376770 --> 0.342089). Saving model...
Epoch : 2 / 50
Time to complete 2 is 0:00:19.023163
Train Loss:  0.3477 | Train fbeta:  17.8253%
Valid Loss:  0.3421 | Valid fbeta:  21.1210%

Validation loss has decreased (0.342089 --> 0.271919). Saving model...
Epoch : 3 / 50
Time to complete 3 is 0:00:18.633977
Train Loss:  0.2955 | Train fbeta:  27.8363%
Valid Loss:  0.2719 | Valid fbeta:  34.6749%

Validation loss has decreased (0.271919 --> 0.244072). Saving model...
Epoch : 4 / 50
Time to complete 4 is 0:00:19.476543
Train Loss:  0.2530 | Train fbeta:  40.6104%
Valid Loss:  0.2441 | Valid fbeta:  44.9099%

Validation loss has decreased (0.244072 --> 0.222872). Saving model...
Epoch : 5 / 50
Time to complete 5 is 0:00:18.528033
Train Loss:  0.2229 | Trai

In [None]:
model_load2 = MLPCustom2(wandb.config.vocab_size, wandb.config.h_sizes_list, 
           wandb.config.output_dim, non_linearity, pretrained_weights_tensor, 
           freezeWeights= False)
model_load2.to(device)
model_load2.load_state_dict(torch.load(wandb.config.file_model))

<All keys matched successfully>

In [None]:
predictions2, fbeta_test2 = get_fbeta_pred(test_loader, model_load2)
fbeta_test2

0.6855914121925709

Here the fbeta for test set has increased to 0.69.