In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
import logging
logging.basicConfig(level=logging.ERROR)
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu") 


There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [3]:
data = pd.read_csv("/content/drive/MyDrive/dataEM/ncert_updated.csv")


In [4]:
data[:6]

Unnamed: 0.1,Unnamed: 0,taxonomy,text
0,0,class/9/Science/motion,ffffforceorceorceorceorce andandandandand ...
1,1,class/9/Science/life processes,wwwwworkorkorkorkork andandandandand e e e...
2,2,class/9/Science/gravitation,"in chapters 8 and 9, we have learnt about the ..."
3,3,class/9/Science/motion,"in everyday life, we see some objects at rest ..."
4,4,class/9/Science/water,atter ininininin o o o o oururururur s s s...
5,5,class/9/Science/,iversity ininininin l l l l living iving o...


In [5]:
data["taxonomy"].values

array(['class/9/Science/motion', 'class/9/Science/life processes',
       'class/9/Science/gravitation', 'class/9/Science/motion',
       'class/9/Science/water', 'class/9/Science/', 'class/9/Science/',
       'class/9/Science/water', 'class/9/Science/water',
       'class/9/Science/structure of the atom', 'class/9/Science/tissues',
       'class/9/Science/water', 'class/9/Science/', 'class/9/Science/',
       'class/9/Science/sound', 'class/8/science/sound',
       'class/8/science/friction', 'class/8/science/force and pressure',
       'class/8/science/materials : metals and non-metals',
       'class/8/science/combustion and flame',
       'class/8/science/synthetic fibres and plastics',
       'class/8/science/microorganisms : friend and foe',
       'class/8/science/',
       'class/8/science/crop production and management',
       'class/8/science/reaching the age of adolescence',
       'class/8/science/gravitation',
       'class/8/science/pollution of air and water',
       'c

In [6]:
import numpy as np
def clean_taxonomy(data):
    """Add split and '>>'-joined versions of the ``taxonomy`` column.

    Each ``taxonomy`` string is split on ``/``; the resulting list is stored
    in a ``split_taxonomies`` column that is written onto ``data`` itself.
    The same parts re-joined with ``>>`` go into a ``cleaned_taxonomy``
    column of the returned frame.

    Args:
        data: DataFrame with a string ``taxonomy`` column. It is modified in
            place (gains ``split_taxonomies``).

    Returns:
        A new DataFrame with every column of ``data`` plus
        ``cleaned_taxonomy``.
    """
    # Written onto the caller's frame, so the input keeps this column too.
    data["split_taxonomies"] = data["taxonomy"].apply(lambda path: path.split("/"))
    joined_paths = ['>>'.join(parts) for parts in data["split_taxonomies"].values]
    return data.assign(cleaned_taxonomy=joined_paths)
    



In [7]:
data  = clean_taxonomy(data)

In [None]:
!pip install transformers==2.8.0

In [None]:
!pip install tensorflow==1.13.1
! pip install tensorflow-hub==0.7.0

In [10]:
import re
def clean_sentence(question):
  """Strip markup remnants and invisible characters from one text string.

  The steps run in a fixed order, and the order matters: trailing whitespace
  is trimmed before the later removals, so a trailing space uncovered by a
  later step is kept.

  Args:
    question: the raw text (must be a ``str``).

  Returns:
    The cleaned text.
  """
  # Tags become a single space, then runs of spaces are collapsed.
  question = re.sub('<[^>]*>', ' ', question)
  question = re.sub(' +', ' ', question)
  question = question.replace('\xa0', '').rstrip()

  # Plain substrings that are simply dropped. Note 'nan' is removed wherever
  # it occurs, including inside longer words.
  for fragment in ('nan', u'\u2004', u'\u2009', '&nbsp', '&ndash', '\r', '\t'):
    question = question.replace(fragment, '')

  # Newlines turn into spaces first, so the MathType pattern below (whose
  # '.' does not cross '\n') removes everything up to the end of the string.
  question = question.replace('\n', ' ')
  question = re.sub('MathType@.*', '', question)

  for fragment in ('&thinsp', '&times', '\u200b', '&rarr;;;'):
    question = question.replace(fragment, '')

  return question

In [11]:
data["text"] = data["text"].apply(lambda x: clean_sentence(x))
data

Unnamed: 0.1,Unnamed: 0,taxonomy,text,split_taxonomies,cleaned_taxonomy
0,0,class/9/Science/motion,ffffforceorceorceorceorce andandandandand l l ...,"[class, 9, Science, motion]",class>>9>>Science>>motion
1,1,class/9/Science/life processes,wwwwworkorkorkorkork andandandandand e e e e e...,"[class, 9, Science, life processes]",class>>9>>Science>>life processes
2,2,class/9/Science/gravitation,"in chapters 8 and 9, we have learnt about the ...","[class, 9, Science, gravitation]",class>>9>>Science>>gravitation
3,3,class/9/Science/motion,"in everyday life, we see some objects at rest ...","[class, 9, Science, motion]",class>>9>>Science>>motion
4,4,class/9/Science/water,atter ininininin o o o o oururururur s s s s s...,"[class, 9, Science, water]",class>>9>>Science>>water
...,...,...,...,...,...
81,81,"class/10/Science/acids, bases and salts","acids, bases and salts you have learnt in your...","[class, 10, Science, acids, bases and salts]","class>>10>>Science>>acids, bases and salts"
82,82,class/10/Science/chemical reactions and equations,“facts are not science — as the dictionary is ...,"[class, 10, Science, chemical reactions and eq...",class>>10>>Science>>chemical reactions and equ...
83,83,class/10/Science/periodic classification of el...,periodic classification of elements in class i...,"[class, 10, Science, periodic classification o...",class>>10>>Science>>periodic classification of...
84,84,class/10/Science/,"carbon and its in the last chapter, we came to...","[class, 10, Science, ]",class>>10>>Science>>


In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [13]:
import tensorflow_hub as hub
import tensorflow as tf
class UseSentenceEmbedding:
    """Holds a TF1 graph and session that embed text with a TF-Hub module.

    Attributes set by ``__init__``:
        text_input: string placeholder the sentences are fed into.
        embedded_text: output tensor of the hub module applied to
            ``text_input``.
        session: the ``tf.Session`` used to evaluate ``embedded_text``.
    """

    def __init__(self):
        """Build the embedding graph on the CPU and initialise a session."""
        with tf.device('/CPU:0'):
            # 1-D placeholder of strings; its length is left open.
            self.text_input = tf.placeholder(dtype=tf.string, shape=[None])

            # The encoder is referenced by its TF-Hub URL.
            use_module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
            self.embedded_text = use_module(self.text_input)
            # Variables and lookup tables are initialised together.
            initializer = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])

        session_config = tf.ConfigProto( allow_soft_placement=True)
        self.session = tf.Session(config=session_config)
        self.session.run(initializer)
        print("init _____")

    def get_tokenized_sents_embeddings_USE(self, sents,expand=False):
        """Evaluate the embedding tensor for ``sents``.

        Args:
            sents: the strings fed into ``text_input``.
            expand: accepted but not used.

        Returns:
            Whatever the session returns for ``embedded_text``.
        """
        return self.session.run(self.embedded_text, feed_dict={self.text_input: sents})

use_embedding = UseSentenceEmbedding()

INFO:absl:hub.KerasLayer is not available because TensorFlow version is less than 1.14
INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/3'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/3, Total size: 810.60MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/3'.


init _____


In [14]:
import numpy as np
!pip install inflection


import inflection

from nltk.stem import PorterStemmer 
ps = PorterStemmer()
from gzip import open as gopen
from pandas.core.common import flatten
import gensim.models.poincare as poincare
def get_cleaned_taxonomy(taxonomy):
  """Turn '>>'-separated taxonomy paths into space-separated strings.

  Args:
    taxonomy: iterable of strings such as ``"science>>motion"``.

  Returns:
    A list of the same length where every ``>>`` separator has been
    replaced by a single space.
  """
  return [path.replace(">>", " ") for path in taxonomy]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting inflection
  Downloading inflection-0.5.1-py2.py3-none-any.whl (9.5 kB)
Installing collected packages: inflection
Successfully installed inflection-0.5.1


In [15]:
qc_science_data = pd.read_csv("/content/drive/MyDrive/dataEM/train_taxonomy_prediction.csv")
label_set = list(set(qc_science_data["board_syllabus"].values))

In [16]:
len(label_set)

312

In [17]:
targets = get_cleaned_taxonomy(label_set)
len(targets)

312

In [18]:
taxonomy_vectors = use_embedding.get_tokenized_sents_embeddings_USE(targets)
taxonomy_vectors.shape

(312, 512)

In [19]:
train_poincare_tensor = torch.tensor(taxonomy_vectors,dtype=torch.float)

In [20]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split


def get_tokenized_input(text):
    """Tokenize every sentence with the notebook-level ``tokenizer``.

    Each sentence is passed to ``tokenizer.encode_plus`` with special tokens
    enabled, padding/truncation to 128 tokens, an attention mask, and
    PyTorch tensors as the return type. The per-sentence tensors are then
    concatenated along dimension 0.

    Args:
        text: indexable sequence of sentences; ``text[0]`` is printed, so it
            must contain at least one element.

    Returns:
        ``(input_ids, attention_masks)`` — the two concatenated tensors.
    """
    encodings = [
        tokenizer.encode_plus(
            sentence,                       # Sentence to encode.
            add_special_tokens = True,      # Add '[CLS]' and '[SEP]'.
            max_length = 128,               # Pad & truncate all sentences.
            pad_to_max_length = True,
            truncation=True,
            return_attention_mask = True,   # Construct attention masks.
            return_tensors = 'pt',          # Return PyTorch tensors.
        )
        for sentence in text
    ]

    # Stack the per-sentence results into one tensor each.
    input_ids = torch.cat([encoded['input_ids'] for encoded in encodings], dim=0)
    attention_masks = torch.cat([encoded['attention_mask'] for encoded in encodings], dim=0)

    # Show the first sentence next to its token ids as a sanity check.
    print('Original: ', text[0])
    print('Token IDs:', input_ids[0])
    return input_ids, attention_masks

In [21]:
input_ids, attention_masks = get_tokenized_input(data["text"].values)

Original:  ffffforceorceorceorceorce andandandandand l l l l lawsawsawsawsaws ofofofofof m m m m motion in the previous chapter, we described the motion of an object along a straight line in terms of its position, velocity and acceleration. we saw that such a motion can be uniform or non-uniform. we have not yet discovered what causes the motion. why does the speed of an object change with time? do all motions require a cause? if so, what is the nature of this cause? in this chapter we shall make an attempt to quench all such curiosities. for many centuries, the problem of motion and its causes had puzzled scientists and philosophers. a ball on the ground, when given a small hit, does not move forever. such observations suggest that rest is the “natural state” of an object. this remained the belief until galileo galilei and isaac newton developed an entirely different approach to understand motion. in our everyday life we observe that some effort is required to put a stationary object 

In [22]:

import sys
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from matplotlib import pyplot as plt
from torch.nn.modules.loss import HingeEmbeddingLoss
from random import randint

from tqdm import tqdm
import time
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from matplotlib import pyplot as plt
from torch.nn.modules.loss import HingeEmbeddingLoss
from random import randint
import torch.nn.functional as F

import time
import argparse
# Cosine similarity reduced over dim 0 (used on pairs of 1-D vectors).
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
# Cosine similarity reduced over dim 1 (row-wise; a single row broadcasts
# against a matrix of rows).
cos_label = nn.CosineSimilarity(dim=1, eps=1e-5)

# Pairwise p=2 distance. The extra `nn.PairwiseDistance(p=2)` expression that
# followed built a module and discarded it, so it has been dropped.
dist = torch.nn.PairwiseDistance(p=2.0, eps=1e-06)
class MHSA(nn.Module):
  """Attention across the head slices of a single projected vector.

  Every input row is projected to ``num_heads * kqv_dim`` features and
  viewed as ``num_heads`` slices of width ``kqv_dim``. Scaled dot-product
  attention is then computed between those slices (the score matrix is
  ``num_heads x num_heads`` per row), and the result is projected back to
  ``emb_dim``.

  Args:
    emb_dim: width of the input and output rows.
    kqv_dim: width of each head slice.
    num_heads: number of head slices.
  """

  def __init__(self, emb_dim, kqv_dim, num_heads=2):
    super().__init__()
    self.emb_dim = emb_dim
    self.kqv_dim = kqv_dim
    self.num_heads = num_heads

    projected_width = kqv_dim * num_heads
    self.w_k = nn.Linear(emb_dim, projected_width, bias=False)
    self.w_q = nn.Linear(emb_dim, projected_width, bias=False)
    self.w_v = nn.Linear(emb_dim, projected_width, bias=False)
    self.w_out = nn.Linear(projected_width, emb_dim)

  def forward(self, query, key, value):
    """Return the attended rows, shape ``(batch, emb_dim)``.

    ``query`` must be 2-D; its first dimension is used as the batch size
    for reshaping all three projections.
    """
    batch_size, _ = query.shape
    head_dim = self.kqv_dim
    n_heads = self.num_heads
    split_shape = (batch_size, n_heads, head_dim)

    q_heads = self.w_q(query).view(*split_shape)
    k_heads = self.w_k(key).view(*split_shape)
    v_heads = self.w_v(value).view(*split_shape)

    # (b, h, e) @ (b, e, h) -> (b, h, h), scaled by sqrt(e).
    scores = (q_heads @ k_heads.transpose(2, 1)) / np.sqrt(head_dim)
    weights = F.softmax(scores, dim=2)

    # (b, h, h) @ (b, h, e) -> (b, h, e), flattened back to (b, h * e).
    mixed = (weights @ v_heads).contiguous().view(batch_size, n_heads * head_dim)
    return self.w_out(mixed)
# Neural network classifier

# Discussion TODOs
# try hierarchical interaction (TODO)

# try bringing in modalities (image, or video)

# Go from classical algorithm -> deep learning

class MulticlassClassifier(nn.Module):
    """BERT encoder whose output is built by attending from label vectors to tokens.

    ``forward`` projects the last BERT hidden layer to 512 features, picks
    for every sample the five label embeddings closest to the sample's
    best-matching label (using the notebook-level ``train_poincare_tensor``
    and ``cos_label``), and uses those five vectors as attention queries
    over the projected token states. The five attended vectors are summed.

    NOTE(review): ``train_poincare_tensor`` is read as a global and must be
    on the same device as the model inputs — confirm at the call site.
    """

    def __init__(self,bert_model_path):
        """Load BERT from ``bert_model_path`` and create the projection/attention layers."""
        super(MulticlassClassifier,self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_path,output_hidden_states=True,output_attentions=False)
        # dropout, fc2 and multi_head_attention are not used by forward(); the
        # latter two still contribute parameters to the state_dict, so they
        # are left in place for checkpoint compatibility.
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(384, 512)
        self.multi_head_attention = MHSA(512, 512,8)
        self.multihead_attn = torch.nn.MultiheadAttention(embed_dim = 512,  num_heads = 8, batch_first=True)

    def forward(self,tokens,masks, targets=None, skip_attention=False):
        """Return one 512-wide vector per input sample.

        Args:
            tokens: token id tensor passed to BERT.
            masks: attention mask passed to BERT.
            targets: accepted but not used.
            skip_attention: accepted but not used.
        """
        # Index 2 is presumably the tuple of per-layer hidden states enabled by
        # output_hidden_states=True — confirm for the installed transformers
        # version. The last entry is the final layer.
        hidden_states = self.bert(tokens, attention_mask=masks)[2]
        x = self.fc1(hidden_states[-1])

        # Loop-invariant: unit-length copy of every label embedding.
        label_unit = F.normalize(train_poincare_tensor,p=2,dim=1)

        targets_curr_batch = []
        for input_x in x:
            # Best-matching label for the mean token vector of this sample.
            similarity = cos_label(torch.mean(input_x,dim=0).reshape(1,-1), train_poincare_tensor)
            _, nearest = torch.topk(similarity,1,largest=True)

            # Squared Euclidean distance between unit vectors, from that label
            # to every label; keep the five closest (includes the label itself).
            anchor_unit = F.normalize(train_poincare_tensor[nearest],p=2,dim=1)
            target_distances = (anchor_unit - label_unit).pow(2).sum(1)
            _, neighbours = torch.topk(target_distances,5,largest=False)
            targets_curr_batch.append(train_poincare_tensor[neighbours].reshape(1,5,512))

        targets_batch = torch.cat(targets_curr_batch, dim=0)
        # Queries are the label vectors; keys and values are the token states.
        attn_output, _ = self.multihead_attn(targets_batch, x, x)

        return torch.sum(attn_output,dim=1)

class MyHingeLoss(torch.nn.Module):
    """Margin ranking loss on cosine similarity with in-batch negatives.

    For every sample ``i`` five negatives ``j != i`` are drawn at random from
    the batch and the term
    ``relu(margin - cos(target[i], output[i]) + cos(target[j], output[i]))``
    is accumulated; the result is averaged over ``batch * 5`` terms.

    Args:
        margin: required gap between the positive and negative similarity.
    """

    def __init__(self, margin):
        super(MyHingeLoss, self).__init__()
        self.margin = margin

    def forward(self, output, target):
        """Compute the loss for ``output`` and ``target`` (one row per sample).

        A batch with fewer than two rows has no negatives to draw from; a
        zero that stays connected to ``output`` is returned in that case.
        """
        batch_size = len(output)
        if batch_size < 2:
            return output.sum() * 0.0

        loss = 0
        for i in range(batch_size):
            v_image = F.normalize(output[i],p=2,dim=0)
            t_label = F.normalize(target[i],p=2,dim=0)
            positive = F.cosine_similarity(t_label, v_image, dim=0, eps=1e-6)

            # The draw counter must not reuse `i`: the negative has to be
            # compared against the sample index, not the draw number.
            for _ in range(5):
                # Uniform over every index except i, without a retry loop.
                j = randint(0, batch_size - 2)
                if j >= i:
                    j += 1
                t_j = F.normalize(target[j],p=2,dim=0)
                negative = F.cosine_similarity(t_j, v_image, dim=0, eps=1e-6)
                loss += torch.relu(self.margin - positive + negative)
        return loss / (batch_size*5)


In [None]:
from transformers import BertModel, AdamW, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertModel, AdamW, BertConfig

# Build the classifier on top of the pretrained BERT model and restore the
# fine-tuned weights. map_location makes the checkpoint load onto whichever
# device was selected at the top of the notebook (GPU or the CPU fallback).
model = MulticlassClassifier('bert-base-uncased')
model.load_state_dict(torch.load('/content/drive/MyDrive/model_euclidean_USE_cos_final_attention_V3/model_weights', map_location=device))

# Move the model to the selected device instead of assuming CUDA is present.
model.to(device)


In [24]:
def get_inference_taxonomies(input_ids, attention_masks, label_set, train_poincare_tensor):
    """Predict the best-matching labels for every encoded text.

    Each sample is passed through the notebook-level ``model`` on its own;
    the model output is compared with every row of ``train_poincare_tensor``
    by cosine similarity and the labels of the highest-scoring rows are kept.

    Args:
        input_ids: token id tensor, one row per sample.
        attention_masks: attention mask tensor, one row per sample.
        label_set: labels, positionally matching the rows of
            ``train_poincare_tensor``.
        train_poincare_tensor: label embedding matrix, one row per label.

    Returns:
        A list with one numpy array of labels per sample (three labels, or
        fewer when there are fewer than three label rows).
    """
    model.eval()
    cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

    # Follow the model's device rather than hard-coding 'cuda'.
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_masks = attention_masks.to(model_device)
    train_poincare_tensor = train_poincare_tensor.to(model_device)

    # Loop-invariant work done once.
    label_array = np.array(label_set)
    top_k = min(3, train_poincare_tensor.shape[0])

    predictions = []
    for input_id,attention_mask in zip(input_ids, attention_masks):
        with torch.no_grad():
            outputs = model(input_id.reshape(1,-1),attention_mask.reshape(1,-1))

        similarities = cos(outputs,train_poincare_tensor)
        _, indices = torch.topk(similarities,top_k,largest=True)
        predictions.append(label_array[indices.cpu().numpy()])
        # Running count of processed samples.
        print(len(predictions))
    return predictions
    

In [None]:
train_poincare_tensor = train_poincare_tensor.to(device)
predictions = get_inference_taxonomies(input_ids, attention_masks, label_set, train_poincare_tensor)

In [None]:
predictions

In [27]:
# Collapse each sample's array of predicted labels into one '*'-separated
# string. The builtin `str` replaces the `np.str` alias, which NumPy
# deprecated in 1.20 and removed in later releases.
final_predictions = pd.Series(
    ['*'.join(pred.tolist()) for pred in predictions],
    dtype=str,
)
data["tagrec++_predictions"] = final_predictions
data

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Unnamed: 0.1,Unnamed: 0,taxonomy,text,split_taxonomies,cleaned_taxonomy,tagrec++_predictions
0,0,class/9/Science/motion,ffffforceorceorceorceorce andandandandand l l ...,"[class, 9, Science, motion]",class>>9>>Science>>motion,science>>motion and time*science>>motion*scien...
1,1,class/9/Science/life processes,wwwwworkorkorkorkork andandandandand e e e e e...,"[class, 9, Science, life processes]",class>>9>>Science>>life processes,science>>motion and time*science>>body movemen...
2,2,class/9/Science/gravitation,"in chapters 8 and 9, we have learnt about the ...","[class, 9, Science, gravitation]",class>>9>>Science>>gravitation,science>>force and pressure*science>>force and...
3,3,class/9/Science/motion,"in everyday life, we see some objects at rest ...","[class, 9, Science, motion]",class>>9>>Science>>motion,science>>motion and time*science>>motion and m...
4,4,class/9/Science/water,atter ininininin o o o o oururururur s s s s s...,"[class, 9, Science, water]",class>>9>>Science>>water,science>>matter in our surroundings*social sci...
...,...,...,...,...,...,...
81,81,"class/10/Science/acids, bases and salts","acids, bases and salts you have learnt in your...","[class, 10, Science, acids, bases and salts]","class>>10>>Science>>acids, bases and salts","science>>acids, bases and salts*science>>chemi..."
82,82,class/10/Science/chemical reactions and equations,“facts are not science — as the dictionary is ...,"[class, 10, Science, chemical reactions and eq...",class>>10>>Science>>chemical reactions and equ...,science>>chemical reactions and equations*scie...
83,83,class/10/Science/periodic classification of el...,periodic classification of elements in class i...,"[class, 10, Science, periodic classification o...",class>>10>>Science>>periodic classification of...,science>>periodic classification of elements*c...
84,84,class/10/Science/,"carbon and its in the last chapter, we came to...","[class, 10, Science, ]",class>>10>>Science>>,science*science>>natural resources*science>>me...


In [None]:
data.to_csv("ncert_tagrec_tagged.csv", index=False)