In [None]:
# Install the third-party packages used below: node2vec (graph embeddings)
# and farm-haystack with the listed extras (reader / document store).
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install node2vec
!pip install farm-haystack[colab,sql,only-faiss-gpu,weaviate,pinecone,opensearch,graphdb,inmemorygraph,crawler,preprocessing,ocr,onnx-gpu,ray,dev,inference]



In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tqdm
import pickle
import random
from node2vec import Node2Vec
import string

In [None]:
# Fetch the notebook inputs: three files via gdown (labelled imdb, test and
# finetune) plus the pre-computed embeddings.pkl and network.pkl from GitHub.
!gdown 1lKeP9E_S8uuaMu12lv8snzVKXXHG99ck #imdb
!gdown 1YQhw_utiHigVsyv6s7sUlmFO2oaiyztX #test
!gdown 1m84oGmiCP06CBwiLgUTkuV9708-xcLQ0 #finetune
!wget https://github.com/Prayas-Agrawal/635-a2-embeddings/raw/master/embeddings.pkl #embeddings of KH
!wget https://github.com/Prayas-Agrawal/635-a2-embeddings/raw/master/network.pkl #KG

In [None]:
# Load the IMDB movie table; later cells read its Title, Genre, Actors,
# Director and Year columns.
df = pd.read_csv("./IMDB-Movie-Data.csv")
df.head()


In [None]:
# Parse the fine-tuning file into two parallel lists: every non-blank line
# that does not start with "Answer:" is a question, and every "Answer:" line
# contributes the text after the prefix as an answer.
finetune_q = []
finetune_a = []
with open("./dataset.txt", 'r') as file:
  # Iterate the file lazily instead of materialising it with readlines().
  for l in file:
    if(l.startswith("Answer:")):
      # Split on the bare prefix: the line is known to start with "Answer:",
      # so index 1 always exists, even when no space follows the colon
      # (splitting on "Answer: " raised IndexError for "Answer:foo").
      finetune_a.append(l.split("Answer:")[1].strip())
    elif(len(l.strip()) > 0):
      finetune_q.append(l.strip())


In [None]:
# Graph attributes per movie: title, genres, director, actors and year.
# Triple format inspired by the FB15k dataset.
def title():
    """Return the constant label "title"."""
    label = "title"
    return label
def director(movie, dir):
    """Build the [movie, "_has_director", director] triple, stringifying both ends."""
    subject, obj = str(movie), str(dir)
    return [subject, "_has_director", obj]
def actor(movie, actor):
    """Build the [movie, "_has_actor", actor] triple, stringifying both ends."""
    subject, obj = str(movie), str(actor)
    return [subject, "_has_actor", obj]
def year(movie, year):
    """Build the [movie, "_has_year", year] triple, stringifying both ends."""
    subject, obj = str(movie), str(year)
    return [subject, "_has_year", obj]
def genre(movie, genre):
    """Build the [movie, "_has_genre", genre] triple, stringifying both ends."""
    subject, obj = str(movie), str(genre)
    return [subject, "_has_genre", obj]
def clean(s):
  """Drop every character found in string.punctuation and lower-case the rest.

  Whitespace is kept as-is, including leading or trailing spaces.
  """
  kept = []
  for ch in s:
    if ch in string.punctuation:
      continue
    kept.append(ch.lower())
  return "".join(kept)

# Raw (uncleaned) attribute values seen in the movie table.
ACTORS = []
GENRES = []
DIRS = []
YEARS = []
# Knowledge-graph triples [subject, relation, object], one per movie attribute.
g = []

# One pass over the frame loaded earlier. The cell used to re-read the CSV
# half-way through and iterate a second time; `df` is already in scope and is
# the same file, so the extra read and loop were redundant.
# NOTE(review): pieces produced by split(",") are not stripped, so if the CSV
# separates names with ", " the resulting nodes keep a leading space. Check
# against the pickled graph/embeddings before changing this.
for i,row in df.iterrows():
    genres = row["Genre"].split(",")
    actors = row["Actors"].split(",")

    GENRES.extend(genres)
    YEARS.append(row["Year"])
    DIRS.append(row["Director"])
    ACTORS.extend(actors)

    movie = clean(row["Title"])
    for _genre in genres:
        g.append(genre(movie, clean(_genre)))
    for _actor in actors:
        g.append(actor(movie, clean(_actor)))
    g.append(director(movie, clean(row["Director"])))
    g.append(year(movie, row["Year"]))

# De-duplicate the vocabularies (resulting order is arbitrary).
ACTORS = list(set(ACTORS))
GENRES = list(set(GENRES))
DIRS = list(set(DIRS))
YEARS = list(set(YEARS))

tf = pd.DataFrame(g, columns=["subject", "relation", "object"])
tf.head()


In [None]:

def constructEmbeddings():
  """Rebuild the graph from `tf`, fit node2vec on it and pickle both results.

  The triples in the global frame `tf` become an undirected multigraph
  (subject -> object, remaining columns as edge attributes). The graph is
  written to './network.pkl' and the fitted model to './embeddings.pkl'.

  Returns:
    (network, emb): the graph and the model returned by `Node2Vec.fit`.
  """
  directed = nx.from_pandas_edgelist(
    tf, "subject", "object", edge_attr=True, create_using=nx.MultiDiGraph()
  )
  network = directed.to_undirected()
  walker = Node2Vec(network, dimensions=16, walk_length=30, num_walks=200, workers=4)
  emb = walker.fit(window=3, min_count=1, batch_words=4)
  # Persist the graph first, then the embeddings.
  for target, payload in (('./network.pkl', network), ('./embeddings.pkl', emb)):
    with open(target, 'wb') as f:
      pickle.dump(payload, f)
  return network, emb

## FAISS

In [None]:
# Load the pre-computed embedding model and knowledge graph from the pickle
# files in the working directory.
# NOTE(review): pickle.load can execute arbitrary code from the file, and
# these files are downloaded over the network earlier in the notebook. Only
# run this on copies you trust.
network = None
emb = None
with open("./embeddings.pkl", 'rb') as f:
    emb = pickle.load(f)
with open("./network.pkl", 'rb') as f:
    network = pickle.load(f)
print(network)

# To reconstruct the graph and embeddings from scratch instead, uncomment:
# network, emb = constructEmbeddings()

In [None]:
import numpy as np

# Row i of kg_embs must hold the embedding of the movie in row i of df,
# because the FAISS search below maps result positions straight back to df
# rows. Look each movie up by its cleaned title (the node name used when the
# triples were built) through `emb.wv`, as the rest of the notebook does,
# instead of subscripting the model with a row number.
# NOTE(review): raises KeyError if a cleaned title is missing from the loaded
# embeddings. Confirm the pickles were produced from this same CSV.
kg_embs = np.array(
  [emb.wv[clean(movie_title)] for movie_title in df["Title"]],
  dtype=np.float32,  # FAISS works on float32 vectors
)
kg_embs.shape

In [None]:
from sentence_transformers import SentenceTransformer, models
from torch import nn
import faiss

# Sentence encoder: BERT token embeddings -> pooling -> dense projection.
# The projection width must equal the width of `kg_embs`, because the FAISS
# index is created with `kg_embs.shape[1]` dimensions and rejects queries of
# any other size (the previous hard-coded 512 only worked for 512-d graph
# embeddings).
# NOTE(review): the Dense layer is newly constructed here and never trained in
# this notebook, so the projected query space is not aligned with the node2vec
# space. Confirm whether a trained projection was intended.
out_len = kg_embs.shape[1]
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=out_len, activation_function=nn.Tanh())

model_s = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [None]:
# FAISS flat L2 index sized to the width of the node-embedding matrix.
# (An earlier draft sized it from sentence embeddings: d = sentence_embeddings.shape[1].)
index = faiss.IndexFlatL2(kg_embs.shape[1])
index.is_trained


In [None]:
# Add every row of kg_embs to the index, then display how many vectors it holds.
index.add(kg_embs)
index.ntotal

In [None]:
# Number of nearest neighbours requested from the index for the demo query.
k = 4
ques = "Which 2016 film starred an actor who is famous for playing a superhero with a suit that grants incredible abilities?"
# Encode the question with the sentence model (passed as a one-element list).
xq = model_s.encode([ques])

In [None]:
%%time
sliceLen = 128-1
D, I = index.search(xq[:sliceLen], k)  # search
for i in range(I.shape[1]):
  print(df.loc[I[0][i]])
  print('*'*80)

## Haystack

In [None]:
class TorchDatasetFormat(Dataset):
  """Torch dataset that turns QA samples into tokenised extractive-QA features.

  Each element of `data` is a dict with the keys:
    'q'       -- the question string,
    'context' -- the context string,
    'a'       -- a dict with 'text' (the answer, a string or a one-element
                 list) and 'start' (a list whose first entry is the answer's
                 character offset inside the context).
  """

  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __getitem__(self, idx):
    """Tokenise sample `idx`; return its input tensors and answer span.

    With a fast tokenizer, 'start_positions' / 'end_positions' are the token
    indices of the first and last answer token, or (0, 0) when the answer is
    not inside the truncated context. With a slow tokenizer no offsets are
    available, so the character offsets (start, start + len(answer)) are
    returned unchanged.
    """
    sample = self.data[idx]
    # Offset information exists only for fast tokenizers; requesting it from a
    # pure-Python tokenizer raises NotImplementedError.
    has_offsets = bool(getattr(self.tokenizer, 'is_fast', False))
    inputs = self.tokenizer.encode_plus(
      sample['q'],
      sample['context'],
      add_special_tokens=True,
      return_tensors='pt',
      max_length=128,
      truncation=True,
      padding='max_length',
      return_offsets_mapping=has_offsets
    )
    start, end = self._answer_span(inputs, sample['a'], has_offsets)
    return {
      'input_ids': inputs['input_ids'].squeeze(),
      'attention_mask': inputs['attention_mask'].squeeze(),
      'start_positions': start,
      'end_positions': end
    }

  @staticmethod
  def _answer_span(inputs, answer, has_offsets):
    """Locate the answer span; token indices when offsets exist, else chars."""
    text = answer['text']
    if isinstance(text, (list, tuple)):
      text = text[0]
    char_start = answer['start'][0]
    # Use the length of the whole answer. `len(text[0])` on a plain string
    # measured only its first character, so the span was always 1 long.
    char_end = char_start + len(text)
    if not has_offsets:
      return char_start, char_end
    if char_start < 0 or not text:
      return 0, 0
    # sequence_index=1 selects the context (second sequence of the pair).
    first = inputs.char_to_token(0, char_start, sequence_index=1)
    last = inputs.char_to_token(0, char_end - 1, sequence_index=1)
    if first is None or last is None:
      # The answer was truncated away or does not align with a token.
      return 0, 0
    return first, last

  def __len__(self):
    return len(self.data)

# Build one training sample per fine-tuning question. The context is the
# answer plus every graph node whose name occurs in the question, shuffled and
# joined with '.', and the answer start is its character offset in that
# context.
dataset = []
for i in range(len(finetune_q)):
    pt = {}
    pt['q'] = str(finetune_q[i])
    # Lower-case the question once instead of once per graph node.
    question_lower = pt['q'].lower()
    ans = finetune_a[i].lower()
    context = [ans]
    for node in network.nodes():
        # Test for None before the membership check; in the old order the
        # guard could never take effect.
        if node is not None and node in question_lower:
            context.append(node)
    random.shuffle(context)
    pt['context'] = '.'.join(context)
    pt['a']= {
        "text": str(finetune_a[i].lower()),
        "start": [pt['context'].find(ans)]
    }
    dataset.append(pt)

In [None]:
import torch
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# from transformers import DebertaV2Tokenizer, DebertaV2ForQuestionAnswering
# tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
# model = T5ForConditionalGeneration.from_pretrained("microsoft/deberta-v3-base")

# Load the tokenizer through AutoTokenizer so the fast implementation is used.
# The dataset class asks for `return_offsets_mapping`, which the slow
# (pure-Python) T5Tokenizer does not support.
from transformers import AutoTokenizer, T5ForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = T5ForQuestionAnswering.from_pretrained("google/flan-t5-base")

# from transformers import AutoTokenizer, DistilBertForQuestionAnswering
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# from transformers import AutoTokenizer, BertForQuestionAnswering

# tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
# model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

# from transformers import AutoTokenizer, AlbertForQuestionAnswering
# import torch

# tokenizer = AutoTokenizer.from_pretrained("twmkn9/albert-base-v2-squad2")
# model = AlbertForQuestionAnswering.from_pretrained("twmkn9/albert-base-v2-squad2")

# from transformers import AutoTokenizer, DistilBertForQuestionAnswering
# import torch

# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# from transformers import AutoTokenizer, IBertForQuestionAnswering
# import torch

# tokenizer = AutoTokenizer.from_pretrained("kssteven/ibert-roberta-base")
# model = IBertForQuestionAnswering.from_pretrained("kssteven/ibert-roberta-base")




In [None]:
def train():
  """Fine-tune the global `model` on `dataset` and save it with the tokenizer.

  Runs 10 epochs of AdamW (lr=0.001) over shuffled batches of 16 on `DEVICE`,
  prints the mean batch loss after every epoch, and writes the model and
  tokenizer to './models/finetuned'.
  """
  model.to(DEVICE)
  optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
  torchData = TorchDatasetFormat(dataset, tokenizer)
  train_loader = DataLoader(torchData, batch_size=16, shuffle=True)
  for epoch in range(10):
    model.train()
    _currLoss = 0.0
    for batch in tqdm.tqdm(train_loader):
      optimizer.zero_grad()
      # Move the whole batch to the device the model lives on.
      inputs = {key: value.to(DEVICE) for key, value in batch.items()}
      outputs = model(**inputs)
      # The batch already carries start_positions / end_positions, so the model
      # returns its own span loss. The previous MSE over argmax indices wrapped
      # in a new leaf tensor was cut off from the model's graph, so
      # backward() never updated any weight.
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      # Accumulate a plain float so per-batch graphs are not kept alive.
      _currLoss = _currLoss + loss.item()
    # Each batch loss is already a mean, so average over batches, not samples.
    print('Epoch, Loss:' + str(epoch + 1), str(_currLoss / max(len(train_loader), 1)))
  model.save_pretrained('./models/finetuned')
  tokenizer.save_pretrained('./models/finetuned')



In [None]:
# Archive the fine-tuned model directory into a single zip file.
# NOTE(review): these are absolute /content paths, while train() saves to the
# relative './models/finetuned'. They match only when the working directory
# is /content; confirm.
!zip -r /content/finetuned.zip /content/models/finetuned


In [None]:
from haystack.document_stores.memory import InMemoryDocumentStore
from haystack.nodes import FARMReader

# Shared cap used below: documents written to the store per query, the
# reader's top_k, the number of nodes predict() returns, and the number of
# neighbours listed in each document built by docs_hf_format().
LIMIT = 10
def parseTestFile(path='./test.txt'):
  """Read the test questions and expected answers from a fixed-layout file.

  Layout (0-based line numbers): even lines 0..38 are questions; lines 42..61
  are answers of the form '<label> - <answer>'. All other lines are ignored.

  Args:
    path: file to read; defaults to './test.txt'.

  Returns:
    (ques, ans): stripped question strings and stripped, lower-cased answers.

  Raises:
    IndexError: if an answer line contains no ' - ' separator.
  """
  ques = []
  ans = []
  with open(path, 'r') as file:
    for i, line in enumerate(file):
      if i % 2 == 0 and i < 39:
        ques.append(line.strip())
      elif i > 41 and i < 62:
        # Split only on the first separator so an answer that itself contains
        # ' - ' is kept whole rather than truncated.
        ans.append(line.split(" - ", 1)[1].strip().lower())
  return ques, ans

def predict(reader, network, emb, query, docs_hf):
  """Answer `query` by reading the documents nearest to its graph embedding.

  Args:
    reader: object whose predict(query=..., documents=..., top_k=...) returns
      a dict with an 'answers' list of objects exposing `.answer`.
    network: graph whose node names are matched against the query text.
    emb: embedding model; vectors are read through `emb.wv[node]`.
    query: the question string.
    docs_hf: list of document dicts, each with an 'embedding' entry. It is
      not modified.

  Returns:
    Up to LIMIT node names found inside the reader's answers, in answer order
    and without duplicates.
  """
  query_lower = query.lower()
  matching = [node for node in network.nodes() if node in query_lower]
  if matching:
    # Query vector is the mean embedding of the nodes mentioned in the query.
    qv = np.mean([emb.wv[node] for node in matching], axis=0)
  else:
    # No node is mentioned: fall back to a random node's vector. The old
    # `qv is not None` test was always true, so this case divided by zero.
    qv = emb.wv[random.choice(list(network.nodes()))]
  # Rank a copy so the caller's list is not reordered as a side effect.
  ranked = sorted(docs_hf, key = lambda d: np.square(d['embedding'] - qv).mean())
  store = InMemoryDocumentStore()
  store.write_documents(ranked[:LIMIT])
  # The reader takes a list of documents, not a document store.
  result = reader.predict(query = query, documents = store.get_all_documents(), top_k = LIMIT)
  ret = []
  for ans in result['answers']:
    t = [node for node in network.nodes if node in ans.answer and node not in ret]
    ret.extend(t)
  return ret[:LIMIT]

def docs_hf_format(network, embeddings=None, topn=None):
  """Build one document dict per graph node.

  Args:
    network: graph providing `nodes()`.
    embeddings: embedding model exposing `.wv`; defaults to the global `emb`,
      which keeps existing one-argument calls working.
    topn: number of most-similar nodes appended to each document's text;
      defaults to the global `LIMIT`.

  Returns:
    A list of dicts with 'node' (the node), 'content' (the node name followed
    by its most similar node names, space-separated) and 'embedding' (the
    node's vector).
  """
  model = emb if embeddings is None else embeddings
  neighbours = LIMIT if topn is None else topn
  docs_hf = []
  for node in network.nodes():
    similar = [str(hit[0]) for hit in model.wv.most_similar(node, topn = neighbours)]
    docs_hf.append({
      'node': node,
      'content': ' '.join([str(node)] + similar),
      'embedding': model.wv[node],
    })
  return docs_hf



In [None]:
# Load the reader from the model identifier below, requesting GPU use.
# NOTE(review): presumably this is the uploaded result of the fine-tuning
# above; confirm.
reader = FARMReader('GoatMilk98/635-a2-model-flan', use_gpu = True)

In [None]:
# Build the per-node documents once; the evaluation loop passes this list to
# predict() for every question.
docs_hf = docs_hf_format(network)

In [None]:
# Evaluate on the test file: a question counts as correct when its expected
# answer appears verbatim among the predicted node names.
ques, ans = parseTestFile()
acc = 0
for i, question in enumerate(ques):
  print("q:", question)
  print("a:", ans[i])
  preds = predict(reader, network, emb, question, docs_hf)
  print("pred:", ','.join(preds))
  acc += int(ans[i].lower() in preds)

print('acc: ' + str(acc * 100/len(ques)))