# Install Python libraries

In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install bitsandbytes-cudacuda113
!pip install -U sentence-transformers
!pip install rich
!pip install datasets
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install pinecone-client

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
[31mERROR: Could not find a version that satisfies the requirement bitsandbytes-cudacuda113 (from versions: none)[0m
[31mERROR: No matching distribution found for bitsandbytes-cudacuda113[0m
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 22.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 15.1 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any

# Imports

In [None]:
import json
import os
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pinecone
from rich import print
from rich.pretty import pprint


# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive/ so that files stored there (the dataset
# directory used below lives under /content/drive/MyDrive/) can be read.
from google.colab import drive
drive.mount('/content/drive/')



Mounted at /content/drive/


In [None]:
import os
def get_dataset(file_path="/content/drive/MyDrive/discussion_dataset"):
  """Load the answered discussions found in ``file_path`` into a DataFrame.

  Every directory entry whose name contains "json" is parsed as a JSON document.
  A document contributes one row when it has a non-null ``answer`` and all of the
  fields ``title``, ``bodyText``, ``id``, ``answer.bodyText`` and ``answer.url``;
  documents without an answer or with missing fields are skipped.

  Args:
    file_path: Directory holding the discussion JSON files. Defaults to the
      Google Drive location this notebook was written for.

  Returns:
    A DataFrame of dtype object with the columns
    ``["question", "context", "answer", "id", "url"]`` (title, body text,
    answer body text, discussion id, answer url), or ``None`` when no usable
    document was found. Rows appear in reverse directory-listing order.

  Raises:
    OSError: if ``file_path`` cannot be listed or a file cannot be opened.
    json.JSONDecodeError: if a selected file does not contain valid JSON.
  """
  columns = ["question", "context", "answer", "id", "url"]
  files = os.listdir(file_path)
  print(files)

  records = []
  for discussion in files:
    if "json" not in discussion:
      continue
    # The context manager closes the handle even if parsing fails.
    with open(os.path.join(file_path, discussion), encoding="utf-8") as f:
      data = json.load(f)
    try:
      answer = data["answer"]
      if answer is None:
        continue
      records.append(
        [data["title"], data["bodyText"], answer["bodyText"], data["id"], answer["url"]]
      )
    except (KeyError, TypeError):
      # Missing field or unexpected JSON shape: skip this discussion, as before,
      # but no longer hide unrelated errors behind a bare `except`.
      continue

  if not records:
    return None
  # Earlier versions prepended each new row to the frame; reversing the collected
  # records once reproduces that row order without the quadratic concat.
  records.reverse()
  return pd.DataFrame(records, columns=columns, dtype=object)
# Load the discussions once; the following cells read this module-level frame.
dataset_for_model  = get_dataset()
  

In [None]:
# Sanity check: get_dataset() returns None when no usable record was found,
# so anything other than a DataFrame here means the load step needs attention.
print(type(dataset_for_model))

In [None]:
# Quick look at the loaded data: first rows, total number of cells, and (rows, columns).
print(dataset_for_model.head())
print(dataset_for_model.size)
print(dataset_for_model.shape)
# Hold out a third of the rows with a fixed seed. The two parts get their own names so
# that they neither shadow the built-in `eval` nor get silently replaced by the `train`
# list of InputExample objects that a later cell builds.
train_split, eval_split = train_test_split(dataset_for_model, test_size=0.33, random_state=42)

In [None]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm

# InputExample is the record format used when training with the sentence-transformers library.
# tqdm (imported above) provides progress bars; this cheap loop does not need one.
# Each example is a (question, context) pair without a label, because the model is
# trained with a multiple-negatives ranking loss.
# NOTE(review): every row of dataset_for_model is turned into a training example here,
# not only a held-out training part — confirm that training on the full dataset is intended.
train = [
    InputExample(texts=[row["question"], row["context"]])
    for _, row in dataset_for_model.iterrows()
]

In [None]:
# MNR loss treats the other pairs of a batch as negatives, so a batch must not contain
# duplicates; the no-duplicates loader from sentence-transformers is used for that reason.
from sentence_transformers import datasets

batch_size = 24
loader = datasets.NoDuplicatesDataLoader(train, batch_size=batch_size)

In [None]:
from sentence_transformers import models, SentenceTransformer

# Transformer backbone. The checkpoint loaded here is 'nreimers/albert-small-v2' (an ALBERT
# model), even though the variable is called `bert`.
# NOTE(review): an earlier comment said "mpnet model is more accurate than the bert model",
# but no mpnet checkpoint is loaded in this cell — confirm which model is intended.
# The pooling layer is important: it turns the token embeddings into a single sentence
# embedding, here by averaging over tokens (pooling_mode_mean_tokens=True).
bert = models.Transformer('nreimers/albert-small-v2')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])
model

Downloading:   0%|          | 0.00/790 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/44.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/428 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: AlbertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
# Initialize the MNR (MultipleNegativesRankingLoss) objective for training: every
# (question, context) pair is a positive, and all other contexts in the batch are
# ranked as dissimilar.
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
# Fine-tune the model with the loader/loss pair defined above.

epochs = 1
# Warm-up covers 10% of len(loader) * epochs.
warmup_steps = int(len(loader) * epochs * 0.1)
# NOTE(review): the output directory is named 'mpnet-mnr-squad2', although the model built
# above is 'nreimers/albert-small-v2' and the data are GitHub discussions — consider renaming.
model.fit(
    train_objectives = [(loader, loss)],
    epochs=epochs,
    warmup_steps = warmup_steps,
    output_path='mpnet-mnr-squad2',
    show_progress_bar=True
)



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/17 [00:00<?, ?it/s]

In [None]:
# Also write the raw PyTorch weights (state_dict) to "model.pth" in the working directory.
torch.save(model.state_dict(), "model.pth")

# Evaluate the model

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Evaluation frame: question, context and id of every discussion, with a fresh 0..n-1 index.
# Selecting the columns in one step replaces the row-by-row DataFrame.append loop, which
# copied the frame on every iteration and was removed in pandas 2.0.
eval_df = dataset_for_model[["question", "context", "id"]].reset_index(drop=True)


In [None]:
# Preview the first rows of the evaluation frame.
eval_df.head()

Unnamed: 0,question,context,id
0,Only Tensors created explicitly by the user (g...,"how can i solve this problem, my net is DB, d...",MDEwOkRpc2N1c3Npb24zNTE3NTEx
1,How to test in low version,There is no train.test () in the lower version...,MDEwOkRpc2N1c3Npb24zNDI2NzQy
2,Loading from checkpoints re-downloads pre-trai...,I am defining a simple multi-class BERT classi...,MDEwOkRpc2N1c3Npb24zNTQ4NzY1
3,Selecting one gpu from cli,"Hi, I have 4 gpus on my machine. I want to sel...",MDEwOkRpc2N1c3Npb24zNDIyMTky
4,CUDA OOM during validation of first epoch,"hi all,\nMy model validation code (see below) ...",D_kwDOCqWgoM4AONtA


# Remove duplicates from the dataset

In [None]:
# Corpus table: one row per distinct context (first occurrence wins), without the
# question column, and with 'con' appended to the id so that a context id differs
# from the id of the question it came from.
no_dupe = (
    eval_df
    .drop_duplicates(subset='context', keep='first')
    .drop(columns=['question'])
    .assign(id=lambda frame: frame['id'] + 'con')
)
no_dupe.head()

Unnamed: 0,context,id
0,"how can i solve this problem, my net is DB, d...",MDEwOkRpc2N1c3Npb24zNTE3NTExcon
1,There is no train.test () in the lower version...,MDEwOkRpc2N1c3Npb24zNDI2NzQycon
2,I am defining a simple multi-class BERT classi...,MDEwOkRpc2N1c3Npb24zNTQ4NzY1con
3,"Hi, I have 4 gpus on my machine. I want to sel...",MDEwOkRpc2N1c3Npb24zNDIyMTkycon
4,"hi all,\nMy model validation code (see below) ...",D_kwDOCqWgoM4AONtAcon


In [None]:
# Attach to every question row the id of its context. Both frames carry an `id` column,
# so the merge produces `id_x` (from eval_df) and `id_y` (from no_dupe, with the 'con' suffix).
# NOTE: this rebinds eval_df, so running the cell a second time merges the already merged frame.
eval_df  = eval_df.merge(no_dupe , how="inner" , on="context")


In [None]:
# Display the merged frame (question, context, id_x, id_y).
eval_df

Unnamed: 0,question,context,id_x,id_y
0,Only Tensors created explicitly by the user (g...,"how can i solve this problem, my net is DB, d...",MDEwOkRpc2N1c3Npb24zNTE3NTEx,MDEwOkRpc2N1c3Npb24zNTE3NTExcon
1,How to test in low version,There is no train.test () in the lower version...,MDEwOkRpc2N1c3Npb24zNDI2NzQy,MDEwOkRpc2N1c3Npb24zNDI2NzQycon
2,Loading from checkpoints re-downloads pre-trai...,I am defining a simple multi-class BERT classi...,MDEwOkRpc2N1c3Npb24zNTQ4NzY1,MDEwOkRpc2N1c3Npb24zNTQ4NzY1con
3,Selecting one gpu from cli,"Hi, I have 4 gpus on my machine. I want to sel...",MDEwOkRpc2N1c3Npb24zNDIyMTky,MDEwOkRpc2N1c3Npb24zNDIyMTkycon
4,CUDA OOM during validation of first epoch,"hi all,\nMy model validation code (see below) ...",D_kwDOCqWgoM4AONtA,D_kwDOCqWgoM4AONtAcon
...,...,...,...,...
423,Problems in Pruning,"Hello, I am trying to get pruning to work with...",MDEwOkRpc2N1c3Npb24zMzEzODI5,MDEwOkRpc2N1c3Npb24zMzEzODI5con
424,About `ModelCheckpoint` starting point,My ModelCheckpoint callback:\nckpt_callback = ...,D_kwDOCqWgoM4AO0kZ,D_kwDOCqWgoM4AO0kZcon
425,Multi-GPU Training GPU Usage,❓ Multi-GPU Training GPU Usage\nBefore asking:...,MDEwOkRpc2N1c3Npb244MjI2MA==,MDEwOkRpc2N1c3Npb244MjI2MA==con
426,Link arguments from Datamodule into init_args ...,Hey!\nI'm trying to use LightningArgumentParse...,D_kwDOCqWgoM4AOoRX,D_kwDOCqWgoM4AOoRXcon


# Map questions and contexts to their IDs

In [None]:
# Queries for the retrieval evaluator: question id -> question text
# (for a repeated id the last row wins).
ir_queries = dict(zip(eval_df['id_x'], eval_df['question']))
ir_queries

{'D_kwDOCqWgoM4AN-cL': 'ModelCheckpoint save nothing',
 'D_kwDOCqWgoM4AN-x1': 'Issue in fitting model and finding optimal learning rate parameter',
 'D_kwDOCqWgoM4AN16B': 'How to save/load only part of the weights in the model?',
 'D_kwDOCqWgoM4AN2Gd': 'Example on training with TPU does not run at all',
 'D_kwDOCqWgoM4AN49x': 'Exporting PyTorch Lightning model to ONNX format not working',
 'D_kwDOCqWgoM4AN4Of': 'How to save and load LightningModule whose input containing the pretrained moduel?',
 'D_kwDOCqWgoM4AN4Z4': 'How to disable logging temporarily?',
 'D_kwDOCqWgoM4AN5Lv': 'The call of training_step and validation_step .etc.',
 'D_kwDOCqWgoM4AN637': 'multiple on_train_epoch_start callbacks but only one on_train_epoch_end?',
 'D_kwDOCqWgoM4AN6JX': "'NeuralNetwork' object has no attribute 'log'",
 'D_kwDOCqWgoM4AN6mz': 'Model with best validation accuracy',
 'D_kwDOCqWgoM4AN6vA': 'Getting the test score after restoring a pretrained model',
 'D_kwDOCqWgoM4AN7kb': 'MisconfigurationEx

In [None]:
# Corpus for the retrieval evaluator: context id -> context text
# (for a repeated id the last row wins).
ir_corpus = dict(zip(eval_df['id_y'], eval_df['context']))
ir_corpus

{'D_kwDOCqWgoM4AN-cLcon': "I want to use ModelCheckpoint to save mode while training, however, nothing has been saved. The following is my code. I don't know what leads to this problem, any suggestions?",
 'D_kwDOCqWgoM4AN-x1con': 'following is the error: NotImplementedError: `val_dataloader` must be implemented to be used with the Lightning Trainer\n\nLOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n---------------------------------------------------------------------------\nNotImplementedError                       Traceback (most recent call last)\n<ipython-input-11-263e8be26564> in <module>()\n      3     tft,\n      4     train_dataloader=train_dataloader,\n----> 5     val_dataloaders=val_dataloader,\n      6 )\n\n11 frames\n/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/hooks.py in val_dataloader(self)\n    590             will have an argument ``dataloader_idx`` which matches the order here.\n    591         """\n--> 592         raise NotImplementedError("`val_dataloade

In [None]:
# Relevance judgements in the format {question_id: {set of context_ids}}.
# A question id that is connected to several context ids collects all of them.
ir_relevant_docs = {}
for question_id, context_id in zip(eval_df['id_x'], eval_df['id_y']):
    ir_relevant_docs.setdefault(question_id, set()).add(context_id)
ir_relevant_docs

{'D_kwDOCqWgoM4AN-cL': {'D_kwDOCqWgoM4AN-cLcon'},
 'D_kwDOCqWgoM4AN-x1': {'D_kwDOCqWgoM4AN-x1con'},
 'D_kwDOCqWgoM4AN16B': {'D_kwDOCqWgoM4AN16Bcon'},
 'D_kwDOCqWgoM4AN2Gd': {'D_kwDOCqWgoM4AN2Gdcon'},
 'D_kwDOCqWgoM4AN49x': {'D_kwDOCqWgoM4AN49xcon'},
 'D_kwDOCqWgoM4AN4Of': {'D_kwDOCqWgoM4AN4Ofcon'},
 'D_kwDOCqWgoM4AN4Z4': {'D_kwDOCqWgoM4AN4Z4con'},
 'D_kwDOCqWgoM4AN5Lv': {'D_kwDOCqWgoM4AN5Lvcon'},
 'D_kwDOCqWgoM4AN637': {'D_kwDOCqWgoM4AN637con'},
 'D_kwDOCqWgoM4AN6JX': {'D_kwDOCqWgoM4AN6JXcon'},
 'D_kwDOCqWgoM4AN6mz': {'D_kwDOCqWgoM4AN6mzcon'},
 'D_kwDOCqWgoM4AN6vA': {'D_kwDOCqWgoM4AN6vAcon'},
 'D_kwDOCqWgoM4AN7kb': {'D_kwDOCqWgoM4AN7kbcon'},
 'D_kwDOCqWgoM4AN88O': {'D_kwDOCqWgoM4AN88Ocon'},
 'D_kwDOCqWgoM4AN8UH': {'D_kwDOCqWgoM4AN8UHcon'},
 'D_kwDOCqWgoM4AN_7D': {'D_kwDOCqWgoM4AN_7Dcon'},
 'D_kwDOCqWgoM4AN_8r': {'D_kwDOCqWgoM4AN_8rcon'},
 'D_kwDOCqWgoM4AN_cS': {'D_kwDOCqWgoM4AN_cScon'},
 'D_kwDOCqWgoM4ANn3D': {'D_kwDOCqWgoM4ANn3Dcon'},
 'D_kwDOCqWgoM4ANo9F': {'D_kwDOCqWgoM4ANo9Fcon'},


In [None]:
# Build the retrieval evaluator from the three mappings prepared above:
# queries, corpus, and the relevant context ids per question id.
ir_eval = InformationRetrievalEvaluator(
    ir_queries, ir_corpus, ir_relevant_docs
)

In [None]:
# Run the evaluator on the fine-tuned model; the cell output is the score it returns.
# NOTE(review): the queries and corpus come from the same discussions the model was
# trained on, so this is presumably not a held-out score — confirm.
ir_eval(model)

0.3492184893160091

# Encoding Context

In [None]:
# One record per discussion: id, embedding of the context (as a plain Python list),
# answer text and context text. The records are collected first and the DataFrame is
# built once, instead of calling DataFrame.append for every row (quadratic, and removed
# in pandas 2.0). Contexts are still encoded one at a time so the embeddings are unchanged.
eval_dv = pd.DataFrame([
    {
        "id": row["id"],
        "encoding": model.encode(row["context"]).tolist(),
        "answer": row["answer"],
        "context": row["context"],
    }
    for _, row in dataset_for_model.iterrows()
])
  


In [None]:
# Inspect the encoded frame: first rows and (rows, columns).
print(eval_dv.head())
print(eval_dv.shape)

# Initializing the Index

In [None]:
# Never commit credentials to the notebook: the key is read from the PINECONE_API_KEY
# environment variable and, when that is unset or empty, requested interactively
# (getpass does not echo the input).
# NOTE: a key was previously hard-coded in this cell; it should be revoked.
import os
from getpass import getpass

API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

pinecone.init(api_key = API_KEY , environment='us-west1-gcp')


In [None]:
# Create the "github-question-answer" index only when it is not listed yet.
# Its dimension is taken from the model's sentence-embedding size and the metric is cosine.
if "github-question-answer" not in pinecone.list_indexes():
  pinecone.create_index(
      name="github-question-answer" , dimension=model.get_sentence_embedding_dimension(), metric='cosine'
    )

In [None]:
# Show the embedding size that was used as the index dimension.
model.get_sentence_embedding_dimension()

768

# Populate the index

In [None]:
# Handle to the 'github-question-answer' index, used below for upserts and queries.
index = pinecone.Index('github-question-answer')

In [None]:
from tqdm.auto import tqdm  # progress bar

# (id, vector, metadata) triples; the metadata text holds the answer followed by the
# first 150 characters of the context.
upserts = [
    (rec['id'], rec['encoding'], {'text': "Ans:"+rec["answer"]+"\nContext:"+rec["context"][:150]})
    for _, rec in eval_dv.iterrows()
]
print(len(eval_dv))
# Upsert in chunks of 50; slicing stops at the end of the list, so the last chunk
# is simply shorter.
for start in tqdm(range(0, len(upserts), 50)):
    index.upsert(vectors=upserts[start:start + 50])

  0%|          | 0/9 [00:00<?, ?it/s]

# Making Query

In [None]:
# Encode the query as a one-element batch, so xq is a list containing one embedding list.
query= "What are hooks in pytorch lightning?"
xq = model.encode([query]).tolist()

In [None]:
# Ask the index for the 3 closest matches, including their stored metadata, and pretty-print them.
# NOTE(review): the saved output of this cell is a NameError, so one of the names used here
# (index, xq, pprint) was presumably undefined when it last ran — re-run the cells above first.
xc = index.query(xq , top_k = 3 , include_metadata = True)
pprint(xc)

NameError: ignored

# Citation

In [None]:
# BibTeX entry for Sentence-BERT, kept as comments so that this code cell is valid Python.
# @inproceedings{reimers-2019-sentence-bert,
#     title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
#     author = "Reimers, Nils and Gurevych, Iryna",
#     booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
#     month = "11",
#     year = "2019",
#     publisher = "Association for Computational Linguistics",
#     url = "https://arxiv.org/abs/1908.10084",
# }

SyntaxError: ignored