<a href="https://colab.research.google.com/github/Preeti-ing/TOEIC-READING-MODEL/blob/main/TOEIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import json
# JSON text is UTF-8 by specification; pin the encoding so that loading the
# question bank does not depend on the platform's default locale.
with open('/content/toeic_test.json', encoding='utf-8') as input_json:
    data = json.load(input_json)
# Show the record stored under key '1' to confirm the schema of one question.
data['1']




{'1': 'suffer',
 '2': 'suffers',
 '3': 'suffering',
 '4': 'suffered',
 'anwser': 'suffered',
 'question': 'The assets of Marble Faun Publishing Company ___ last quarter when one of their main local distributors went out of business.'}

In [5]:
# Drop the ids and keep just the question records, in file order.
question_infors = list(data.values())

# Display the first record as a sanity check.
question_infors[0]

{'1': 'suffer',
 '2': 'suffers',
 '3': 'suffering',
 '4': 'suffered',
 'anwser': 'suffered',
 'question': 'The assets of Marble Faun Publishing Company ___ last quarter when one of their main local distributors went out of business.'}

In [6]:
!pip install -U pytorch-pretrained-bert;

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch-pretrained-bert)
  Downloading boto3-1.34.4-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting botocore<1.35.0,>=1.34.4 (from boto3->pytorch-pretrained-bert)
  Downloading botocore-1.34.4-py3-none-any.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3->pytorch-pretrained-bert)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.10.0,>=0.9.0 (from boto3->pytorch-pretrained-bert)
  Downloading s3transfer-0.9.0-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [7]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [8]:
class TOEICBert:
    """Answer TOEIC fill-in-the-blank questions with a pretrained BERT masked LM.

    The blank in a question is swapped for ``[MASK]``; every answer candidate is
    scored from the model output at that position and the best-scoring
    candidate is returned. The pretrained weights are used as-is (no
    fine-tuning), so the model is kept in eval mode.

    Args:
        bertmodel: Checkpoint name or path handed to ``from_pretrained``,
            e.g. ``'bert-large-uncased'`` or ``'bert-large-cased'``.
    """

    def __init__(self, bertmodel):
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        self.bertmodel = bertmodel
        # Tokenizer and model are loaded from the same checkpoint so that the
        # token ids produced by one are meaningful to the other.
        self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
        self.model = BertForMaskedLM.from_pretrained(self.bertmodel).to(self.device)
        # Inference only: no fine-tuning happens here, so switch to eval mode.
        self.model.eval()

    def get_score(self, question_tensors, segment_tensors, masked_index, candidate,
                  predictions=None):
        """Score a single answer candidate at the masked position.

        Args:
            question_tensors: Token-id tensor of the masked question, as built
                by ``predict`` (one sequence wrapped in a batch dimension).
            segment_tensors: Segment-id tensor matching ``question_tensors``.
            masked_index: Position of ``[MASK]`` in the token sequence.
            candidate: Candidate answer text.
            predictions: Optional model output for this question. When omitted
                the model is run here; ``predict`` passes it in so that one
                forward pass is shared by all candidates.

        Returns:
            float: Mean of the model's scores at ``masked_index`` over the
            candidate's tokens (a candidate may tokenize to several pieces).
            These are the raw model outputs; no softmax is applied.
        """
        if predictions is None:
            # No gradients are needed for scoring; skip building the autograd graph.
            with torch.no_grad():
                predictions = self.model(question_tensors, segment_tensors)
        # Map the candidate text to vocabulary ids so it can index the output.
        candidate_tokens = self.tokenizer.tokenize(candidate)
        candidate_ids = self.tokenizer.convert_tokens_to_ids(candidate_tokens)
        return predictions[0, masked_index, candidate_ids].mean().item()

    def predict(self, row):
        """Pick the best of the four answer candidates for one question.

        Args:
            row: Mapping with a ``'question'`` string containing a blank written
                as a run of underscores, and the candidates under keys
                ``'1'``, ``'2'``, ``'3'`` and ``'4'``.

        Returns:
            The candidate string with the highest score.

        Raises:
            ValueError: If no blank can be located in the tokenized question.
        """
        # Collapse a blank of any length ('___', '_____', ...) into a single
        # '_' so it ends up as exactly one token that can be masked.
        text = row['question']
        while '__' in text:
            text = text.replace('__', '_')
        question_tokens = self.tokenizer.tokenize(text)
        if '_' not in question_tokens:
            raise ValueError(
                "No blank found in question; mark it with underscores, e.g. '___'"
            )
        masked_index = question_tokens.index('_')
        # Replace the blank with the token the masked LM is asked to fill in.
        question_tokens[masked_index] = '[MASK]'
        # NOTE(review): the sequence is fed without [CLS]/[SEP] markers, which
        # BERT inputs conventionally carry. Adding them would shift
        # masked_index by one and may change predictions - confirm before changing.
        segment_ids = [0] * len(question_tokens)
        segment_tensors = torch.tensor([segment_ids]).to(self.device)
        question_ids = self.tokenizer.convert_tokens_to_ids(question_tokens)
        question_tensors = torch.tensor([question_ids]).to(self.device)
        # The model output does not depend on the candidate, so run the
        # forward pass once and reuse it for every candidate.
        with torch.no_grad():
            predictions = self.model(question_tensors, segment_tensors)
        candidates = [row['1'], row['2'], row['3'], row['4']]
        # One score per candidate, in the same order as `candidates`.
        predict_tensor = torch.tensor([
            self.get_score(question_tensors, segment_tensors, masked_index,
                           candidate, predictions)
            for candidate in candidates
        ])
        # The highest raw score wins.
        predict_idx = torch.argmax(predict_tensor).item()
        return candidates[predict_idx]





In [9]:
# Pretrained checkpoint to load: the large, lower-cased BERT variant.
Bertmodel = 'bert-large-uncased'
# Build the question-answering wrapper around that checkpoint.
model = TOEICBert(bertmodel=Bertmodel)

100%|██████████| 231508/231508 [00:00<00:00, 1534388.79B/s]
100%|██████████| 1248501532/1248501532 [00:36<00:00, 34412764.40B/s]


Play with your own TOEIC questions

In [12]:
def Answer_toeic(question):
    """Print the model's answer for one question and whether it is correct.

    Args:
        question: Mapping accepted by ``model.predict`` that also carries the
            expected answer under the key ``'anwser'`` (spelling as in the data).
    """
    predicted = model.predict(question)
    expected = question['anwser']
    # The model's pick is reported in both cases; only the verdict differs.
    print(f'The BertModel answer: {predicted}')
    verdict = 'This is right answer' if predicted == expected else 'This is wrong answer'
    print(verdict)
# Sample TOEIC question: four candidate answers, the expected answer, and the
# sentence containing the blank to fill in.
question = {
    '1': 'different',
    '2': 'differently',
    '3': 'difference',
    '4': 'differences',
    'anwser': 'different',
    'question': 'Matos Realty has developed two ___ methods of identifying undervalued properties.',
}

# Ask the model and report whether its pick matches the expected answer.
Answer_toeic(question)

The BertModel answer: different
This is right answer
