<a href="https://colab.research.google.com/github/Teasotea/DialogSystem/blob/main/ConversationalAI_improved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 26.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [2]:
!pip install nltk



In [25]:
import numpy as np
import pandas as pd
import time
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import nltk
from nltk.stem.lancaster import LancasterStemmer
import requests
import re

In [4]:
stemmer = LancasterStemmer()

# Part I: Greeting Classification

In [5]:
training_data = []
greetings = ['hi', "hola", 'hey', 'hello','morning', 'evening', 'good day', 'good morning', 'greetings', 'howdy', 'welcome', 'bonjour',
             'buenas noches', 'buenos dias', 'salutation', 'salut', 'hail', 'salaam', 'aloha', 'ciao', 'good wishes', 'respects', 'high-five',
             'aloha', 'yoo-hoo', 'yawp', 'psst', 'oh', 'toast', 'ave', "how is it going?", 'yo', 'hi there']
# other = ['face','wisecrack','care','thick','reference','deserve','engine','cry','mud','worth',
#          'railroad','permanent','throne','tradition','loan','employ','resource','privilege','parachute',
#          'rent','of','characteristic','coin','teenager','established','reveal','bad','undress','revoke','ward']
for i in greetings:
  training_data.append({"class":"greeting", "sentence":i})
# for i in other:
#   training_data.append({"class":"other", "sentence":i})
training_data.append({"class":"other", "sentence":'word'})
  
greet_df = pd.DataFrame(training_data)
greet_df

Unnamed: 0,class,sentence
0,greeting,hi
1,greeting,hola
2,greeting,hey
3,greeting,hello
4,greeting,morning
5,greeting,evening
6,greeting,good day
7,greeting,good morning
8,greeting,greetings
9,greeting,howdy


In [6]:
nltk.download('punkt')
corpus_words = {}
class_words = {}
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
classes

['other', 'greeting']

In [8]:
for data in training_data:
    for word in nltk.word_tokenize(data['sentence']):
        # ignore a some things
        if word not in ["?", "'s"]:
            # stem and lowercase each word
            stemmed_word = stemmer.stem(word.lower())
            # have we not seen this word already?
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1

            # add the word to our words in class list
            class_words[data['class']].extend([stemmed_word])

# we now have each stemmed word and the number of occurances of the word in our training corpus (the word's commonality)
print ("Corpus words and counts: %s \n" % corpus_words)
# also we have all words in each class
print ("Class words: %s" % class_words)

Corpus words and counts: {'hi': 2, 'hol': 1, 'hey': 1, 'hello': 1, 'morn': 2, 'ev': 1, 'good': 3, 'day': 1, 'greet': 1, 'howdy': 1, 'welcom': 1, 'bonjo': 1, 'buena': 1, 'noch': 1, 'bueno': 1, 'dia': 1, 'salut': 2, 'hail': 1, 'salaam': 1, 'aloh': 2, 'ciao': 1, 'wish': 1, 'respect': 1, 'high-five': 1, 'yoo-hoo': 1, 'yawp': 1, 'psst': 1, 'oh': 1, 'toast': 1, 'av': 1, 'how': 1, 'is': 1, 'it': 1, 'going': 1, 'yo': 1, 'ther': 1, 'word': 1} 

Class words: {'other': ['word'], 'greeting': ['hi', 'hol', 'hey', 'hello', 'morn', 'ev', 'good', 'day', 'good', 'morn', 'greet', 'howdy', 'welcom', 'bonjo', 'buena', 'noch', 'bueno', 'dia', 'salut', 'salut', 'hail', 'salaam', 'aloh', 'ciao', 'good', 'wish', 'respect', 'high-five', 'aloh', 'yoo-hoo', 'yawp', 'psst', 'oh', 'toast', 'av', 'how', 'is', 'it', 'going', 'yo', 'hi', 'ther']}


In [9]:
# calculate a score for a given class
def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with same weight
            score += 1
            
            if show_details:
                print ("   match: %s" % stemmer.stem(word.lower() ))
    return score

In [10]:
# we can now calculate a score for a new sentence
sentence = "good day for us to have lunch?"

# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s  Score: %s \n" % (c, calculate_class_score(sentence, c)))

Class: other  Score: 0 

   match: good
   match: day
Class: greeting  Score: 2 



In [11]:
# calculate a score for a given class taking into account word commonality
def calculate_class_score_commonality(sentence, class_name, show_details=True):
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in any of our classes
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # treat each word with relative weight
            score += (1 / corpus_words[stemmer.stem(word.lower())])

            if show_details:
                print ("   match: %s (%s)" % (stemmer.stem(word.lower()), 1 / corpus_words[stemmer.stem(word.lower())]))
    return score

In [12]:
# now we can find the class with the highest score
for c in class_words.keys():
    print ("Class: %s  Score: %s \n" % (c, calculate_class_score_commonality(sentence, c)))

Class: other  Score: 0 

   match: good (0.3333333333333333)
   match: day (1.0)
Class: greeting  Score: 1.3333333333333333 



In [13]:
# return the class with highest score for sentence
def classify(sentence):
    high_class = 'other'
    high_score = 0
    # loop through our classes
    for c in class_words.keys():
        # calculate score of sentence for each class
        score = calculate_class_score_commonality(sentence, c, show_details=False)
        # keep track of highest score
        if score > high_score:
            high_class = c
            high_score = score

    return high_class

In [14]:
classify("oh! are u a human?")

'greeting'

# Part II: Question Answering

In [15]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.3 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 51.1 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 38.8 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 59.3 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 55.0 MB/s 
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py

In [16]:
import datasets

# [ds for ds in datasets.list_datasets() if 'ml' in ds.lower()]

In [17]:
qa_ds = datasets.load_dataset('squad', streaming = False)
qa_ds

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
qa_ds['train'].description

'Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n'

In [None]:
print(qa_ds['train'].dataset_size)
qa_ds['train'].features

89846964


{'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
 'context': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None)}

In [18]:
qa_ds['train'].to_pandas().head()

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [None]:
from transformers import BertTokenizer
b_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
qa_ds['train'] = qa_ds['train'].map(
    lambda x: b_tokenizer(
        x['question'], x['context'], max_length = 512, padding = 'max_length', truncation = True
    ), batched = True, batch_size = 32
)

  0%|          | 0/2738 [00:00<?, ?ba/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

# QA Dataset 

In [20]:
!pip install jupyterlab
!pip install python-Levenshtein
!pip install bert-serving-server bert-serving-client

Collecting jupyterlab
  Downloading jupyterlab-3.3.4-py3-none-any.whl (8.7 MB)
[K     |████████████████████████████████| 8.7 MB 5.0 MB/s 
[?25hCollecting nbclassic~=0.2
  Downloading nbclassic-0.3.7-py3-none-any.whl (13 kB)
Collecting jupyter-server~=1.4
  Downloading jupyter_server-1.16.0-py3-none-any.whl (343 kB)
[K     |████████████████████████████████| 343 kB 63.5 MB/s 
Collecting jupyterlab-server~=2.10
  Downloading jupyterlab_server-2.13.0-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.8 MB/s 
[?25hCollecting tornado>=6.1.0
  Downloading tornado-6.1-cp37-cp37m-manylinux2010_x86_64.whl (428 kB)
[K     |████████████████████████████████| 428 kB 70.8 MB/s 
Collecting anyio>=3.1.0
  Downloading anyio-3.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 7.3 MB/s 
[?25hCollecting nbconvert>=6.4.4
  Downloading nbconvert-6.5.0-py3-none-any.whl (561 kB)
[K     |████████████████████████████████| 561 kB 56.7 MB/s 
Collecting ju

Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[?25l[K     |██████▌                         | 10 kB 24.4 MB/s eta 0:00:01[K     |█████████████                   | 20 kB 15.0 MB/s eta 0:00:01[K     |███████████████████▌            | 30 kB 10.8 MB/s eta 0:00:01[K     |██████████████████████████      | 40 kB 9.4 MB/s eta 0:00:01[K     |████████████████████████████████| 50 kB 3.2 MB/s 
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25l[?25hdone
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149874 sha256=83106a7483780c44db1b84e64ec01446ddc3dcc7d47678222fdbd01c5633e395
  Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.2
Coll

In [34]:
qa_data = pd.read_csv('https://raw.githubusercontent.com/Kizuna-Cheng/Data_Science_Interviews_NLP/main/data.csv')
qa_data.head(7)

Unnamed: 0,Category,Questions,Answers
0,Linear Regression,What are the assumptions required for linear r...,There are four assumptions associated with a l...
1,Statistics,What is collinearity? What is multicollinearit...,Collinearity is a linear association between t...
2,Linear Regression,What are the drawbacks of a linear model?\n,There are a couple of drawbacks of a linear mo...
3,Linear Regression,What are ridge and lasso regression and what a...,Both L1 and L2 regularization are methods used...
4,KNN,How does K-Nearest Neighbor work?,K-Nearest Neighbors is a classification techni...
5,KMeans,How can you select k for k means?,"You can use the elbow method, which is a popul..."
6,Naive Bayes,Why is Naive Bayes “naive”?,Naive Bayes is naive because it holds a strong...


In [36]:
qa_data.Questions[:10].tolist()

['What are the assumptions required for linear regression? What if some of these assumptions are violated?',
 'What is collinearity? What is multicollinearity? How do you deal with it?\n',
 'What are the drawbacks of a linear model?\n',
 'What are ridge and lasso regression and what are the differences between them?',
 'How does K-Nearest Neighbor work?',
 'How can you select k for k means?',
 'Why is Naive Bayes “naive”?',
 'What are the support vectors in SVM?',
 'What is pruning in decision trees?',
 'What are random forests? Why is Naive Bayes better?']

In [37]:
test_data = [
             'What does linear regression stands for?',
 'What is the differencebetween collinearity and multicollinearity?',
 'What are the cons of using a linear model?\n',
 'What are ridge and lasso regression?',
 'How does K-Nearest Neighbor work?',
 'How to select k for k means?',
 'Why is Naive Bayes “naive”?',
 'When should I use SVM?',
'What is pruning in decision trees?',
 'What are random forests? Why is Naive Bayes better?']


# QA Baseline

In [38]:
def getResults(questions, fn):
    def getResult(q):
        answer, score, prediction = fn(q)
        return [q, prediction, answer, score]
    return pd.DataFrame(list(map(getResult, questions)), columns=["Q", "Prediction", "A", "Score"])
data

{'class': 'other', 'sentence': 'word'}

In [41]:
def getNaiveAnswer(q):
    row = qa_data.loc[qa_data['Questions'].str.contains(re.sub(r"[^\w'\s)]+", "", q),case=False)]
    if len(row) > 0:
        return row["Answers"].values[0], 1, row["Questions"].values[0]
    else: return "Sorry, I didn't get you", 0, ""
print(getNaiveAnswer('How does K-Nearest Neighbor work?'))
getResults(test_data, getNaiveAnswer)

("Sorry, I didn't get you", 0, '')


Unnamed: 0,Q,Prediction,A,Score
0,What does linear regression stands for?,,"Sorry, I didn't get you",0
1,What is the differencebetween collinearity and...,,"Sorry, I didn't get you",0
2,What are the cons of using a linear model?\n,,"Sorry, I didn't get you",0
3,What are ridge and lasso regression?,What are ridge and lasso regression and what a...,Both L1 and L2 regularization are methods used...,1
4,How does K-Nearest Neighbor work?,,"Sorry, I didn't get you",0
5,How to select k for k means?,,"Sorry, I didn't get you",0
6,Why is Naive Bayes “naive”?,,"Sorry, I didn't get you",0
7,When should I use SVM?,,"Sorry, I didn't get you",0
8,What is pruning in decision trees?,What is pruning in decision trees?,Pruning is a technique in machine learning and...,1
9,What are random forests? Why is Naive Bayes be...,,"Sorry, I didn't get you",0


# DistilBERT model from HuggingFace

In [45]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

In [46]:
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering

import tensorflow as tf

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors="tf")
outputs = model(**inputs)
answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'[CLS] who was'

### Pretraining BERT using MLM and NSP

In [None]:
import random
from tqdm import tqdm
from transformers import AdamW
from transformers import BertForPreTraining #for MLM and NSP

In [None]:
bert_qa_model = BertForPreTraining.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
url = 'https://raw.githubusercontent.com/Teasotea/DialogSystem/main/data/ml_info.txt'
url2 = 'https://github.com/jamescalam/transformers/blob/main/data/text/meditations/clean.txt'
text_data = requests.get(url)
text_data

<Response [200]>

In [None]:
text_data_text = text_data.text.split('\n')
text_data_text[:3]

['Linear Regression is a supervised machine learning algorithm where the predicted output is continuous and has a constant slope. It’s used to predict values within a continuous range, (e.g. sales, price) rather than trying to classify them into categories (e.g. cat, dog). ',
 'There are two main types: Simple linear regression uses traditional slope-intercept form, where 𝑚',
 'and 𝑏 are the variables our algorithm will try to “learn” to produce the most accurate predictions. 𝑥 represents our input data and 𝑦represents our prediction.']

In [None]:
bag = [sent for p in text_data_text for sent in p.split(',') if sent != '']
bag_size = len(bag)

In [None]:
sent_a, sent_b, label = [], [], []
for p in text_data_text:
  sents = [
           sent for sent in p.split('.') if sent != ''
  ]
  num_sent = len(sents)
  if num_sent>1:
    start = random.randint(0, num_sent-2)
    sent_a.append(sents[start])
    if random.random() > 0.5:
      sent_b.append(sents[start+1])
      label.append(0)
    else:
      sent_b.append(bag[random.randint(0, bag_size - 1)])
      label.append(1)
      

In [None]:
inputs = b_tokenizer(sent_a, sent_b, return_tensors = 'pt', max_length = 512, truncation = True, padding = 'max_length')

In [None]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
inputs['next_sentence_label'] = torch.LongTensor([label]).T
inputs['next_sentence_label'][:10]

tensor([[1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [1],
        [0]])

In [None]:
inputs['label'] = inputs.input_ids.detach().clone()

In [None]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids !=  101) *  (inputs.input_ids !=  102) *  (inputs.input_ids !=  0)

In [None]:
for i in range(inputs.input_ids.shape[0]):
  sel = torch.flatten(mask_arr[i].nonzero()).tolist()
  inputs.input_ids[i, sel] = 103

In [None]:
class PrepareDataset(torch.utils.data.Dataset):
  def __init__(self, encodings): 
    self.encodings = encodings
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids)

In [None]:
data = PrepareDataset(inputs)

In [None]:
loader = torch.utils.data.DataLoader(data, batch_size = 16, shuffle = True)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
bert_qa_model.to(device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
bert_qa_model.train()

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
optim = AdamW(bert_qa_model.parameters(), lr=5e-5)



In [None]:
for epoch in range(2):
  loop = tqdm(loader, leave = True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    token_type_ids = batch['token_type_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    next_sentence_label = batch['next_sentence_label'].to(device)
    labels = batch['labels'].to(device)
    outputs = bert_qa_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,next_sentence_label=next_sentence_label)
    loss = outputs.loss
    loss.backward()
    optim.step()
    loop.set_description(f'Epoch{epoch}')
    loop.set_postfix(loss=loss.item())

  """
  0%|          | 0/50 [00:00<?, ?it/s]


KeyError: ignored

# Part III: Natural Language Generation

In [42]:
# checkpoint 
checkpoint = "microsoft/DialoGPT-medium"
# download and cache tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# download and cache pre-trained model
modelNLG = AutoModelForCausalLM.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/823M [00:00<?, ?B/s]

# Part IV: Chatbot Development

In [43]:
#change the code later, make it better
class ChatBot():
    def __init__(self):
        # once chat starts, the history will be stored for chat continuity
        self.chat_history_ids = None
        # make input ids global to use them anywhere within the object
        self.bot_input_ids = None
        # a flag to check whether to end the conversation
        self.end_chat = False
        # greet while starting
        self.welcome()
        self.is_greeting = False
        self.is_question_from_context = False
        self.answer = ''
        
    def welcome(self):
        print("Initializing ChatBot ...")
        # some time to get user ready
        time.sleep(2)
        print('Type "bye" or "quit" or "exit" to end chat \n')
        # give time to read what has been printed
        time.sleep(3)

        
    def user_input(self):
        # receive input from user
        text = input("User    >> ")
        # end conversation if user wishes so
        if text.lower().strip() in ['bye', 'quit', 'exit']:
            # turn flag on 
            self.end_chat=True
            # a closing comment
            print('ChatBot >>  See you soon! Bye!')
            time.sleep(1)
            print('\nQuitting ChatBot ...')
        else:
            # continue chat, preprocess input text
            # encode the new user input, add the eos_token and return a tensor in Pytorch
            if classify(text) == 'other':
              self.answer = getNaiveAnswer(text)[0]
              if self.answer != "Sorry, I didn't get you":
                self.is_question_from_context = True
              else: 
                self.new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, \
                                                       return_tensors='pt')
                self.answer = ''
            else: 
              self.is_greeting = True

    def bot_answer(self):
      print("ChatBot >>  " + self.answer)
      self.is_question_from_context = False
      self.answer = ''

    def bot_greet(self):
        greeting = np.random.choice([
            "Welcome, I am ChatBot, here for your kind service",
            "Hey, Great day! I am your virtual assistant",
            "Hello, it's my pleasure meeting you",
            "Hi, I am a ChatBot. Let's chat!"
        ])
        print("ChatBot >>  " + greeting)
        self.is_greeting = False

    def bot_response(self):
        # append the new user input tokens to the chat history
        # if chat has already begun
        if self.chat_history_ids is not None:
            self.bot_input_ids = torch.cat([self.chat_history_ids, self.new_user_input_ids], dim=-1) 
        else:
            # if first entry, initialize bot_input_ids
            self.bot_input_ids = self.new_user_input_ids
        
        # define the new chat_history_ids based on the preceding chats
        # generated a response while limiting the total chat history to 1000 tokens, 
        self.chat_history_ids = modelNLG.generate(self.bot_input_ids, max_length=1000, \
                                               pad_token_id=tokenizer.eos_token_id)
            
        # last ouput tokens from bot
        response = tokenizer.decode(self.chat_history_ids[:, self.bot_input_ids.shape[-1]:][0], \
                               skip_special_tokens=True)
        # in case, bot fails to answer
        if response == "":
            response = self.random_response()
        # print bot response
        print('ChatBot >>  '+ response)
        
    def random_response(self):
        i = -1
        response = tokenizer.decode(self.chat_history_ids[:, self.bot_input_ids.shape[i]:][0], \
                               skip_special_tokens=True)
        # iterate over history backwards to find the last token
        while response == '':
            i = i-1
            response = tokenizer.decode(self.chat_history_ids[:, self.bot_input_ids.shape[i]:][0], \
                               skip_special_tokens=True)
        # if it is a question, answer suitably
        if response.strip() == '?':
            reply = np.random.choice(["I don't know", 
                                     "I am not sure"])
        # not a question? answer suitably
        else:
            reply = np.random.choice(["Great", 
                                      "Fine. What's up?", 
                                      "Okay"
                                     ])
        return reply

In [44]:
# build a ChatBot object
bot = ChatBot()
# start chatting
while True:
    # receive user input
    bot.user_input()
    # check whether to end chat
    if bot.end_chat:
        break
    # output bot response
    if bot.is_greeting == True:
       bot.bot_greet() 
    elif bot.is_question_from_context == True:
       bot.bot_answer()
    else: bot.bot_response()   

Initializing ChatBot ...
Type "bye" or "quit" or "exit" to end chat 

User    >> Yo, hello, my dear bot!
ChatBot >>  Hello, it's my pleasure meeting you
User    >> What are ridge and lasso regression?
ChatBot >>  Both L1 and L2 regularization are methods used to reduce the overfitting of training data. Least Squares minimizes the sum of the squared residuals, which can result in low bias but high variance. L2 Regularization, also called ridge regression, minimizes the sum of the squared residuals plus lambda times the slope squared. This additional term is called the Ridge Regression Penalty. This increases the bias of the model, making the fit worse on the training data, but also decreases the variance. If you take the ridge regression penalty and replace it with the absolute value of the slope, then you get Lasso regression or L1 regularization. L2 is less robust but has a stable solution and always one solution. L1 is more robust but has an unstable solution and can possibly have mu