In [1]:
import torch
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [2]:
import pandas as pd 

# Load the question/answer pairs; the path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('100_Unique_QA_Dataset.csv')
# Preview the first rows to sanity-check the `question` and `answer` columns.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


1. Tokenize the words.
2. Form the vocab.
3. Convert words to numerical indices.

In [3]:
def tokenize(text):
    """Split ``text`` into lowercase, whitespace-delimited tokens.

    Question marks and apostrophes are removed before splitting; every other
    character (hyphens, digits, other punctuation) stays part of its token.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings (empty when ``text`` has no tokens).
    """
    # A single translate() pass deletes both characters the tokenizer drops.
    stripped = text.lower().translate(str.maketrans('', '', "?'"))
    return stripped.split()

In [4]:
tokenize("What is the capital of Germany?")

['what', 'is', 'the', 'capital', 'of', 'germany']

In [5]:
# Vocabulary: maps each token to a unique integer index.
vocab = {'<UNK>' : 0}  # index 0 is reserved for unknown words

def build_vocab(row):
    """Add every unseen token from one row's question and answer to ``vocab``.

    Args:
        row: A mapping-like record supporting ``row['question']`` and
            ``row['answer']`` (e.g. a DataFrame row).

    Returns:
        None. The module-level ``vocab`` dict is updated in place.
    """
    row_tokens = tokenize(row['question']) + tokenize(row['answer'])
    for tok in row_tokens:
        # len(vocab) is evaluated before the insert, so each new token receives
        # the next consecutive index; tokens already present are left untouched.
        vocab.setdefault(tok, len(vocab))

In [6]:
# Run build_vocab on every row purely for its side effect on `vocab`; the
# function returns nothing, so the resulting Series contains only None.
df.apply(func = build_vocab , axis = 1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [7]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [8]:
len(vocab)

324

In [9]:
# Map raw text to vocabulary indices.
def text_to_indices(text , vocab):
    """Tokenize ``text`` and return the vocabulary index of each token.

    Args:
        text: The string to convert.
        vocab: Mapping from token to integer index; it must contain '<UNK>'
            whenever ``text`` holds a token that is not in the mapping.

    Returns:
        A list of indices, one per token, in token order. Tokens missing from
        ``vocab`` are represented by the index stored under '<UNK>'.
    """
    return [
        vocab[tok] if tok in vocab else vocab['<UNK>']
        for tok in tokenize(text)
    ]

In [10]:
text_to_indices("Who is tipto?" , vocab)

[10, 2, 0]

In [11]:
from torch.utils.data import Dataset , DataLoader

In [12]:
class QADataset(Dataset):
    """Dataset yielding one (question, answer) pair of index tensors per row.

    Args:
        df: DataFrame with 'question' and 'answer' text columns.
        vocab: Mapping from token to integer index used to encode the text.
    """

    def __init__(self , df , vocab):
        self.vocab = vocab
        self.df = df

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.df)

    def __getitem__(self , index):
        # Fetch the row once, then encode both text fields with the same vocab.
        row = self.df.iloc[index]
        question_ids = text_to_indices(row['question'] , self.vocab)
        answer_ids = text_to_indices(row['answer'] , self.vocab)
        return torch.tensor(question_ids) , torch.tensor(answer_ids)

In [13]:
dataset = QADataset(df , vocab)

In [14]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [15]:
# batch_size = 1 yields a single question/answer pair per step, so sequences of
# different lengths never have to be padded into a common batch; shuffle = True
# reorders the samples every epoch; pin_memory = True asks the loader to place
# batches in page-locked host memory.
dataloader = DataLoader(dataset = dataset , batch_size = 1, shuffle = True , pin_memory = True)

In [16]:
for q , a in dataloader: 
    print(q , a)

tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([[121]])
tensor([[ 10, 140,   3, 141, 142,  12, 143,  83,   3, 144]]) tensor([[145]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 198]]) tensor([[199]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([[72]])
tensor([[ 42, 250, 251, 118, 252, 253]]) tensor([[254]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([[298]])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([[321]])
tensor([[ 10,   2,  62,  63,   3, 283,   5, 284]]) tensor([[285]])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([[54]])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([[188]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[10, 96,  3, 97]]) tensor([[98]])
tensor([[  1,   2,   3

In [22]:
# make the RNN architecture 
import torch.nn as nn 

class SimpleRNN(nn.Module):
    """Single-layer RNN that predicts one vocabulary token for a question.

    Pipeline: token indices -> embedding -> RNN -> linear layer over the
    final hidden state, producing one logit per vocabulary entry.

    Args:
        vocab_size: Number of entries in the vocabulary; used both as the
            embedding table size and as the number of output logits.
        embedding_dim: Size of each token embedding (default 50).
        hidden_size: Size of the RNN hidden state (default 64).
    """

    def __init__(self , vocab_size , embedding_dim = 50 , hidden_size = 64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size , embedding_dim = embedding_dim)
        self.rnn = nn.RNN(embedding_dim , hidden_size , batch_first = True)
        self.fc = nn.Linear(hidden_size , vocab_size)

    def forward(self , question):
        """Return vocabulary logits for a batch of index sequences.

        Args:
            question: Integer tensor of token indices, shaped (batch, seq_len)
                because the RNN is built with ``batch_first = True``.

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalized scores.
        """
        embedded_question = self.embedding(question)
        # The per-step outputs are not needed; only the final hidden state is.
        _ , final_hidden_state = self.rnn(embedded_question)
        # The final state has a leading dimension of size 1 for this
        # single-layer, unidirectional RNN; squeeze(0) removes it so the
        # prediction is based only on the last state.
        return self.fc(final_hidden_state.squeeze(0))

In [23]:
# Step size handed to the optimizer.
learning_rate = 0.001
# Number of full passes over the training data.
epochs = 50

In [24]:
# Build the network with one output logit per vocabulary entry, then move its
# parameters to the selected device.
model = SimpleRNN(vocab_size = len(vocab)).to(device)

In [25]:
import torch.optim as optim
# Cross-entropy loss over the vocabulary logits (it takes raw logits, so the
# model applies no softmax); Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters() , lr = learning_rate)

In [26]:
vocab_size = len(vocab)
# Training loop: one optimizer step per (question, answer) pair, repeated for
# `epochs` passes over the dataloader.
for epoch in range(epochs): 
    total_epoch_loss = 0 
    for question , answer in dataloader: 
        question , answer = question.to(device) , answer.to(device)

        # Forward pass: logits over the vocabulary for this question.
        output = model(question)
        # answer[0] takes the first row of the batch as the 1-D class-index
        # target. NOTE: this assumes batch_size = 1 and single-token answers,
        # which is how the dataloader and dataset are set up in this notebook.
        loss = criterion(output , answer[0])
        total_epoch_loss += loss.item()
        # Backward pass: clear stale gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The reported value is the sum of per-sample losses for the epoch.
    # ".4f" prints four decimal places (the earlier "4f" only set a minimum
    # field width of 4 and left the precision at its default).
    print(f"Epoch: {epoch + 1} | Loss: {total_epoch_loss:.4f}")

Epoch: 1 | Loss: 525.496064
Epoch: 2 | Loss: 461.329195
Epoch: 3 | Loss: 386.883098
Epoch: 4 | Loss: 318.250582
Epoch: 5 | Loss: 264.126282
Epoch: 6 | Loss: 214.489299
Epoch: 7 | Loss: 170.066736
Epoch: 8 | Loss: 131.933222
Epoch: 9 | Loss: 100.253669
Epoch: 10 | Loss: 76.843899
Epoch: 11 | Loss: 58.936115
Epoch: 12 | Loss: 46.106703
Epoch: 13 | Loss: 36.478910
Epoch: 14 | Loss: 29.895913
Epoch: 15 | Loss: 24.701719
Epoch: 16 | Loss: 20.912375
Epoch: 17 | Loss: 17.778834
Epoch: 18 | Loss: 15.177159
Epoch: 19 | Loss: 13.108596
Epoch: 20 | Loss: 11.250150
Epoch: 21 | Loss: 9.850190
Epoch: 22 | Loss: 8.676468
Epoch: 23 | Loss: 7.684516
Epoch: 24 | Loss: 6.869779
Epoch: 25 | Loss: 6.161794
Epoch: 26 | Loss: 5.560571
Epoch: 27 | Loss: 5.001348
Epoch: 28 | Loss: 4.565667
Epoch: 29 | Loss: 4.153508
Epoch: 30 | Loss: 3.792535
Epoch: 31 | Loss: 3.483482
Epoch: 32 | Loss: 3.195590
Epoch: 33 | Loss: 2.948065
Epoch: 34 | Loss: 2.731674
Epoch: 35 | Loss: 2.532692
Epoch: 36 | Loss: 2.351718
Epoch: 3

In [30]:
def prediction(model , question , thresold = 0.5): 
    """Print the model's one-token answer to ``question``, or "I don't know".

    Uses the module-level ``vocab`` to encode the question and decode the
    answer, and the module-level ``device`` to place the input tensor.

    Args:
        model: Callable network returning logits of shape (1, vocab_size) for
            an index tensor of shape (1, seq_len).
        question: The question text.
        thresold: Minimum softmax probability required to print an answer
            (the parameter's spelling is kept so existing keyword calls work).

    Returns:
        None. The result is printed.
    """
    # Convert the question into vocabulary indices.
    numerical_question = text_to_indices(question , vocab)
    if not numerical_question:
        # No tokens (e.g. "" or "?"): torch.tensor([]) would be a float tensor
        # that the embedding layer cannot index, so answer directly instead.
        print("I don't know")
        return

    # Add a batch dimension and move the input to the model's device.
    question_tensor = torch.tensor(numerical_question).unsqueeze(0).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(question_tensor)
        # Convert logits to probabilities and pick the most likely entry.
        prob = torch.nn.functional.softmax(output , dim = 1)
        value , index = torch.max(prob , 1)

    if value.item() < thresold:
        print("I don't know")
    else:
        # vocab assigns indices in insertion order, so position i in the key
        # list is the token whose index is i.
        ans = list(vocab)[index.item()]
        print(ans)

In [36]:
question = "Who is the first person landed on the moon?"
prediction(model , question)

armstrong
