<a href="https://colab.research.google.com/github/PhongCT1105/AI_Tutor_Transformer_Model/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

# AI Tutor for a Deep Learning Class, Built from Scratch

# I. Building the Model and Training

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader



# 1. Import the dataset used for training

In [None]:
# Load the "allenai/sciq" dataset through the Hugging Face `datasets` library.
from datasets import load_dataset
original_dataset = load_dataset("allenai/sciq")

# Print a summary of the loaded dataset object.
print(original_dataset)

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Feature engineering: merge the correct answer and its supporting text
# into a single target string.
def combined_features(dataset):
  """Build the question/answer pair for one record.

  Args:
    dataset: a mapping with string fields "question", "correct_answer" and
      "support" (a single record, despite the parameter name).

  Returns:
    A dict with the unchanged "question" and an "answer" of the form
    "The answer is <correct_answer>. Explanation: <support>".
  """
  question = dataset["question"]
  verdict = "The answer is " + dataset["correct_answer"]
  explanation = ". Explanation: " + dataset["support"]
  return {"question": question, "answer": verdict + explanation}

# Add the combined "answer" column, then drop the distractor columns and the
# two source columns that were merged into it.
dataset = original_dataset.map(combined_features).remove_columns(
    ["distractor3", "distractor1", "distractor2", "correct_answer", "support"]
)

# Show the resulting dataset summary and the first training example.
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1000
    })
})
{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?', 'answer': 'The answer is mesophilic organisms. Explanation: Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.'}


# 2. Tokenizer


### 2.1 Vocabulary creation

In [None]:
from collections import Counter

def vocab_mapping(dataset, freq=1, part = "train"):
  """Build a word -> integer id vocabulary from one split of ``dataset``.

  Words come from whitespace-splitting the "question" and "answer" fields of
  every sample in ``dataset[part]``.

  Args:
    dataset: mapping from split name to an iterable of samples.
    freq: minimum number of occurrences a word needs to receive an id.
    part: name of the split to read.

  Returns:
    A dict whose first four ids (0-3) are the special tokens "<PAD>",
    "<UNK>", "<BOS>" and "<EOS>", followed by the qualifying words in
    first-seen order.
  """
  # Tally occurrences across both text fields of each sample.
  word_counts = Counter()
  for example in dataset[part]:
    word_counts.update(example["question"].split())
    word_counts.update(example["answer"].split())

  # Special tokens: padding, unknown word, beginning and end of sequence.
  special_tokens = ("<PAD>", "<UNK>", "<BOS>", "<EOS>")
  vocab = {token: index for index, token in enumerate(special_tokens)}

  # The next free id is always the current vocabulary size.
  for token, occurrences in word_counts.items():
    if occurrences < freq or token in vocab:
      continue
    vocab[token] = len(vocab)

  return vocab

# Build the vocabulary from the training split, keeping every word that
# occurs at least once.
vocab_map = vocab_mapping(dataset, freq=1, part="train")
print(vocab_map)



### 2.2 Tokenize based on the vocabulary

In [None]:
def tokenize_map(row, vocab=None):
  """Encode a row's question and answer as lists of vocabulary ids.

  Both texts are split on whitespace; a word that is missing from the
  vocabulary is encoded with the id of "<UNK>".

  Args:
    row: mapping with string fields "question" and "answer".
    vocab: word -> id mapping that contains "<UNK>". When omitted, the
      notebook-level ``vocab_map`` is used, so existing calls such as
      ``dataset.map(tokenize_map)`` keep working unchanged.

  Returns:
    The same ``row`` object with two added keys: "input" (ids of the
    question words) and "output" (ids of the answer words).
  """
  if vocab is None:
    vocab = vocab_map
  unk_id = vocab["<UNK>"]

  def encode(text):
    # One id per whitespace-separated word, falling back to <UNK>.
    return [vocab.get(word, unk_id) for word in text.split()]

  row["input"] = encode(row["question"])
  row["output"] = encode(row["answer"])

  return row


# Tokenize the training split only.
data_train = dataset["train"].map(tokenize_map)

# Slice the rows first so only five examples are read for the preview,
# rather than fetching the whole column and then slicing it.
print(data_train[:5]["input"])
print(data_train[:5]["output"])

[[4, 5, 6, 7, 8, 9, 10, 11, 12, 6, 13, 14, 15, 16, 17, 18], [4, 70, 71, 72, 73, 74, 75, 76, 77, 39, 41, 78, 11, 41, 79, 80, 17, 81, 76, 82, 39, 41, 78, 11, 41, 83, 84], [99, 100, 101, 102, 103, 76, 101, 104, 103, 105, 15, 101, 106, 76, 101, 107, 35, 108, 109], [4, 8, 41, 208, 209, 210, 211], [218, 11, 219, 8, 41, 220, 221, 222, 223, 224, 225, 223, 226, 227, 228, 229, 230, 17, 231, 232, 233, 234]]
[[19, 20, 8, 21, 22, 23, 24, 25, 26, 11, 27, 28, 29, 30, 31, 17, 32, 33, 17, 34, 24, 35, 36, 37, 38, 11, 39, 40, 41, 42, 6, 43, 39, 44, 45, 19, 46, 47, 48, 6, 49, 50, 51, 8, 52, 53, 41, 54, 55, 56, 57, 58, 59, 60, 61, 62, 11, 63, 64, 65, 66, 67, 68, 17, 69], [19, 20, 8, 85, 86, 23, 87, 88, 89, 41, 72, 73, 90, 74, 91, 76, 92, 39, 92, 76, 93, 94, 88, 71, 95, 74, 75, 76, 77, 39, 41, 78, 11, 41, 96, 97, 19, 73, 74, 81, 76, 82, 39, 41, 78, 11, 41, 83, 98], [19, 20, 8, 110, 23, 111, 99, 6, 103, 35, 112, 6, 113, 114, 39, 113, 115, 116, 113, 117, 35, 118, 119, 117, 11, 41, 120, 6, 101, 121, 99, 100, 1

### 2.3 Adding special tokens and padding

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

[[4, 5, 6, 7, 8, 9, 10, 11, 12, 6, 13, 14, 15, 16, 17, 18], [4, 70, 71, 72, 73, 74, 75, 76, 77, 39, 41, 78, 11, 41, 79, 80, 17, 81, 76, 82, 39, 41, 78, 11, 41, 83, 84], [99, 100, 101, 102, 103, 76, 101, 104, 103, 105, 15, 101, 106, 76, 101, 107, 35, 108, 109], [4, 8, 41, 208, 209, 210, 211], [218, 11, 219, 8, 41, 220, 221, 222, 223, 224, 225, 223, 226, 227, 228, 229, 230, 17, 231, 232, 233, 234]]
[[19, 20, 8, 21, 22, 23, 24, 25, 26, 11, 27, 28, 29, 30, 31, 17, 32, 33, 17, 34, 24, 35, 36, 37, 38, 11, 39, 40, 41, 42, 6, 43, 39, 44, 45, 19, 46, 47, 48, 6, 49, 50, 51, 8, 52, 53, 41, 54, 55, 56, 57, 58, 59, 60, 61, 62, 11, 63, 64, 65, 66, 67, 68, 17, 69], [19, 20, 8, 85, 86, 23, 87, 88, 89, 41, 72, 73, 90, 74, 91, 76, 92, 39, 92, 76, 93, 94, 88, 71, 95, 74, 75, 76, 77, 39, 41, 78, 11, 41, 96, 97, 19, 73, 74, 81, 76, 82, 39, 41, 78, 11, 41, 83, 98], [19, 20, 8, 110, 23, 111, 99, 6, 103, 35, 112, 6, 113, 114, 39, 113, 115, 116, 113, 117, 35, 118, 119, 117, 11, 41, 120, 6, 101, 121, 99, 100, 1