<a href="https://colab.research.google.com/github/TheodoredaCunha/Homemade-GPT-Model/blob/main/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Loading and Cleaning the Dataset**

In [2]:
import string
import nltk.data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
#getting all the filenames that contain the data
TOTAL_CHAPTERS  = 3
dataset = []
for i in range(1, TOTAL_CHAPTERS + 1):
  dataset.append("chapter{}.txt".format(i))

print(dataset)

['chapter1.txt', 'chapter2.txt', 'chapter3.txt']


In [4]:
from os import setpgid

PATH = "drive/MyDrive/gpt/"
VOCAB_SET = set()
VOCAB_DICTIONARY = {}
ALL_CHAPTERS = []
def cleanData(filepath): #lowercase all characters and removing punctuations
  with open(filepath) as f:
    full_chapter = f.read().lower()
    cleaned_chapter = full_chapter.translate (str.maketrans ('', '', string.punctuation)) #remove punctuations from every sentence
    cleaned_split_chapter = cleaned_chapter.split() #turn every sentence into a list of words
    chapter_vocab = set(cleaned_split_chapter) #update the set of unique wordds in this sentence
    ALL_CHAPTERS.append(cleaned_split_chapter) #append the cleaned chapter text to a single list
    VOCAB_SET.update(chapter_vocab) #update the set of unique words for all chapters in the data
    f.close() #close the file
 
def createVocabDictionary(): #fill out vocab_dictionary based on the vocab set
  vocab_list = list(VOCAB_SET)
  for count, word in enumerate(vocab_list): #assign a number to every word in the vocab list
    VOCAB_DICTIONARY[word] = count

'''
IMPLEMENTING THE FUNCTIONS:
'''
#opening and cleaning each chapter
for chapter in dataset:
  print(chapter)
  cleanData(PATH + chapter)

#creating the vocabulary dictionary
createVocabDictionary()

#convert each letter in the chapters into their assigned numbers from the dictionary
for chapter in ALL_CHAPTERS:
  for i in range(len(chapter)):
    chapter[i] = VOCAB_DICTIONARY[chapter[i]]

print(ALL_CHAPTERS[0])

chapter1.txt
chapter2.txt
chapter3.txt
[659, 42, 560, 879, 123, 1164, 927, 1789, 671, 64, 1730, 255, 711, 1724, 761, 64, 333, 910, 906, 485, 524, 252, 761, 64, 1499, 1012, 548, 575, 2118, 255, 1791, 146, 2028, 1879, 1349, 1567, 2125, 233, 761, 1509, 1980, 466, 1562, 1847, 1756, 659, 879, 205, 1499, 160, 123, 1065, 1318, 197, 1875, 1529, 149, 431, 354, 205, 1065, 1190, 985, 471, 1562, 1262, 1705, 1542, 1283, 354, 163, 2040, 1065, 524, 311, 784, 560, 879, 205, 2043, 42, 1582, 42, 1546, 558, 1900, 1499, 537, 373, 123, 1542, 1529, 1489, 2028, 524, 911, 509, 1084, 695, 2060, 252, 123, 1658, 745, 1973, 1680, 1555, 1198, 1196, 188, 1499, 1722, 1499, 1773, 1546, 1065, 1647, 1384, 197, 11, 42, 2028, 445, 480, 1898, 205, 847, 1557, 1735, 116, 1499, 1773, 1546, 387, 761, 229, 587, 761, 583, 1546, 1065, 65, 42, 445, 421, 1619, 205, 1724, 1648, 1876, 1584, 269, 761, 1980, 1102, 761, 392, 1915, 269, 1633, 1008, 1620, 1531, 315, 1499, 225, 560, 1878, 205, 560, 1773, 338, 587, 761, 1386, 2014, 2131, 5

**Creating the GPT Model**

In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [6]:
training = []
testing = []
sampling_rate = 0.1 #10% of the sentences from each chapter will be used as test data
for chapter in ALL_CHAPTERS:
  test_data_num = int(len(chapter)*0.1)
  training.append(chapter[test_data_num * -1:])
  testing.append(chapter[:test_data_num])  

for i in range(TOTAL_CHAPTERS):  
  training[i] = torch.tensor(training[i], dtype=torch.long)
  testing[i] = torch.tensor(testing[i], dtype=torch.long)

In [10]:
#chunking the data
context_length = 20
batch_size = 10
torch.manual_seed(1337)

def get_batches(data):
  ix = torch.randint(len(data) - context_length, (batch_size,))
  x = torch.stack([data[i:i+context_length] for i in ix])
  y = torch.stack([data[i + 1:i+context_length + 1] for i in ix])
  return x, y

x_1, y_1 = get_batches(training[0])
print(x_1)
print(y_1)



tensor([[1499, 1298,  123,  603,  188, 1499,  804,  123, 1164,  927,   88,  418,
         2127,  354,  785,  354, 2067,  188, 1282,  872],
        [ 438,  587,  417, 1065,  557,  188,  261, 1236, 1567,   50, 1791, 1620,
          748,  978,  213, 1497, 1236,  142,  188, 1499],
        [1236, 1567,   50, 1791, 1620,  748,  978,  213, 1497, 1236,  142,  188,
         1499, 1476,  509,  162,  632, 1680, 1499,  412],
        [1041, 1718,  269,  687, 1794,  843, 1153, 1794,  288, 1255, 2127,  342,
         1074,   61, 1562, 1963,  239,  239,  365,  145],
        [ 557,  188,  261, 1236, 1567,   50, 1791, 1620,  748,  978,  213, 1497,
         1236,  142,  188, 1499, 1476,  509,  162,  632],
        [ 205,  947, 1763, 1256,  354, 1876, 1791, 1638, 2028, 1065, 2001,  299,
          745,  257,  560, 1773,  135,  509, 1084, 1589],
        [ 613, 1863,   42,    1,  728, 1499, 1443, 1387, 1499,  524, 1012, 2013,
          485, 1876, 2118,  868,  819,  255, 1199, 2127],
        [1789,  671, 1529, 

In [18]:
#Creating a Bigram Model
class BigramModel(nn.Module): #Bigram Model that is a child of PyTorch's Module Class
  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, vocab_size) #lookup table

  def forward(self, input, target):
      logits = self.embedding(input) #make predictions --> ouputs a (B, T, C) tensor

      #reshaping logits and target to match the required shape by pytorch
      B, T, C = logits.shape 
      logits = logits.view(B*T, C)
      target = target.view(B*T)

      loss = F.cross_entropy(logits, target) #calculate log-likelihood loss
      
      return logits, loss

vocab_size = len(VOCAB_SET)
print("vocab size: {}".format(vocab_size))
m = BigramModel(vocab_size)
logits, loss = m(x_1, y_1)
print(logits.shape)
print(loss)

'''
Expected loss: -ln(1/vocab_size) = -ln(1/2158) = 7.67693714582

But before training, the loss should obviously be higher
'''

vocab size: 2158
torch.Size([200, 2158])
tensor(8.2312, grad_fn=<NllLossBackward0>)


'\nExpected loss: -ln(1/vocab_size) = -ln(1/2158) = 7.67693714582\n\nBut before training, the loss should obviously be higher\n'