In [168]:
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


### Encoder-Decoder dataset

### Poetry dataset

In [3]:
df = pd.read_csv('/home/matta/repos/autumn_school/data/russianPoetryWithTheme.csv')

In [4]:
df = df[['author', 'text']]

In [5]:
df

Unnamed: 0,author,text
0,Михаил Лермонтов,"Забывши волнения жизни мятежной,\r\nОдин жил в..."
1,Сергей Есенин,"Нивы сжаты, рощи голы,\r\nОт воды туман и сыро..."
2,Игорь Северянин,Лючинь печальная читала вечером ручьисто-вкрад...
3,Анатолий Жигулин,"Глыбу кварца разбили молотом,\r\nИ, веселым ог..."
4,Николай Тихонов,"Хлынул дождь, когда девушки, встав в хоровод,\..."
...,...,...
16689,Леонид Мартынов,Седо\r\nКурчавятся облака\r\nНад чернотою поле...
16690,Гаврила Державин,"Белокурая Параша,\r\nСребророзова лицом,\r\nКо..."
16691,Федор Сологуб,"Сладкозвучная богиня,\r\nРифма золотая,\r\nСлу..."
16692,Илья Эренбург,Я так любил тебя — до грубых шуток\r\nИ до так...


In [6]:
alphabet = ''.join(chr(i) for i in range(ord('а'), ord('я') + 1))
alphabet += ' \n.,!?-«»()—…:;\"\''

In [7]:
df['text'] = df['text'].str.lower().apply(lambda x: '<start> ' + ''.join([ch for ch in x if ch in alphabet]) + ' <end>')

In [8]:
df[0]

KeyError: 0

In [None]:
chars = sorted(set(list(''.join(df['text'].to_list()))))
vocab_size = len(chars)
stoi = {ch: i for i,ch in enumerate(chars)} # string to index
itos = {i: ch for i,ch in enumerate(chars)} # index to string
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [None]:
class PoetryDataset(Dataset):
	 

### Transformer

In [85]:
batch_ex = torch.randn(batch_size, seq_len, emb_size)

In [None]:
class Head(nn.Module):
	"""One head of self-attention"""
	def __init__(self, emb_size, head_size, dropout=0.0):
		super().__init__()
		self.head_size = head_size
		self.query = nn.Linear(emb_size, head_size, bias=False)
		self.key = nn.Linear(emb_size, head_size, bias=False)
		self.value = nn.Linear(emb_size, head_size, bias=False)
		self.dropout = nn.Dropout(dropout)
		self.softmax = nn.Softmax(dim=-1)

	def forward(self, x):
		Q = self.query(x)
		K = self.key(x)
		V = self.value(x)
		attention_score = Q @ K.transpose(-2, -1) / (self.head_size ** 0.5)

		# if self.training:
		# 	mask = torch.tril(torch.ones(seq_len, seq_len)).to(device)
		# 	attention_score = attention_score.masked_fill(mask == 0, float('-inf'))

		out = self.softmax(attention_score) @ V
		out = self.dropout(out)
		return out

In [None]:
class MultiHeadAttention(nn.Module):
	"""
	Multiple heads of self-attention in parallel
	"""
	def __init__(self, emb_size, num_heads, head_size, dropout=0.0):
		super().__init__()
		self.heads = nn.ModuleList(
			[
				Head(emb_size=emb_size, head_size=head_size, dropout=dropout)
				for _ in range(num_heads)
			]
		)
		self.linear = nn.Linear(num_heads * head_size, emb_size)
		self.dropout = nn.Dropout(dropout)
		
	def forward(self, x):
		out = torch.cat([head(x) for head in self.heads], dim=-1)
		out = self.linear(out)
		out = self.dropout(out)
		return out

In [None]:
class FeedForwardNetwork(nn.Module):
	"""
	A simple linear layer followed by a non-linearity
	"""
	def __init__(self, emb_size, hidden_size, dropout=0.0):	
		super().__init__()
		self.linear1 = nn.Linear(emb_size, hidden_size)
		self.linear2 = nn.Linear(hidden_size, emb_size)
		self.relu = nn.ReLU()
		self.dropout = nn.Dropout(dropout)

	def forward(self, x):
		x = self.linear1(x)
		x = self.relu(x)
		x = self.dropout(x)
		x = self.linear2(x)
		return x

In [None]:
class TransformerBlock(nn.Module):
	"""
	Transformer block: communication followed by computation
	"""
	def __init__(self, emb_size, num_heads, head_size, hidden_size, dropout=0.0):
		super().__init__()
		self.attention = MultiHeadAttention(emb_size, num_heads, head_size, dropout)
		self.ffn = FeedForwardNetwork(emb_size, hidden_size, dropout)
		self.norm1 = nn.LayerNorm(emb_size)
		self.norm2 = nn.LayerNorm(emb_size)
		
	def forward(self, x):
		x = self.norm1(x + self.attention(x))
		x = self.norm2(x + self.ffn(x))
		return x

In [None]:
class GPT(nn.Module):
	def __init__(self, dict_size, emb_size, seq_len, num_heads=8, head_size=64, hidden_size=2048, dropout=0.3):
		super().__init__()
		self.token_embedding_table = nn.Embedding(num_embeddings=dict_size, embedding_dim=emb_size)
		self.blocks = nn.Sequential(
			*[
				TransformerBlock(
				emb_size=emb_size,
		   		num_heads=num_heads,
				head_size=head_size,
				hidden_size=hidden_size,
				dropout=dropout
				) for _ in range(6)
			]
		)
		self.flatten = nn.Flatten()
		self.linear = nn.Linear(emb_size * seq_len, dict_size)
		self.apply(self._init_weights)

	def _init_weights(self, module):
		if isinstance(module, nn.Linear):
			torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
			if module.bias is not None:
				torch.nn.init.zeros_(module.bias)
		elif isinstance(module, nn.Embedding):
			torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

	def forward(self, idx):
		# idx = torch.argmax(x, dim=-1)
		emb = self.token_embedding_table(idx)
		out = self.blocks(emb)
		out = self.flatten(out)
		logits = self.linear(out)
		return logits

	def generate_next_token(self, idx):
		logits = self.forward(idx)
		probs = F.softmax(logits, dim=-1)
		next_token = torch.multinomial(probs, num_samples=1)
		return next_token
		

### Names dataset

In [141]:
f = open('/home/matta/repos/autumn_school/data/names.txt', 'r')
names = f.read().splitlines()

In [174]:
from torch import Tensor

In [260]:
class NamesDataset(Dataset):
	def __init__(self, names: list[str], seq_len: int=3):
		self._seq_len = seq_len
		self.__stoi = {
			"<start>": 0,
			"<end>": 1,
		}
		for i, ch in enumerate(sorted(set(''.join(names)))):
			self.__stoi[ch] = i + 2
		self.__itos = {i: ch for ch, i in self.__stoi.items()}
		self.names: list[tuple[str, str]] = []
		self.names_encoded: list[tuple[Tensor, Tensor]] = []
		for name in names:
			for i in range(len(name)):
				target: str = name[i]
				target_encoded = torch.tensor([self.__stoi[target]])
				part_name = name[max(0, i - self.seq_len):i]
				entry = "<start>" * (self.seq_len - len(part_name)) + part_name
				entry_encoded = torch.tensor([self.__stoi["<start>"]] * (self.seq_len - len(part_name)) + [self.__stoi[ch] for ch in part_name])
				self.names.append((entry, target))
				self.names_encoded.append((entry_encoded, target_encoded))

	def __len__(self):
		return len(self.names)
	
	@property
	def seq_len(self):
		return self._seq_len
	
	@property
	def stoi(self):
		return self.__stoi
	
	@property	
	def itos(self):
		return self.__itos
	
	def get_sample(self, idx):
		return self.names[idx]
	
	def __getitem__(self, idx):
		return self.names_encoded[idx]

In [294]:
names_dataset = NamesDataset(names)
names_loader = DataLoader(names_dataset, batch_size=1, shuffle=True)

In [None]:
model = GPT(
	dict_size=len(names_dataset.stoi),
	emb_size=8, seq_len=names_dataset.seq_len,
	hidden_size=64, num_heads=8,
	head_size=64,
	dropout=0.1)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss()

In [296]:
epochs = 10

In [None]:
for epoch in range(epochs):
	model.train()
	total_loss = 0
	for seq, target in tqdm(names_loader):
		seq, target = seq.to(device), target.to(device)
		target = F.one_hot(target, num_classes=len(names_dataset.stoi)).squeeze(1).float()
		optim.zero_grad()
		logits = model(seq)
		loss = criterion(logits, target)
		loss.backward()
		optim.step()
		total_loss += loss.item()
	avg_loss = total_loss / len(names_loader)
	print(f"Epoch {epoch + 1} / {epochs}, Loss: {avg_loss:.4f}")

  0%|          | 792/196113 [01:00<4:09:40, 13.04it/s]


KeyboardInterrupt: 

In [301]:
print(model)

GPTLanguageModel(
  (token_embedding_table): Embedding(28, 8)
  (blocks): Sequential(
    (0): Block(
      (attention): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (query): Linear(in_features=8, out_features=64, bias=False)
            (key): Linear(in_features=8, out_features=64, bias=False)
            (value): Linear(in_features=8, out_features=64, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
            (softmax): Softmax(dim=-1)
          )
        )
        (linear): Linear(in_features=512, out_features=8, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedFoward(
        (linear1): Linear(in_features=8, out_features=64, bias=True)
        (linear2): Linear(in_features=64, out_features=8, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((8,), eps=1e-0

In [302]:
def count_parameters(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return trainable

In [303]:
count_parameters(model)

106044