In [None]:
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned to UTF-8 so the decoded text does not depend on the
# platform's locale default (which is what open() uses when none is given).
with open('datasets/shakespeare.txt', encoding='utf-8') as file:
    text = file.read()

In [None]:
from tokenizers import bpe_tokenizer
from pipelines import text_to_tensor

# Keep only the first 10,000 characters of the corpus.
text = text[:10000]

# The first 9,000 characters are used for training, the remainder for testing.
train_data, test_data = text[:9000], text[9000:]

# Fit the tokenizer on the whole truncated text (training and test parts).
# NOTE(review): 250 is presumably the target vocabulary size — confirm against
# BytePairEncodingTokenizer's constructor.
tokenizer = bpe_tokenizer.BytePairEncodingTokenizer(250)
tokenizer.fit([text])

In [None]:
from language_model import generation

model = generation.LanguageModel(tokenizer, device)

# Report the size of model.encoder in millions of parameters.
print(
    sum(p.numel() for p in model.encoder.parameters()) / 1e6,
    'M parameters',
)

In [None]:
# Call predict with a single-space prompt and max_new_tokens=100 before the
# training cell has run (presumably a baseline sample from the untrained
# model — confirm against LanguageModel.predict).
model.predict(' ', max_new_tokens=100)

In [None]:
from language_model import train

# Build a trainer from the model and the train/test text splits.
trainer = train.ModelTrainer(model, train_data, test_data)

# Run ModelTrainer.train(); hyperparameters and logging are whatever the
# trainer defines (not visible in this notebook).
trainer.train()

In [None]:
# Same single-space prompt as the pre-training call, now after training.
# NOTE(review): 100 is passed positionally here but as max_new_tokens= in the
# earlier call; presumably the same parameter — confirm against
# LanguageModel.predict and prefer the keyword form for consistency.
model.predict(' ', 100)

In [None]:
# Save the model to 'model.pt' in the working directory via the model's own
# save method (the on-disk format is defined by LanguageModel.save, not shown here).
model.save('model.pt')

In [None]:
import random
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
import wikipedia
import pandas as pd

def get_wikipedia_data(link: str) -> tuple:
    """Fetch the title, summary and outgoing links of one Wikipedia page.

    Args:
        link: Title of the page to look up.

    Returns:
        A ``(data, page_links)`` pair. ``data`` is a dict with the keys
        ``'title'`` and ``'summary'``; ``page_links`` is the page's ``links``
        attribute. If the lookup or any attribute access raises, the error is
        printed and ``(None, [])`` is returned instead, so a single bad page
        does not abort the caller.
    """
    try:
        # Look up exactly the requested title. With the library default
        # (auto_suggest=True) the title is first replaced by a search
        # suggestion, which can silently return a different page.
        page = wikipedia.page(link, auto_suggest=False)

        page_links = page.links

        data = {
            'title': page.title,
            'summary': page.summary
        }

        return data, page_links
    except Exception as e:
        # Deliberately best-effort: report the failure and return an empty result.
        print(f"Error fetching data for {link}: {e}")
        return None, []

# Stop crawling once this many pages have been collected.
n_pages = 1000
# Upper bound on the links fetched per round; also used as the thread-pool size.
sample_size = 500
# Links still waiting to be fetched, seeded with the starting page.
all_links = {'Deep Learning'}
# Links that have already been sampled for fetching.
visited_links = set()
# One {'title', 'summary'} dict per successfully fetched page.
data = []

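# Open notes:
# - all_links empty case: the set of pending links can run out before n_pages pages are collected
# - duplicated keys: the collected data can contain the same title more than once
# - generation: maximum token length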

# Crawl outward from the seed link in rounds. Each round samples a batch of
# pending links, fetches them concurrently, and adds the links found on the
# fetched pages to the pending set. The loop ends when n_pages pages have been
# collected or no pending links remain.

# Titles already stored in `data`. Several links can resolve to the same page
# (wikipedia.page follows redirects by default), so without this check the
# same title would be collected more than once.
seen_titles = {page['title'] for page in data}

with ThreadPoolExecutor(max_workers=sample_size) as executor:
    while len(data) < n_pages and all_links:
        # Never fetch more than is still needed, nor more than is available.
        current_sample_size = min(sample_size, len(all_links), n_pages - len(data))

        link_sample = random.sample(list(all_links), current_sample_size)

        # Mark the batch as visited and take it out of the pending set
        # before fetching, so it is never sampled again.
        visited_links.update(link_sample)
        all_links.difference_update(link_sample)

        futures = [executor.submit(get_wikipedia_data, link) for link in link_sample]
        for future in futures:
            page_data, links = future.result()
            if not page_data:
                # Failed fetch: get_wikipedia_data has already reported the error.
                continue

            # Keep a single row per page title.
            if page_data['title'] not in seen_titles:
                seen_titles.add(page_data['title'])
                data.append(page_data)

            new_links = {link for link in links if link not in visited_links}

            all_links.update(new_links)

# Fix the columns so the frame has 'title' and 'summary' even when nothing
# was fetched; an empty list alone would produce a frame with no columns.
df = pd.DataFrame(data, columns=['title', 'summary'])
df

In [None]:
# Number of distinct titles; missing values count as a value, exactly as
# they do in len(Series.unique()).
df['title'].nunique(dropna=False)

In [None]:
# Rows per title.
s = df.groupby('title').size()

# Show only the titles that occur more than once.
s.loc[s.gt(1)]