In [1]:
%pip install torch
%pip install beautifulsoup4 lxml
%pip install requests

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


## Importing dependencies

In [None]:
# for scraping
from bs4 import BeautifulSoup as bs4
import requests
import json

# for model
import torch
import torch.nn as nn
from torch.nn import functional as f
import re

# data steps:
import os
from torch.utils.data import Dataset, DataLoader
from collections import Counter


## Gathering Data

In [3]:
def _paragraphs_to_text(paragraphs) -> str:
    """Joins the non-empty text of each paragraph tag, one paragraph per line."""
    # A " " separator keeps words apart when a paragraph contains inline tags
    # (links, bold, ...); get_text(strip=True) alone would glue them together.
    texts = (para.get_text(" ", strip=True) for para in paragraphs)
    return "\n".join(text for text in texts if text)


def _article_body_from_json_ld(soup) -> str | None:
    """Returns the stripped "articleBody" of the first NewsArticle JSON-LD entry, if any."""
    # find_all (not find): a page may carry several JSON-LD <script> tags and
    # the article is not necessarily in the first one.
    for script in soup.find_all("script", type="application/ld+json"):
        if not script.string:
            continue
        try:
            data = json.loads(script.string)
        except json.JSONDecodeError as e:
            # One malformed block must not stop us from trying the others.
            print("JSON parse error:", e)
            continue

        items = data if isinstance(data, list) else [data]
        for item in items:
            if not isinstance(item, dict):
                continue
            if item.get("@type") not in ["NewsArticle", ["NewsArticle"]]:
                continue
            body = item.get("articleBody")
            if isinstance(body, str) and body.strip():
                return body.strip()
    return None


def _save_article(filename: str, text: str, directory: str = "../articles") -> None:
    """Writes text to <directory>/<filename>.txt as UTF-8, creating the directory if needed."""
    os.makedirs(directory, exist_ok=True)
    # Explicit UTF-8 so the file can be read back with encoding="utf-8"
    # regardless of the platform's default encoding.
    with open(os.path.join(directory, filename + ".txt"), "w", encoding="utf-8") as file:
        file.write(text)


def scrape_site_text(url: str, site: str, filename: str, timeout: float = 10.0) -> str | None:
    """
    Scrapes the main article text from a news page and saves it as ../articles/<filename>.txt.

    Args:
        url: Address of the article page.
        site: Extraction strategy to use, one of "marca", "as" or "mundo".
        filename: Output file name, without the ".txt" extension.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The extracted article text, or None when the page could not be
        fetched, the site is not recognised, or no article text was found.
        Nothing is written to disk when None is returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/140.0.0.0 Safari/537.36"
    }

    try:
        # Without a timeout a stalled server would hang the notebook forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print('Error: Could not fetch page', e)
        return None

    if response.status_code != 200:
        print('Error: Could not fetch page', response.status_code)
        return None

    soup = bs4(response.text, 'html.parser')

    match site:
        case "as":
            # as.com embeds the full article in its JSON-LD metadata.
            article_text = _article_body_from_json_ld(soup)

        case "marca":
            article_body = soup.find("div", class_="ue-c-article__body")
            paragraphs = (
                article_body.find_all("p", class_="ue-c-article__paragraph")
                if article_body else []
            )
            article_text = _paragraphs_to_text(paragraphs)

        case "mundo":
            article_text = _paragraphs_to_text(soup.find_all("p", class_="paragraph"))

        case _:
            print('Error: Unknown site', site)
            return None

    # Never write an empty/None article: file.write(None) would raise.
    if not article_text:
        return None

    _save_article(filename, article_text)
    return article_text

In [4]:
## Load and combine all article files:
def combine_articles(combined_text="", articles_dir="../articles") -> str:
    """
    Concatenates every ".txt" file in a directory into one string.

    Args:
        combined_text: Text to prepend to the result.
        articles_dir: Directory that holds the article files.

    Returns:
        combined_text followed by the content of each ".txt" file, each one
        terminated by a blank line ("\\n\\n").

    Raises:
        FileNotFoundError: If articles_dir does not exist.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the corpus
    # (and therefore the train/validation split built from it) reproducible.
    article_names = sorted(
        name for name in os.listdir(articles_dir) if name.endswith(".txt")
    )

    pieces = [combined_text]
    for name in article_names:
        with open(os.path.join(articles_dir, name), 'r', encoding="utf-8") as article_file:
            pieces.append(article_file.read() + "\n\n")
    return "".join(pieces)

## Dataset and Data loader

In [5]:
class BigramDataset(Dataset):
    """
    Sliding-window dataset over a 1-D sequence of token ids.

    Item i is the pair (x, y) where x is the window data[i : i + block_size]
    and y is the same window shifted one position to the right, i.e. the
    next-token targets for x.
    """

    # need to override __init__, __len__, and __getitem__ for DataLoader

    def __init__(self, data, block_size = 128):
        """
        Args:
            data: Indexable 1-D sequence of token ids (sliced with data[a:b]).
            block_size: Number of tokens in each window; must be positive.

        Raises:
            ValueError: If block_size is not positive.
        """
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        self.data = data
        self.block_size = block_size

    def __len__(self):
        """Number of complete (x, y) windows; 0 when the data is too short."""
        # Clamp at 0: a negative length makes len() and DataLoader raise.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """
        Returns the (input, target) window that starts at position idx.

        Raises:
            IndexError: If idx is outside [-len(self), len(self)).
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Slices never fail on their own, so without this check an
        # out-of-range idx would silently yield truncated windows and plain
        # iteration over the dataset would never terminate.
        if not 0 <= idx < size:
            raise IndexError("BigramDataset index out of range")

        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

## Model Implementation

In [6]:
class BigramModel(nn.Module):
    """
    Bigram language model: the logits for the next token are looked up
    directly from a (vocab_size x vocab_size) embedding table, so each
    prediction depends only on the current token.
    """

    def __init__(self, vocab_size):
        """
        Args:
            vocab_size: Number of distinct tokens; also the size of each logit vector.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx):
        """
        Args:
            idx: Long tensor of token ids, shape (B, T).

        Returns:
            Logits of shape (B, T, vocab_size).
        """
        return self.token_embedding_table(idx)

    def configure_optimizer(self):
        """
        Using AdamW optimizer -> check optim_notes.txt
        """
        return torch.optim.AdamW(self.parameters(), lr = 0.001)

    def training_step(self, input, target):
        """
        1. Embed tokens -> get predictions for each token in the sequence
        2. Flatten sequences into single batch -> required format for loss function
        3. Compute cross-entropy loss
        4. Return loss -> used to update weights with the optimizer

        Args:
            input: Long tensor of token ids, shape (B, T).
            target: Long tensor of next-token ids, shape (B, T).

        Returns:
            Scalar cross-entropy loss tensor.
        """
        logits = self(input)
        B, T, C = logits.shape
        # reshape (not view) so non-contiguous batches are accepted too.
        return f.cross_entropy(logits.reshape(B * T, C), target.reshape(B * T))

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens, temperature=1.0):
        """
        1. Start with an initial token sequence idx
        2. For each new token to generate:
            1. Predict logits for all tokens in the vocab at the last position
            2. Convert logits to probabilities using softmax
            3. Sample a token from the multinomial distribution
            4. Append the sampled token to the sequence
        3. Repeat until max_new_tokens
        4. Return the complete generated sequence

        Args:
            idx: Long tensor of token ids, shape (B, T) with T >= 1.
            max_new_tokens: Number of tokens to append to each sequence.
            temperature: Positive scaling factor for the logits; values below
                1 sharpen the distribution, values above 1 flatten it.

        Returns:
            Long tensor of shape (B, T + max_new_tokens).

        Raises:
            ValueError: If temperature is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")

        for _ in range(max_new_tokens):
            # The prediction depends only on the last token, so only that
            # token is embedded instead of the whole growing sequence.
            logits = self(idx[:, -1:])[:, -1, :]

            # adding temperature
            logits = logits / temperature

            probabilities = f.softmax(logits, dim = -1)
            idx_next = torch.multinomial(probabilities, num_samples=1) # randomly picks an index according to probabilities
            idx = torch.cat((idx, idx_next), dim = 1)

        return idx

## Model initialization and main()

In [None]:
# End-to-end pipeline: scrape articles, build the vocabulary, train the model, and generate text

def main():
    """
    Runs the whole pipeline: scrape the articles, build a word-level
    vocabulary, train the bigram model and print a generated sample.

    Raises:
        ValueError: If no article text is available or the corpus is too
            small to form a single training window.
    """
    # (url, site key, output file name) -- the file name describes the article.
    articles = [
        ("https://www.marca.com/baloncesto/nba/2025/09/22/plan-lakers-nuevo-doncic-asaltar-nba-miman-luka-adelgazar-14-kilos.html",
         "marca", "lakers-and-luka"),
        ("https://as.com/baloncesto/nba/la-retirada-esta-cerca-pero-aun-no-ha-llegado-n/",
         "as", "lebron-retires"),
        ("https://www.mundodeportivo.com/baloncesto/nba/20250926/1002539251/nueva-lesion-vuelve-golpear-sixers.html",
         "mundo", "sixers-new-injury"),
    ]

    # The scraper writes into this directory and combine_articles reads from it.
    os.makedirs("../articles", exist_ok=True)

    for url, site, filename in articles:
        if scrape_site_text(url, site, filename) is None:
            # Keep going: articles saved by earlier runs may still be on disk.
            print(f"Warning: no text scraped from {url}")

    combined_text = combine_articles().lower()

    # Word-level tokens: runs of word characters, or single punctuation marks.
    words = re.findall(r'\w+|\S', combined_text)
    if not words:
        raise ValueError("No article text available in ../articles; nothing to train on.")

    # Words seen only once are mapped to a shared <unk> token.
    counts = Counter(words)
    words = [w if counts[w]>1 else "<unk>" for w in words]

    vocab = sorted(set(words))
    vocab_size = len(vocab)

    # mappings between text and numbers
    stoi = {word: i for i, word in enumerate(vocab)}
    itos = {i: word for i, word in enumerate(vocab)}

    # Encode the whole text
    data = torch.tensor([stoi[w] for w in words], dtype=torch.long)

    # Train/validation split
    n = int(0.9 * len(data))
    train_data = data[:n]           # first 90%
    val_data = data[n:]             # last 10%

    block_size = 8
    if len(train_data) <= block_size:
        raise ValueError(
            f"Corpus too small: need more than {block_size} training tokens, got {len(train_data)}."
        )

    train_loader = DataLoader(BigramDataset(train_data, block_size), batch_size=32, shuffle =True)

    # A validation split no longer than block_size yields no windows; skip
    # validation instead of dividing by zero batches below.
    val_loader = None
    if len(val_data) > block_size:
        val_loader = DataLoader(BigramDataset(val_data, block_size), batch_size=32)
    else:
        print("Warning: validation split too small, skipping validation.")

    # pre-training things:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BigramModel(vocab_size).to(device)
    optimizer = model.configure_optimizer()
    epochs = 30

    # Training loop:
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = model.training_step(x, y)
            loss.backward()
            optimizer.step()
            total_loss = total_loss + loss.item()
        print(f"Epoch: {epoch+1}, Train Loss: {total_loss/len(train_loader):.4f}")

        if val_loader is None:
            continue

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                # training_step only computes the loss; no weights are updated here.
                val_loss = val_loss + model.training_step(x, y).item()
        print(f"Epoch: {epoch+1}, Val Loss: {val_loss/len(val_loader):.4f}")

    # Generate Text:
    model.eval()
    start_word = "el" if "el" in stoi else vocab[0]
    start = torch.tensor([[stoi[start_word]]], dtype=torch.long).to(device)

    generated = model.generate(start, max_new_tokens=50, temperature=0.8)

    # decode
    text_generated = ' '.join([itos[int(i)] for i in generated[0]])
    print("Generated Text: \n")
    print(text_generated)



    

In [10]:
# Entry point: runs the full scrape -> train -> generate pipeline when this
# cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

Epoch: 1, Train Loss: 5.5320
Epoch: 1, Val Loss: 5.5004
Epoch: 2, Train Loss: 5.4736
Epoch: 2, Val Loss: 5.4593
Epoch: 3, Train Loss: 5.4208
Epoch: 3, Val Loss: 5.4189
Epoch: 4, Train Loss: 5.3655
Epoch: 4, Val Loss: 5.3789
Epoch: 5, Train Loss: 5.3090
Epoch: 5, Val Loss: 5.3391
Epoch: 6, Train Loss: 5.2563
Epoch: 6, Val Loss: 5.2997
Epoch: 7, Train Loss: 5.2048
Epoch: 7, Val Loss: 5.2610
Epoch: 8, Train Loss: 5.1527
Epoch: 8, Val Loss: 5.2228
Epoch: 9, Train Loss: 5.0980
Epoch: 9, Val Loss: 5.1851
Epoch: 10, Train Loss: 5.0472
Epoch: 10, Val Loss: 5.1478
Epoch: 11, Train Loss: 4.9943
Epoch: 11, Val Loss: 5.1113
Epoch: 12, Train Loss: 4.9457
Epoch: 12, Val Loss: 5.0749
Epoch: 13, Train Loss: 4.8953
Epoch: 13, Val Loss: 5.0397
Epoch: 14, Train Loss: 4.8486
Epoch: 14, Val Loss: 5.0043
Epoch: 15, Train Loss: 4.7979
Epoch: 15, Val Loss: 4.9697
Epoch: 16, Train Loss: 4.7483
Epoch: 16, Val Loss: 4.9358
Epoch: 17, Train Loss: 4.7009
Epoch: 17, Val Loss: 4.9021
Epoch: 18, Train Loss: 4.6534
Ep