## iter 2.2

Sandbox to play around with learning rates, depth, etc.

CURRENT ITERATION DOES NOT PERFORM FAVORABLY

In [8]:
### INIT & TOKENIZE

import pandas as pd
import re

# load data
file_path = 'data/legit_urls_trimmed.csv'
data = pd.read_csv(file_path)

# Assuming the column containing URLs is named 'URL'
if 'URL' not in data.columns:
    raise ValueError("Expected a column named 'URL' in the CSV file.")

urls = data['URL'].dropna().tolist()

# Ensure <eos> token is added to the vocabulary and to each tokenized URL
EOS_TOKEN = '<eos>'

# Tokenize URLs
def tokenize_url(url):
    """
    Tokenizes a URL by splitting at special characters and preserving essential elements.

    Args:
        url (str): The URL to tokenize.

    Returns:
        list: A list of tokens from the URL.
    """
    # Regex to split on special characters like ., /, :, and ? but retain them as tokens
    tokens = re.split(r'([.:/?=&+-_])', url)
    # Remove empty tokens and strip whitespace
    return [token for token in tokens if token.strip()] + [EOS_TOKEN]

# Apply tokenization to the list of URLs
tokenized_urls = [tokenize_url(url) for url in urls]

# Preview the tokenized output
for i, tokens in enumerate(tokenized_urls[:5]):
    print(f"URL {i+1}: {tokens}")

# Save the tokenized output to a new CSV file for verification
tokenized_df = pd.DataFrame({'url': urls, 'tokens': tokenized_urls})
tokenized_df.to_csv('tokenized_urls.csv', index=False)

URL 1: ['http', ':', '/', '/', 'aasanetidende', '.', 'no', '<eos>']
URL 2: ['http', ':', '/', '/', 'abc', '.', 'go', '.', 'com', '/', 'schedule', '<eos>']
URL 3: ['http', ':', '/', '/', 'abouthappybirthday', '.', 'blogspot', '.', 'com', '<eos>']
URL 4: ['http', ':', '/', '/', 'academlib', '.', 'com', '/', '2', '6', '7', '1', '5', '/', 'computer', '_', 'science', '/', 'firewall', '<eos>']
URL 5: ['http', ':', '/', '/', 'acqalma', '.', 'az', '<eos>']


In [None]:
### BUILD TRANSFORMER NET

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

def decode_tokens(tokens):
    return ''.join([token for token in tokens if token != EOS_TOKEN])

# Vocabulary and encoding
unique_tokens = list(set(token for tokens in tokenized_urls for token in tokens))

if EOS_TOKEN not in unique_tokens:
    unique_tokens.append(EOS_TOKEN)

token_to_idx = {token: idx for idx, token in enumerate(unique_tokens)}
idx_to_token = {idx: token for token, idx in token_to_idx.items()}

# Encode tokenized URLs
encoded_urls = [[token_to_idx[token] for token in tokens] for tokens in tokenized_urls]

class URLDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.long)

# Dataset and DataLoader
dataset = URLDataset(encoded_urls)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0))

# Transformer Model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, embed_size))  # Max length 512 assumed
        encoder_layer = nn.TransformerEncoderLayer(embed_size, num_heads, hidden_dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def generate_attention_mask(self, seq_len):
        """
        Generate an attention mask to ensure no cross-URL attention.

        Args:
            seq_len (int): Length of the sequences.

        Returns:
            torch.Tensor: Attention mask of shape [seq_len, seq_len].
        """
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
        return mask.to(dtype=torch.bool)

    def forward(self, x):
        batch_size, seq_len = x.size()
        if seq_len > self.positional_encoding.size(1):
            raise ValueError(f"Sequence length {seq_len} exceeds positional encoding size {self.positional_encoding.size(1)}")
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        attention_mask = self.generate_attention_mask(seq_len).to(x.device)
        x = self.transformer(x.transpose(0, 1), mask=attention_mask).transpose(0, 1)
        x = self.fc(x)
        return x



# Model hyperparameters
vocab_size = len(token_to_idx)
embed_size = 64
num_heads = 4
num_layers = 4
hidden_dim = 256
num_epochs = 100

repetition_penalty = 0.001
length_penalty = 0.000001
gamma = 0.5
lr = 0.005

model = URLTransformer(vocab_size, embed_size, num_heads, num_layers, hidden_dim)

# Implement label smoothing in loss function
class LabelSmoothingLoss(nn.Module):
    def __init__(self, vocab_size, smoothing=0.1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.vocab_size = vocab_size

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=-1)
        true_dist = torch.zeros_like(pred)
        true_dist.fill_(self.smoothing / (self.vocab_size - 1))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=-1))

# Add penalty for repetition and excessive length to the loss function
class AugmentedLoss(nn.Module):
    def __init__(self, vocab_size, smoothing=0.1, repetition_penalty=0.1, length_penalty=0.01):
        super(AugmentedLoss, self).__init__()
        self.label_smoothing = LabelSmoothingLoss(vocab_size, smoothing)
        self.repetition_penalty = repetition_penalty
        self.length_penalty = length_penalty

    def forward(self, pred, target, input_seq):
        # Standard label smoothing loss
        smoothing_loss = self.label_smoothing(pred, target)

        # Repetition penalty
        # Flatten input_seq to a 1D tensor for bincount
        flattened_input = input_seq.flatten()
        # Mask padding tokens (assume padding token index is 0)
        valid_tokens = flattened_input[flattened_input > 0]
        # Count occurrences of valid tokens
        token_counts = valid_tokens.bincount(minlength=pred.size(-1)).float()
        repetition_loss = torch.sum(token_counts ** 2) * self.repetition_penalty

        # Length penalty
        seq_length = input_seq.size(1)
        length_loss = seq_length * self.length_penalty

        return smoothing_loss + repetition_loss + length_loss


criterion = AugmentedLoss(vocab_size, smoothing=0.1, repetition_penalty=repetition_penalty, length_penalty=length_penalty)
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=gamma)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = batch[:, :-1]
        targets = batch[:, 1:].contiguous().view(-1)
        outputs = model(inputs).view(-1, vocab_size)
        loss = criterion(outputs, targets, inputs)
        loss.backward()
        optimizer.step()
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Generate synthetic URLs (uses http prefix only)
model.eval()
synthetic_urls = []
for _ in range(10):
    input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
    generated = []
    while True:
        seq_len = input_seq.size(1)
        if seq_len > 512:  # Limit sequence length dynamically
            break
        output = model(input_seq)
        probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
        next_token = torch.multinomial(probabilities, num_samples=1).item()
        if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
            break
        generated.append(next_token)
        input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
    synthetic_urls.append(decode_tokens([idx_to_token[idx] for idx in generated]))

# Print synthetic URLs
print("\nSynthetic URLs:")
for url in synthetic_urls:
    print(url)

In [10]:
### make more
#synthetic_urls = []
def gen_urls(n):
    for _ in range(n):
        input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
        generated = []
        while True:
            seq_len = input_seq.size(1)
            if seq_len > 512:  # Limit sequence length dynamically
                break
            output = model(input_seq)
            probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
                break
            generated.append(next_token)
            input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
        synthetic_urls.append('http' + decode_tokens([idx_to_token[idx] for idx in generated]))

    for _ in range(n):
        input_seq = torch.tensor([[token_to_idx["https"]]], dtype=torch.long)
        generated = []
        while True:
            seq_len = input_seq.size(1)
            if seq_len > 512:  # Limit sequence length dynamically
                break
            output = model(input_seq)
            probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
                break
            generated.append(next_token)
            input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
        synthetic_urls.append('https' + decode_tokens([idx_to_token[idx] for idx in generated]))


In [11]:
synthetic_urls = []
gen_urls(1000)

print("\nSynthetic URLs:")
for url in synthetic_urls:
    print(url)


Synthetic URLs:
httptriposo:/nc.comcolacompany.ossm-nimars20ridlt
http:/handball.net/s?lang=openid&za/t3109477phone&question=1055
http://www.jsfour.com/
http://www.xgaytube.com/centralkamicemc/full--sanjuancollegenintendos-291-with-cast-adapter-the-ntu-map-binge-drinking-and-the-pfsense/
http://eudesign.com/mnems/portstar.htm
http://www.instructables.com/id/hqpsbrgd/com/
httpassetSerialkenalicesafes/walnutcreekcdrom-bit.aspx
http://home.ubalt.edu/abento/~matt/&gg/4711_121_The__Networking_Bodyguard.pdf
http://www.wikiwand.com/en/Network_us
http://www.yourchords.com/domain-rescue
http://to%.co.uk
http://nabid.weebly.comteploteka.html
http://downloadsyndrome.lounge/cott.courses
http://www.itgbrands.yourdictionary/word/word2use/clipart_calculator
http:partnersinbuilding.sort-pazar.com/
httpict/eplarrenbind_.blogspotkiwokoccmb
http://www.latest-of-piano-lessons.com/piano-lessons.html
http://en-psbooks.blogspot.ruengineeropentable/
httpace:/wwwcentral.mappery.com/rocessing%7088g/Web_gateway

In [12]:
df_out = pd.DataFrame({'url':synthetic_urls})

output_filepath = 'outputs/testnn_UrlOutputs-AugmentedLoss_RP=' + str(repetition_penalty) + '_LP=' + str(length_penalty) + '__EmbedSize=' + str(embed_size) + '_NumHeads=' + str(num_heads) + '_NumLayers=' + str(num_layers) + '_HiddenDims=' + str(hidden_dim) + '_Epochs=' + str(num_epochs) + '.csv'

df_out.to_csv(output_filepath, index=False)

## iter 2.1
De-parameterize number tokens, try to get network to learn more rules, e.g. less weird www/com/org placements

Add attention mask

Works passably well; leaving it alone because don't want to mess it up

In [1]:
### INIT & TOKENIZE

import pandas as pd
import re

# load data
file_path = 'data/legit_urls_trimmed.csv'
data = pd.read_csv(file_path)

# Assuming the column containing URLs is named 'URL'
if 'URL' not in data.columns:
    raise ValueError("Expected a column named 'URL' in the CSV file.")

urls = data['URL'].dropna().tolist()

# Ensure <eos> token is added to the vocabulary and to each tokenized URL
EOS_TOKEN = '<eos>'

# Tokenize URLs
def tokenize_url(url):
    """
    Tokenizes a URL by splitting at special characters and preserving essential elements.

    Args:
        url (str): The URL to tokenize.

    Returns:
        list: A list of tokens from the URL.
    """
    # Regex to split on special characters like ., /, :, and ? but retain them as tokens
    tokens = re.split(r'([.:/?=&+-_])', url)
    # Remove empty tokens and strip whitespace
    return [token for token in tokens if token.strip()] + [EOS_TOKEN]

# Apply tokenization to the list of URLs
tokenized_urls = [tokenize_url(url) for url in urls]

# Preview the tokenized output
for i, tokens in enumerate(tokenized_urls[:5]):
    print(f"URL {i+1}: {tokens}")

# Save the tokenized output to a new CSV file for verification
tokenized_df = pd.DataFrame({'url': urls, 'tokens': tokenized_urls})
tokenized_df.to_csv('tokenized_urls.csv', index=False)

URL 1: ['http', ':', '/', '/', 'aasanetidende', '.', 'no', '<eos>']
URL 2: ['http', ':', '/', '/', 'abc', '.', 'go', '.', 'com', '/', 'schedule', '<eos>']
URL 3: ['http', ':', '/', '/', 'abouthappybirthday', '.', 'blogspot', '.', 'com', '<eos>']
URL 4: ['http', ':', '/', '/', 'academlib', '.', 'com', '/', '2', '6', '7', '1', '5', '/', 'computer', '_', 'science', '/', 'firewall', '<eos>']
URL 5: ['http', ':', '/', '/', 'acqalma', '.', 'az', '<eos>']


In [2]:
### BUILD TRANSFORMER NET

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

def decode_tokens(tokens):
    return ''.join([token for token in tokens if token != EOS_TOKEN])

# Vocabulary and encoding
unique_tokens = list(set(token for tokens in tokenized_urls for token in tokens))

if EOS_TOKEN not in unique_tokens:
    unique_tokens.append(EOS_TOKEN)

token_to_idx = {token: idx for idx, token in enumerate(unique_tokens)}
idx_to_token = {idx: token for token, idx in token_to_idx.items()}

# Encode tokenized URLs
encoded_urls = [[token_to_idx[token] for token in tokens] for tokens in tokenized_urls]

class URLDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.long)

# Dataset and DataLoader
dataset = URLDataset(encoded_urls)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0))

# Transformer Model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, embed_size))  # Max length 512 assumed
        encoder_layer = nn.TransformerEncoderLayer(embed_size, num_heads, hidden_dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def generate_attention_mask(self, seq_len):
        """
        Generate an attention mask to ensure no cross-URL attention.

        Args:
            seq_len (int): Length of the sequences.

        Returns:
            torch.Tensor: Attention mask of shape [seq_len, seq_len].
        """
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
        return mask.to(dtype=torch.bool)

    def forward(self, x):
        batch_size, seq_len = x.size()
        if seq_len > self.positional_encoding.size(1):
            raise ValueError(f"Sequence length {seq_len} exceeds positional encoding size {self.positional_encoding.size(1)}")
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        attention_mask = self.generate_attention_mask(seq_len).to(x.device)
        x = self.transformer(x.transpose(0, 1), mask=attention_mask).transpose(0, 1)
        x = self.fc(x)
        return x



# Model hyperparameters
vocab_size = len(token_to_idx)
embed_size = 64
num_heads = 4
num_layers = 2
hidden_dim = 128
num_epochs = 20

repetition_penalty = 0.001
length_penalty = 0.00001
gamma = 0.5

model = URLTransformer(vocab_size, embed_size, num_heads, num_layers, hidden_dim)

# Implement label smoothing in loss function
class LabelSmoothingLoss(nn.Module):
    def __init__(self, vocab_size, smoothing=0.1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.vocab_size = vocab_size

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=-1)
        true_dist = torch.zeros_like(pred)
        true_dist.fill_(self.smoothing / (self.vocab_size - 1))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=-1))

# Add penalty for repetition and excessive length to the loss function
class AugmentedLoss(nn.Module):
    def __init__(self, vocab_size, smoothing=0.1, repetition_penalty=0.1, length_penalty=0.01):
        super(AugmentedLoss, self).__init__()
        self.label_smoothing = LabelSmoothingLoss(vocab_size, smoothing)
        self.repetition_penalty = repetition_penalty
        self.length_penalty = length_penalty

    def forward(self, pred, target, input_seq):
        # Standard label smoothing loss
        smoothing_loss = self.label_smoothing(pred, target)

        # Repetition penalty
        # Flatten input_seq to a 1D tensor for bincount
        flattened_input = input_seq.flatten()
        # Mask padding tokens (assume padding token index is 0)
        valid_tokens = flattened_input[flattened_input > 0]
        # Count occurrences of valid tokens
        token_counts = valid_tokens.bincount(minlength=pred.size(-1)).float()
        repetition_loss = torch.sum(token_counts ** 2) * self.repetition_penalty

        # Length penalty
        seq_length = input_seq.size(1)
        length_loss = seq_length * self.length_penalty

        return smoothing_loss + repetition_loss + length_loss


criterion = AugmentedLoss(vocab_size, smoothing=0.1, repetition_penalty=repetition_penalty, length_penalty=length_penalty)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=gamma)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = batch[:, :-1]
        targets = batch[:, 1:].contiguous().view(-1)
        outputs = model(inputs).view(-1, vocab_size)
        loss = criterion(outputs, targets, inputs)
        loss.backward()
        optimizer.step()
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Generate synthetic URLs (uses http prefix only)
model.eval()
synthetic_urls = []
for _ in range(10):
    input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
    generated = []
    while True:
        seq_len = input_seq.size(1)
        if seq_len > 512:  # Limit sequence length dynamically
            break
        output = model(input_seq)
        probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
        next_token = torch.multinomial(probabilities, num_samples=1).item()
        if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
            break
        generated.append(next_token)
        input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
    synthetic_urls.append(decode_tokens([idx_to_token[idx] for idx in generated]))

# Print synthetic URLs
print("\nSynthetic URLs:")
for url in synthetic_urls:
    print(url)



Epoch 1/20, Loss: 5.2517
Epoch 2/20, Loss: 4.8024
Epoch 3/20, Loss: 6.4406
Epoch 4/20, Loss: 4.9725
Epoch 5/20, Loss: 5.7670
Epoch 6/20, Loss: 4.7332
Epoch 7/20, Loss: 5.5585
Epoch 8/20, Loss: 5.3885
Epoch 9/20, Loss: 5.2447
Epoch 10/20, Loss: 4.6599
Epoch 11/20, Loss: 6.2395
Epoch 12/20, Loss: 4.6773
Epoch 13/20, Loss: 5.4415
Epoch 14/20, Loss: 5.7235
Epoch 15/20, Loss: 4.4367
Epoch 16/20, Loss: 4.7726
Epoch 17/20, Loss: 4.9541
Epoch 18/20, Loss: 4.3393
Epoch 19/20, Loss: 5.0305
Epoch 20/20, Loss: 7.1210

Synthetic URLs:
://xbooksdah.comemeglobal
://www.dummies.org/as-control-event.html
://www.wikihowcscw.com/Account.htm
://www.corporateoffice-resources/
://www.cafedelabourse.net/quest/files/internet.cfm
karistelefonoptiscanlepota.contractwanken.com/definition/articlesheets/Onternational_TDR_tennis
://www.great.com/en/nit-synthesis/fonts
://www.acute.lv/
://infohot.tumblr.com
://testinternetspeed1605-animal


In [5]:
## analysis: length (in tokens) of training data
import numpy as np
import matplotlib.pyplot as plt

lengths = np.empty(len(tokenized_urls))

for i, tokenlist in enumerate(tokenized_urls):
    lengths[i] = len(tokenlist)

In [None]:
# enforce very few tokens
model.eval()
synthetic_urls = []
for _ in range(10):
    input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
    generated = []
    while True:
        seq_len = input_seq.size(1)
        if seq_len > 10:  # Limit sequence length dynamically
            break
        output = model(input_seq)
        probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
        next_token = torch.multinomial(probabilities, num_samples=1).item()
        if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
            break
        generated.append(next_token)
        input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
    synthetic_urls.append('http' + decode_tokens([idx_to_token[idx] for idx in generated]))

# Print synthetic URLs
print("\nSynthetic URLs:")
for url in synthetic_urls:
    print(url)

In [4]:
### make more
#synthetic_urls = []
def gen_urls(n):
    for _ in range(n):
        input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
        generated = []
        while True:
            seq_len = input_seq.size(1)
            if seq_len > 512:  # Limit sequence length dynamically
                break
            output = model(input_seq)
            probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
                break
            generated.append(next_token)
            input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
        synthetic_urls.append('http' + decode_tokens([idx_to_token[idx] for idx in generated]))

    for _ in range(n):
        input_seq = torch.tensor([[token_to_idx["https"]]], dtype=torch.long)
        generated = []
        while True:
            seq_len = input_seq.size(1)
            if seq_len > 512:  # Limit sequence length dynamically
                break
            output = model(input_seq)
            probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            if next_token == token_to_idx[EOS_TOKEN]:  # Stop if <eos>
                break
            generated.append(next_token)
            input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
        synthetic_urls.append('https' + decode_tokens([idx_to_token[idx] for idx in generated]))


In [5]:
synthetic_urls = []
gen_urls(1000)

print("\nSynthetic URLs:")
for url in synthetic_urls:
    print(url)


Synthetic URLs:
http://www.aaaspell.com/
http://wwwhioxmlgrid.nhs.uk/electric
http://www.yourdictionary.com/categories/rush-domain-tickets-the-dna-qnimate-1fMultipathalerts-cable
http://www.anadian.com/securityumber%/c_climbingweather/2019twelve/10.htm
http:/supportline-russeprofilering-ot.tumblr.tumblr.com/#_=_
http://www.gametorrentzone.com/
http://xpdrivers.edu.com/4searchsecurity-parallel-essentials-sale-healthy/
http://charlotte.tumblr.com
http://www.iracing.com/us/rescue/courses/catalogs/related-graphic-providers.html
http://www.kronostmroom.com/store/impact_to_icon.html
http://housing.techtarget.comvote
http://www.xbooks.com/
http://www.comparebroadband.co.uk/cs/computer/subnet-panel-lessons-mickey.html
http://www.simplyhired.edu/history/cs/Salary.asp
http://searchnetworkingducicge.careerplanner.com/windows
http://softloads.ru/www.cfm
http://www.helpingwithmath.com/australia-federteep/restricted/excel/india-alternative37usacomputerstore/indextwitter-faq-bandwidth.hotography-for

In [6]:
df_out = pd.DataFrame({'url':synthetic_urls})

output_filepath = 'outputs/UrlOutputs-AugmentedLoss_RP=' + str(repetition_penalty) + '_LP=' + str(length_penalty) + '__EmbedSize=' + str(embed_size) + '_NumHeads=' + str(num_heads) + '_NumLayers=' + str(num_layers) + '_HiddenDims=' + str(hidden_dim) + '_Epochs=' + str(num_epochs) + '.csv'

df_out.to_csv(output_filepath, index=False)

In [7]:
# Save the model
model_save_path = 'url_transformer_model.pth'
save_data = {
    "model_state_dict": model.state_dict(),
    "vocab_size": vocab_size,
    "embed_size": embed_size,
    "num_heads": num_heads,
    "num_layers": num_layers,
    "hidden_dim": hidden_dim,
    "token_to_idx": token_to_idx,
    "idx_to_token": idx_to_token,
    "repetition_penalty": repetition_penalty,
    "length_penalty": length_penalty
}
torch.save(save_data, model_save_path)
print(f"Model saved to {model_save_path}")

# Function to load the model
def load_model(filepath):
    checkpoint = torch.load(filepath)
    model = URLTransformer(
        vocab_size=checkpoint["vocab_size"],
        embed_size=checkpoint["embed_size"],
        num_heads=checkpoint["num_heads"],
        num_layers=checkpoint["num_layers"],
        hidden_dim=checkpoint["hidden_dim"]
    )
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    print(f"Model loaded from {filepath}")
    return model, checkpoint["token_to_idx"], checkpoint["idx_to_token"]


Model saved to url_transformer_model.pth


## iter 2.0

New (kaggle) data with subdomains; new tokenization regime

In [None]:
### INIT & TOKENIZE

import pandas as pd
import re

# load data
file_path = 'legit_urls_trimmed.csv'
data = pd.read_csv(file_path)

# Assuming the column containing URLs is named 'URL'
if 'URL' not in data.columns:
    raise ValueError("Expected a column named 'URL' in the CSV file.")

urls = data['URL'].dropna().tolist()

def tokenize_url(url):
    """
    Tokenizes a URL by splitting at special characters and preserving essential elements.

    Args:
        url (str): The URL to tokenize.

    Returns:
        list: A list of tokens from the URL.
    """
    # Regex to split on special characters like ., /, :, and ? but retain them as tokens
    # incl. +, _, -
    tokens = re.split(r'([.:/?=&+-_])', url)
    # Remove empty tokens and strip whitespace
    return [token for token in tokens if token.strip()]

# Apply tokenization to the list of URLs
tokenized_urls = [tokenize_url(url) for url in urls]

# Preview the tokenized output
for i, tokens in enumerate(tokenized_urls[:5]):
    print(f"URL {i+1}: {tokens}")

# Save the tokenized output to a new CSV file for verification
tokenized_df = pd.DataFrame({'url': urls, 'tokens': tokenized_urls})
tokenized_df.to_csv('tokenized_urls.csv', index=False)


URL 1: ['http', ':', '/', '/', 'aasanetidende', '.', 'no']
URL 2: ['http', ':', '/', '/', 'abc', '.', 'go', '.', 'com', '/', 'schedule']
URL 3: ['http', ':', '/', '/', 'abouthappybirthday', '.', 'blogspot', '.', 'com']
URL 4: ['http', ':', '/', '/', 'academlib', '.', 'com', '/', '2', '6', '7', '1', '5', '/', 'computer', '_', 'science', '/', 'firewall']
URL 5: ['http', ':', '/', '/', 'acqalma', '.', 'az']


In [None]:
### BUILD TRANSFORMER NET

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

def decode_tokens(tokens):
    return ''.join(tokens)

# Vocabulary and encoding
unique_tokens = list(set(token for tokens in tokenized_urls for token in tokens))
token_to_idx = {token: idx for idx, token in enumerate(unique_tokens)}
idx_to_token = {idx: token for token, idx in token_to_idx.items()}

# Encode tokenized URLs
encoded_urls = [[token_to_idx[token] for token in tokens] for tokens in tokenized_urls]

class URLDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.long)

# Dataset and DataLoader
dataset = URLDataset(encoded_urls)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0))

# Transformer Model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_size))
        encoder_layer = nn.TransformerEncoderLayer(embed_size, num_heads, hidden_dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        x = self.fc(x)
        return x

# Model hyperparameters
vocab_size = len(token_to_idx)
embed_size = 64
num_heads = 4
num_layers = 2
hidden_dim = 128
max_len = max(len(tokens) for tokens in encoded_urls)

model = URLTransformer(vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = batch[:, :-1]
        targets = batch[:, 1:].contiguous().view(-1)
        outputs = model(inputs).view(-1, vocab_size)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


synthetic_urls = []

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## get stats for distribution of # tokens in data
lengths = np.empty(len(tokenized_urls))

for i, tokenlist in enumerate(tokenized_urls):
    lengths[i] = len(tokenlist)

: 

In [None]:
### Generate synthetic URLs

# (for http:// only)
model.eval()
synthetic_urls = []
for _ in range(5):
    input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
    generated = []

    # generate length of URL

    for _ in range(max_len):
        output = model(input_seq)
        probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
        next_token = torch.multinomial(probabilities, num_samples=1).item()
        generated.append(next_token)
        if next_token == 0:
            break
        input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
    synthetic_urls.append('http' + decode_tokens([idx_to_token[idx] for idx in generated]))


# (for https:// only)
for _ in range(5):
    input_seq = torch.tensor([[token_to_idx["https"]]], dtype=torch.long)
    generated = []
    for _ in range(max_len):
        output = model(input_seq)
        probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
        next_token = torch.multinomial(probabilities, num_samples=1).item()
        generated.append(next_token)
        if next_token == 0:
            break
        input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
    synthetic_urls.append('https' + decode_tokens([idx_to_token[idx] for idx in generated]))

# Print synthetic URLs
print("\nSynthetic URLs:")
for url in synthetic_urls:
    print(url)


Synthetic URLs:
http://www.outcoast.org/everything/wikitravel-up-menu/access.org/en.facebook.eliteprospects.com/www.optionsxpress/www.org/www.com.org//fashion/xhamster.com/www.wikipedia.pro.dictionary.baj-android.com/www.pdx-amazon-the-cheap.blogspot.com/ru/www.com/moriwajin/www.com/huggybearheadcanon-memory-Player3-nutriticsnbc.com/www.com/www.com/www.in-listener-anyone+/www.reliancestaffing-help.org//www.com/www.fr/www.careerbuilder200Pch=linza-guide.wordpress.com.templestudy/www.com/www.kln-DFnd/ru/en.net/www.wikipedia.com/www.uk/www.com//forum.wordpress.com/hardware.lostnfound.com/www.com/myactivityk.com.com/fav-portable.com/www.com/ru/www.blogspot.com/www.gob.me.com/www.com.com/en.org/www.org/setuprouter.com/techteach.it.tumblr.com/www.prepressure-in-amazing.weeblywiki/click.quora.clarias.facebook.com/state4501076BndI1%:/www.gr/www.com/www.com/forums.com/www.wordpress.ietf.blogspot.com/www.comparison.blogspot.com/www.nl//null.com/www.es/www.jobs.com//my.ac.tcbk/www.wikipedia.cz.c

In [None]:
synthetic_urls = []

In [31]:
### make more
#synthetic_urls = []
def gen_urls(n):
    for _ in range(n):
        input_seq = torch.tensor([[token_to_idx["http"]]], dtype=torch.long)
        generated = []
        for _ in range(max_len):
            output = model(input_seq)
            probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            generated.append(next_token)
            if next_token == 0:
                break
            input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
        synthetic_urls.append('http' + decode_tokens([idx_to_token[idx] for idx in generated]))

    for _ in range(n):
        input_seq = torch.tensor([[token_to_idx["https"]]], dtype=torch.long)
        generated = []
        for _ in range(max_len):
            output = model(input_seq)
            probabilities = nn.functional.softmax(output[:, -1, :], dim=-1).squeeze()
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            generated.append(next_token)
            if next_token == 0:
                break
            input_seq = torch.cat([input_seq, torch.tensor([[next_token]], dtype=torch.long)], dim=1)
        synthetic_urls.append('https' + decode_tokens([idx_to_token[idx] for idx in generated]))

In [30]:
df_out = pd.DataFrame({'url':synthetic_urls})

legits = []
for i in range(len(df_out.url)):
    legits.append('legitimate')

df_out['status'] = legits

df_out.to_csv('generated_synthetic_urls_legit-labeled.csv', index=False)

In [27]:
df_out

Unnamed: 0,url,status
0,http://www.benjerry.org/en.com/www.wikipedia.e...,legitimate
1,http://www.azlyrics.com/www.org/en.com/en.xfin...,legitimate
2,http://www.hp.com/www.com/au/holloweentwinkie/...,legitimate
3,http://www.sevenforums.com/directorA//www.com/...,legitimate
4,http://www.org//www.com//www.ubuntu.downloads....,legitimate
...,...,...
95,https://www.techguy.com/www.co.ru/www.ru/wiki/...,legitimate
96,https://www.babylon-tr/shell.com/www.com/www.w...,legitimate
97,https://www.youtube.net/www.com/download:/www....,legitimate
98,https://wordstream-transmission.cz.ping.com/pl...,legitimate


## iter 1.0
Domain name only

In [None]:
### not completed

## iter 4

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [3]:
import torch.cuda

In [4]:
# Utilize GPU if available
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


In [None]:
# Load the dataset
file_path = 'legit_urls.csv'  # Update the file path as needed
legit_urls_df = pd.read_csv(file_path)
urls = legit_urls_df['col_0'].astype(str).tolist()

# Split the dataset into training and validation sets
from sklearn.model_selection import train_test_split
train_urls, val_urls = train_test_split(urls, test_size=0.2, random_state=42)

# Build vocabulary
from collections import Counter

def simple_tokenize(url):
    return list(url)  # Tokenize URLs by character

counter = Counter()
for url in map(simple_tokenize, train_urls):
    counter.update(url)

chars = ['<pad>', '<sos>', '<eos>'] + sorted(counter.keys())
vocab = {char: idx for idx, char in enumerate(chars)}
reverse_vocab = {idx: char for char, idx in vocab.items()}

PAD_IDX = vocab['<pad>']
SOS_IDX = vocab['<sos>']
EOS_IDX = vocab['<eos>']

# Define sequence length
MAX_SEQ_LENGTH = 64

# Encode URLs

def decode_sequence(sequence, reverse_vocab):
    return ''.join(reverse_vocab[idx] for idx in sequence if idx not in {PAD_IDX, SOS_IDX, EOS_IDX})
def encode_url(url, vocab):
    tokens = ['<sos>'] + list(url) + ['<eos>']
    return [vocab[token] for token in tokens]

def pad_and_truncate(sequence, max_length, pad_value=PAD_IDX):
    sequence = sequence[:max_length]  # Truncate if too long
    return sequence + [pad_value] * (max_length - len(sequence))  # Pad if too short

train_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in train_urls]
val_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in val_urls]

# Define a custom Dataset class
class URLDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long)

# Define the Transformer model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, forward_expansion, max_length, dropout=0.1):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Embedding(max_length, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=forward_expansion * embed_size,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src_positions = torch.arange(0, src_seq_len).unsqueeze(0).to(src.device)
        tgt_positions = torch.arange(0, tgt_seq_len).unsqueeze(0).to(tgt.device)

        src_embedding = self.embedding(src) + self.positional_encoding(src_positions)
        tgt_embedding = self.embedding(tgt) + self.positional_encoding(tgt_positions)

        transformer_out = self.transformer(src_embedding, tgt_embedding)
        return self.fc_out(transformer_out)

# Define the training loop
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    total_batches = len(dataloader)

    for batch_idx, batch in enumerate(dataloader):
        batch = batch.to(device)
        src = batch[:, :-1]  # Input sequence (exclude last token)
        tgt = batch[:, 1:]  # Target sequence (exclude first token)

        optimizer.zero_grad()
        output = model(src, src)[:, :, :]  # Ensure output dimensions match target
        loss = criterion(output.reshape(-1, output.size(-1)), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Print progress every 10 batches
        if (batch_idx + 1) % 10 == 0:
            print(f"Batch {batch_idx + 1}/{total_batches}, Loss: {loss.item():.4f}")

    return epoch_loss / total_batches
    model.train()
    epoch_loss = 0

    for batch in dataloader:
        batch = batch.to(device)
        src = batch[:, :-1]  # Input sequence (exclude last token)
        tgt = batch[:, 1:]  # Target sequence (exclude first token)

        optimizer.zero_grad()
        output = model(src, src)[:, :, :]  # Ensure output dimensions match target
        loss = criterion(output.reshape(-1, output.size(-1)), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 128
NUM_HEADS = 8
NUM_LAYERS = 4
FORWARD_EXPANSION = 4
MAX_LENGTH = MAX_SEQ_LENGTH
DROPOUT = 0.1
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.001

# Create DataLoader
train_dataset = URLDataset(train_sequences)
val_dataset = URLDataset(val_sequences)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Initialize model, optimizer, and loss function
model = URLTransformer(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    forward_expansion=FORWARD_EXPANSION,
    max_length=MAX_LENGTH,
    dropout=DROPOUT
).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Training loop
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model, train_loader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}")

# Generation function
def generate_url(model, start_token, max_length, vocab, reverse_vocab, device):
    model.eval()
    generated = [vocab[start_token]]

    for _ in range(max_length):
        src = torch.tensor([generated], dtype=torch.long).to(device)
        output = model(src, src)
        next_token = output.argmax(dim=-1)[:, -1].item()

        if next_token == EOS_IDX:
            break

        generated.append(next_token)

    return decode_sequence(generated, reverse_vocab)

# Example usage
start_token = '<sos>'
synthetic_url = generate_url(model, start_token, MAX_SEQ_LENGTH, vocab, reverse_vocab, DEVICE)
print(f"Generated URL: {synthetic_url}")


## iter 3

In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the dataset
file_path = 'legit_urls.csv'  # Update the file path as needed
legit_urls_df = pd.read_csv(file_path)
urls = legit_urls_df['col_0'].astype(str).tolist()

# Split the dataset into training and validation sets
from sklearn.model_selection import train_test_split
train_urls, val_urls = train_test_split(urls, test_size=0.2, random_state=42)

# Build vocabulary
from collections import Counter

def simple_tokenize(url):
    return list(url)  # Tokenize URLs by character

counter = Counter()
for url in map(simple_tokenize, train_urls):
    counter.update(url)

chars = ['<pad>', '<sos>', '<eos>'] + sorted(counter.keys())
vocab = {char: idx for idx, char in enumerate(chars)}
reverse_vocab = {idx: char for char, idx in vocab.items()}

PAD_IDX = vocab['<pad>']
SOS_IDX = vocab['<sos>']
EOS_IDX = vocab['<eos>']

# Define sequence length
MAX_SEQ_LENGTH = 64

# Encode URLs
def encode_url(url, vocab):
    tokens = ['<sos>'] + list(url) + ['<eos>']
    return [vocab[token] for token in tokens]

def pad_and_truncate(sequence, max_length, pad_value=PAD_IDX):
    sequence = sequence[:max_length]  # Truncate if too long
    return sequence + [pad_value] * (max_length - len(sequence))  # Pad if too short

train_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in train_urls]
val_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in val_urls]

# Define a custom Dataset class
class URLDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long)

# Define the Transformer model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, forward_expansion, max_length, dropout=0.1):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Embedding(max_length, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=forward_expansion * embed_size,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src_positions = torch.arange(0, src_seq_len).unsqueeze(0).to(src.device)
        tgt_positions = torch.arange(0, tgt_seq_len).unsqueeze(0).to(tgt.device)

        src_embedding = self.embedding(src) + self.positional_encoding(src_positions)
        tgt_embedding = self.embedding(tgt) + self.positional_encoding(tgt_positions)

        transformer_out = self.transformer(src_embedding, tgt_embedding)
        return self.fc_out(transformer_out)

# Define the training loop
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0

    for batch in dataloader:
        batch = batch.to(device)
        src = batch[:, :-1]  # Input sequence (exclude last token)
        tgt = batch[:, 1:]  # Target sequence (exclude first token)

        optimizer.zero_grad()
        output = model(src, src)[:, :, :]  # Ensure output dimensions match target
        loss = criterion(output.reshape(-1, output.size(-1)), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 128
NUM_HEADS = 8
NUM_LAYERS = 4
FORWARD_EXPANSION = 4
MAX_LENGTH = MAX_SEQ_LENGTH
DROPOUT = 0.1
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create DataLoader
train_dataset = URLDataset(train_sequences)
val_dataset = URLDataset(val_sequences)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Initialize model, optimizer, and loss function
model = URLTransformer(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    forward_expansion=FORWARD_EXPANSION,
    max_length=MAX_LENGTH,
    dropout=DROPOUT
).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Training loop
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model, train_loader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}")

# Generation function
def generate_url(model, start_token, max_length, vocab, reverse_vocab, device):
    model.eval()
    generated = [vocab[start_token]]

    for _ in range(max_length):
        src = torch.tensor([generated], dtype=torch.long).to(device)
        output = model(src, src)
        next_token = output.argmax(dim=-1)[:, -1].item()

        if next_token == EOS_IDX:
            break

        generated.append(next_token)

    return decode_sequence(generated, reverse_vocab)

# Example usage
start_token = '<sos>'
synthetic_url = generate_url(model, start_token, MAX_SEQ_LENGTH, vocab, reverse_vocab, DEVICE)
print(f"Generated URL: {synthetic_url}")


KeyboardInterrupt: 

## iter 2

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the dataset
file_path = 'legit_urls.csv'  # Update the file path as needed
legit_urls_df = pd.read_csv(file_path)
urls = legit_urls_df['col_0'].astype(str).tolist()

# Split the dataset into training and validation sets
from sklearn.model_selection import train_test_split
train_urls, val_urls = train_test_split(urls, test_size=0.2, random_state=42)

# Build vocabulary
from collections import Counter

def simple_tokenize(url):
    return list(url)  # Tokenize URLs by character

counter = Counter()
for url in map(simple_tokenize, train_urls):
    counter.update(url)

chars = ['<pad>', '<sos>', '<eos>'] + sorted(counter.keys())
vocab = {char: idx for idx, char in enumerate(chars)}
reverse_vocab = {idx: char for char, idx in vocab.items()}

PAD_IDX = vocab['<pad>']
SOS_IDX = vocab['<sos>']
EOS_IDX = vocab['<eos>']

# Define sequence length
MAX_SEQ_LENGTH = 64

# Encode URLs
def encode_url(url, vocab):
    tokens = ['<sos>'] + list(url) + ['<eos>']
    return [vocab[token] for token in tokens]

def pad_and_truncate(sequence, max_length, pad_value=PAD_IDX):
    sequence = sequence[:max_length]  # Truncate if too long
    return sequence + [pad_value] * (max_length - len(sequence))  # Pad if too short

train_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in train_urls]
val_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in val_urls]

# Define a custom Dataset class
class URLDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long)

# Define the Transformer model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, forward_expansion, max_length, dropout=0.1):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Embedding(max_length, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=forward_expansion * embed_size,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src_positions = torch.arange(0, src_seq_len).unsqueeze(0).to(src.device)
        tgt_positions = torch.arange(0, tgt_seq_len).unsqueeze(0).to(tgt.device)

        src_embedding = self.embedding(src) + self.positional_encoding(src_positions)
        tgt_embedding = self.embedding(tgt) + self.positional_encoding(tgt_positions)

        transformer_out = self.transformer(src_embedding, tgt_embedding)
        return self.fc_out(transformer_out)

# Define the training loop
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0

    for batch in dataloader:
        batch = batch.to(device)
        src = batch[:, :-1]  # Input sequence
        tgt = batch[:, 1:]  # Target sequence

        optimizer.zero_grad()
        output = model(src, src)[:, :-1, :]  # Ensure output dimensions match target
        loss = criterion(output.reshape(-1, output.size(-1)), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 128
NUM_HEADS = 8
NUM_LAYERS = 4
FORWARD_EXPANSION = 4
MAX_LENGTH = MAX_SEQ_LENGTH
DROPOUT = 0.1
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create DataLoader
train_dataset = URLDataset(train_sequences)
val_dataset = URLDataset(val_sequences)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Initialize model, optimizer, and loss function
model = URLTransformer(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    forward_expansion=FORWARD_EXPANSION,
    max_length=MAX_LENGTH,
    dropout=DROPOUT
).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Training loop
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model, train_loader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}")

# Generation function
def generate_url(model, start_token, max_length, vocab, reverse_vocab, device):
    model.eval()
    generated = [vocab[start_token]]

    for _ in range(max_length):
        src = torch.tensor([generated], dtype=torch.long).to(device)
        output = model(src, src)
        next_token = output.argmax(dim=-1)[:, -1].item()

        if next_token == EOS_IDX:
            break

        generated.append(next_token)

    return decode_sequence(generated, reverse_vocab)

# Example usage
start_token = '<sos>'
synthetic_url = generate_url(model, start_token, MAX_SEQ_LENGTH, vocab, reverse_vocab, DEVICE)
print(f"Generated URL: {synthetic_url}")


ValueError: Expected input batch_size (3968) to match target batch_size (4032).

## iter 1

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Load the dataset
file_path = 'legit_urls.csv'
legit_urls_df = pd.read_csv(file_path)
urls = legit_urls_df['col_0'].astype(str).tolist()

# Split the dataset into training and validation sets
from sklearn.model_selection import train_test_split
train_urls, val_urls = train_test_split(urls, test_size=0.2, random_state=42)

# Build vocabulary
from collections import Counter

def simple_tokenize(url):
    return list(url)  # Tokenize URLs by character

counter = Counter()
for url in map(simple_tokenize, train_urls):
    counter.update(url)

chars = ['<pad>', '<sos>', '<eos>'] + sorted(counter.keys())
vocab = {char: idx for idx, char in enumerate(chars)}
reverse_vocab = {idx: char for char, idx in vocab.items()}

PAD_IDX = vocab['<pad>']
SOS_IDX = vocab['<sos>']
EOS_IDX = vocab['<eos>']

# Define sequence length
MAX_SEQ_LENGTH = 64

# Encode URLs
def encode_url(url, vocab):
    tokens = ['<sos>'] + list(url) + ['<eos>']
    return [vocab[token] for token in tokens]

def pad_and_truncate(sequence, max_length, pad_value=PAD_IDX):
    sequence = sequence[:max_length]  # Truncate if too long
    return sequence + [pad_value] * (max_length - len(sequence))  # Pad if too short

train_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in train_urls]
val_sequences = [pad_and_truncate(encode_url(url, vocab), MAX_SEQ_LENGTH) for url in val_urls]

# Define a custom Dataset class
class URLDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long)

# Define the Transformer model
class URLTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, forward_expansion, max_length, dropout=0.1):
        super(URLTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Embedding(max_length, embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=forward_expansion * embed_size,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src_positions = torch.arange(0, src_seq_len).unsqueeze(0).to(src.device)
        tgt_positions = torch.arange(0, tgt_seq_len).unsqueeze(0).to(tgt.device)

        src_embedding = self.embedding(src) + self.positional_encoding(src_positions)
        tgt_embedding = self.embedding(tgt) + self.positional_encoding(tgt_positions)

        transformer_out = self.transformer(src_embedding, tgt_embedding)
        return self.fc_out(transformer_out)

# Define the training loop
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0

    for batch in dataloader:
        src = batch[:, :-1].to(device)
        tgt = batch[:, 1:].to(device)

        optimizer.zero_grad()
        output = model(src, src)[:, :-1]
        loss = criterion(output.reshape(-1, output.size(-1)), tgt.reshape(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 128
NUM_HEADS = 8
NUM_LAYERS = 4
FORWARD_EXPANSION = 4
MAX_LENGTH = MAX_SEQ_LENGTH
DROPOUT = 0.1
BATCH_SIZE = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create DataLoader
train_dataset = URLDataset(train_sequences)
val_dataset = URLDataset(val_sequences)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Initialize model, optimizer, and loss function
model = URLTransformer(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    forward_expansion=FORWARD_EXPANSION,
    max_length=MAX_LENGTH,
    dropout=DROPOUT
).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Training loop
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model, train_loader, optimizer, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}")

# Generation function
def generate_url(model, start_token, max_length, vocab, reverse_vocab, device):
    model.eval()
    generated = [vocab[start_token]]

    for _ in range(max_length):
        src = torch.tensor([generated], dtype=torch.long).to(device)
        output = model(src, src)
        next_token = output.argmax(dim=-1)[:, -1].item()

        if next_token == EOS_IDX:
            break

        generated.append(next_token)

    return decode_sequence(generated, reverse_vocab)

# Example usage
start_token = '<sos>'
synthetic_url = generate_url(model, start_token, MAX_SEQ_LENGTH, vocab, reverse_vocab, DEVICE)
print(f"Generated URL: {synthetic_url}")


ValueError: Expected input batch_size (3968) to match target batch_size (4032).

## iter 0