In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext, datasets, math
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# torch.cuda.get_device_name(0)

cuda


In [4]:
import tarfile
import os

# Define the path for the uploaded file
uploaded_file_path = 'swb1_dialogact_annot.tar.gz'
extracted_folder_path = 'swb1_extracted/'

# Create a directory for extraction if it doesn't exist
os.makedirs(extracted_folder_path, exist_ok=True)

# Extract the tar.gz file
with tarfile.open(uploaded_file_path, "r:gz") as tar:
    tar.extractall(path=extracted_folder_path)

# Check the contents of the extracted folder
extracted_files = os.listdir(extracted_folder_path)
extracted_files


  tar.extractall(path=extracted_folder_path)


['sw12utt',
 'sw03utt',
 'sw02utt',
 'sw06utt',
 'sw00utt',
 'sw09utt',
 'sw07utt',
 'sw05utt',
 'sw01utt',
 'sw08utt',
 'sw13utt',
 'doc',
 'README',
 'sw10utt',
 'sw04utt',
 'sw11utt']

In [11]:
import os
import pandas as pd

# Initialize a list to store data
data = []

# Function to process files recursively
def process_directory(directory_path):
    for item in os.listdir(directory_path):
        item_path = os.path.join(directory_path, item)
        
        # Check if it's a file or directory
        if os.path.isfile(item_path):  # Process only files
            with open(item_path, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f.readlines() if line.strip()]  # Remove empty lines
                data.extend(lines)
        elif os.path.isdir(item_path):  # Recursively process directories
            process_directory(item_path)

# Start processing from the extracted folder
process_directory(extracted_folder_path)

# Convert to DataFrame
if data:
    df = pd.DataFrame(data, columns=["text"])
    print(f"Dataset size: {len(df)}")
    print("First 5 rows:", df.head())
else:
    print("No data found in the files.")


Dataset size: 261133
First 5 rows:                                                 text
0  *x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x...
1  *x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x...
2  *x*                                           ...
3  *x*            Copyright (C) 1995 University o...
4  *x*                                           ...


In [29]:
# Convert the DataFrame to a plain dictionary with only text
text_dict = {idx: text for idx, text in enumerate(df['text'].tolist())}

# Check the first few entries
print("Sample from text dictionary:", list(text_dict.items())[:5])


Sample from text dictionary: [(0, '*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*'), (1, '*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*x*'), (2, '*x*                                                                     *x*'), (3, '*x*            Copyright (C) 1995 University of Pennsylvania            *x*'), (4, '*x*                                                                     *x*')]


In [30]:
from sklearn.model_selection import train_test_split

# Original text list
text_list = list(text_dict.values())  # Convert dictionary values to a list

# Perform train-test-validation split
train_texts, temp_texts = train_test_split(text_list, test_size=0.2, random_state=42)  # 80% train, 20% temp
validation_texts, test_texts = train_test_split(temp_texts, test_size=0.5, random_state=42)  # 10% validation, 10% test

# Create a new dictionary with the splits
text_dict = {
    'train': {idx: text for idx, text in enumerate(train_texts)},
    'validation': {idx: text for idx, text in enumerate(validation_texts)},
    'test': {idx: text for idx, text in enumerate(test_texts)}
}

# Verify the structure
print("Keys in text_dict:", text_dict.keys())
print("Sample train text:", text_dict['train'][0])


Keys in text_dict: dict_keys(['train', 'validation', 'test'])
Sample train text: sd          A.81 utt4:   I have always called them, - /


In [31]:
# # Shuffle and then select a random sample of 1000 rows from each split
# sampled_custom_dataset = {
#     'train': custom_dataset['train'].shuffle(seed=42).select(range(50000)),
#     'test': custom_dataset['test'].shuffle(seed=42).select(range(10000)),
#     'validation': custom_dataset['validation'].shuffle(seed=42).select(range(1000)),
# }

# # Check the size of the sampled data
# print(sampled_custom_dataset['train'])
# print(sampled_custom_dataset['validation'])
# print(sampled_custom_dataset['test'])

print(len(train_dict))
print(len(validation_dict))
print(len(test_dict))


208906
26113
26114


In [32]:
from torchtext.data.utils import get_tokenizer

# Initialize tokenizer
tokenizer = get_tokenizer('basic_english')

# Tokenize the dataset
tokenized_sampled_dataset = {}
for split in ['train', 'validation', 'test']:
    tokenized_sampled_dataset[split] = {
        idx: {'tokens': tokenizer(text)} for idx, text in text_dict[split].items()
    }

# Check tokenized samples
print("First tokenized training sample:", tokenized_sampled_dataset['train'][0])  # First training sample
print("First tokenized validation sample:", tokenized_sampled_dataset['validation'][0])  # First validation sample
print("First tokenized test sample:", tokenized_sampled_dataset['test'][0])  # First test sample


First tokenized training sample: {'tokens': ['sd', 'a', '.', '81', 'utt4', 'i', 'have', 'always', 'called', 'them', ',', '-', '/']}
First tokenized validation sample: {'tokens': ['fc', 'a', '.', '73', 'utt3', '{c', 'and', '}', '--']}
First tokenized test sample: {'tokens': ['fp', 'a', '.', '2', 'utt1', 'hello', '.', '/']}


In [34]:
from torchtext.vocab import build_vocab_from_iterator
import pickle

# Define an iterator to extract tokens
def yield_tokens(dataset_split):
    for entry in dataset_split.values():  # Iterate over the dictionary values
        yield entry['tokens']  # Extract the 'tokens' list for each entry

# Build vocabulary from the iterator
vocab = build_vocab_from_iterator(yield_tokens(tokenized_sampled_dataset['train']), min_freq=3)

# Add special tokens
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)

# Set default index for unknown tokens
vocab.set_default_index(vocab['<unk>'])

# Save the vocabulary
with open("vocab2.pkl", "wb") as f:
    pickle.dump(vocab, f)

print("Vocabulary saved to vocab2.pkl")

# Print information
print(len(vocab))  # Vocabulary size
print(vocab.get_itos()[:10])  # First 10 tokens in the vocabulary


Vocabulary saved to vocab2.pkl
10531
['<unk>', '<eos>', '.', ',', '/', 'a', 'b', '}', 'utt1', 'sd']


In [37]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset.values():  # Iterate over the values of the dictionary
        if 'tokens' in example:         
            # Append <eos> token to the sequence
            tokens = example['tokens'] + ['<eos>']   
            
            # Numericalize the tokens
            tokens = [vocab[token] for token in tokens] 
            
            # Add tokens to the data list
            data.extend(tokens)                                    
    
    # Convert to tensor
    data = torch.LongTensor(data)
    
    # Make sure the data is evenly divisible by batch size
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    
    # Reshape into [batch_size, num_batches]
    data = data.view(batch_size, -1)
    return data  # [batch_size, num_tokens_per_batch]


In [38]:
batch_size = 128
train_data = get_data(tokenized_sampled_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_sampled_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_sampled_dataset['test'], vocab, batch_size)

In [39]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

In [40]:
# Training

In [41]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3    

In [42]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 38,371,619 trainable parameters


In [43]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [44]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, bunch of tokens]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [45]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [46]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_phone_seq.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 39.857
	Valid Perplexity: 61.660


                                                           

	Train Perplexity: 14.703
	Valid Perplexity: 12.004


                                                           

	Train Perplexity: 12.800
	Valid Perplexity: 11.245


                                                           

	Train Perplexity: 12.042
	Valid Perplexity: 10.825


                                                           

	Train Perplexity: 11.538
	Valid Perplexity: 10.546


                                                           

	Train Perplexity: 11.172
	Valid Perplexity: 10.351


                                                           

	Train Perplexity: 10.881
	Valid Perplexity: 10.205


                                                           

	Train Perplexity: 10.648
	Valid Perplexity: 10.115


                                                           

	Train Perplexity: 10.453
	Valid Perplexity: 10.026


                                                           

	Train Perplexity: 10.280
	Valid Perplexity: 9.966


                                                           

	Train Perplexity: 10.124
	Valid Perplexity: 9.917


                                                           

	Train Perplexity: 9.986
	Valid Perplexity: 9.891


                                                           

	Train Perplexity: 9.868
	Valid Perplexity: 9.860


                                                           

	Train Perplexity: 9.751
	Valid Perplexity: 9.833


                                                           

	Train Perplexity: 9.648
	Valid Perplexity: 9.823


                                                           

	Train Perplexity: 9.558
	Valid Perplexity: 9.816


                                                           

	Train Perplexity: 9.476
	Valid Perplexity: 9.804


                                                           

	Train Perplexity: 9.392
	Valid Perplexity: 9.804


                                                           

	Train Perplexity: 9.200
	Valid Perplexity: 9.779


                                                           

	Train Perplexity: 9.124
	Valid Perplexity: 9.782


                                                           

	Train Perplexity: 9.016
	Valid Perplexity: 9.771


                                                           

	Train Perplexity: 8.973
	Valid Perplexity: 9.770


                                                           

	Train Perplexity: 8.917
	Valid Perplexity: 9.770


                                                           

	Train Perplexity: 8.887
	Valid Perplexity: 9.765


                                                           

	Train Perplexity: 8.870
	Valid Perplexity: 9.764


                                                           

	Train Perplexity: 8.851
	Valid Perplexity: 9.763


                                                           

	Train Perplexity: 8.843
	Valid Perplexity: 9.762


                                                           

	Train Perplexity: 8.841
	Valid Perplexity: 9.763


                                                           

	Train Perplexity: 8.835
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.834
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.837
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.837
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.834
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.834
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.830
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.831
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.835
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.836
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.838
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.836
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.833
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.831
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.831
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.834
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.830
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.835
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.832
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.834
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.831
	Valid Perplexity: 9.761


                                                           

	Train Perplexity: 8.830
	Valid Perplexity: 9.761


In [47]:
model.load_state_dict(torch.load('best-val-lstm_phone_seq.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 9.803


In [48]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [51]:
prompt = 'How are '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
how are you going to be able to do that . /

0.7
how are their , -/

0.75
how are their into the problem . /

0.8
how are their into the problem . /

1.0
how are their into the album . /



In [52]:
prompt = 'Where are '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
where are you going to be able to buy them . /

0.7
where are their , {f uh , } {f uh , } student and the other . /

0.75
where are their part , /

0.8
where are their into the problem you can do . /

1.0
where are their into the problem you put on and those . /



In [57]:
prompt = 'The country '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.2, 0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.2
the country is going to be a problem . /

0.5
the country is going to be a lot of money . /

0.7
the country is going to be a lot of money . /

0.75
the country is going into the hospital you can do . /

0.8
the country is going into the hospital you can do . /

1.0
the country is going into the hospital you put it within those once . /



In [58]:
prompt = 'She was in '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.2, 0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.2
she was in the middle of the house . /

0.5
she was in , {f uh , } the league of the city , /

0.7
she was in , {f uh , } ten years old . /

0.75
she was in , {f uh , } ten years old . /

0.8
she was in their own state /

1.0
she was in their own state /

