# Coding: Language Model based on LSTM

 - Dataset: WikiText-2
 - Model: LSTM/GRU
 - Optimizer: Adam
 - Learning Rate: 0.001
 - Batch Size: 64
 - Epochs: 10
 - Dropout: 0.2

In [4]:
# 可以先测试网络连接
import requests
try:
    response = requests.get("https://huggingface.co")
    print("网络连接正常")
except:
    print("网络连接可能存在问题")

网络连接正常


In [6]:
# Load WikiText-2 Dataset using Hugging Face datasets

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from collections import Counter
import re

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

# Load WikiText-2 dataset from Hugging Face
print("Loading WikiText-2 dataset from Hugging Face...")
dataset = load_dataset("wikitext", "wikitext-2-v1",cache_dir="D:\Developer\LLM\FuggingFace-cache-model")

# Access the splits
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Show a sample
# Find a non-empty sample
print(f"\nLooking for non-empty samples...")
sample_text = None
for i, example in enumerate(train_dataset):
    if example["text"].strip ():    # Find first non-empty line
        sample_text = example["text"]
        print(f"Found non-empty sample at index {i}")
        break
print(f"First sample: {sample_text[:200]}")

# Basic tokenization function
def basic_tokenize(text):
    # Convert to lowercase and split on whitespace and punctuation
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

# Build vocabulary from training data
print("\nBuilding vocabulary...")
all_tokens = []
non_empty_lines = 0
for example in train_dataset:
    if example['text'].strip():  # Skip empty lines
        tokens = basic_tokenize(example['text'])
        all_tokens.extend(tokens)
        non_empty_lines += 1

# Create vocabulary
vocab_counter = Counter(all_tokens)
vocab_size = len(vocab_counter)
print(f"Non-empty lines: {non_empty_lines}")
print(f"Total tokens: {len(all_tokens)}")
print(f"Vocabulary size: {vocab_size}")

# Create word to index mapping
vocab = {'<unk>': 0, '<pad>': 1}
for word, count in vocab_counter.most_common():
    if word not in vocab:
        vocab[word] = len(vocab)

# Create index to word mapping
idx_to_word = {idx: word for word, idx in vocab.items()}

print(f"Final vocabulary size (with special tokens): {len(vocab)}")
print(f"Most common words: {list(vocab_counter.most_common(10))}")

# Function to convert text to indices
def text_to_indices(text, vocab):
    tokens = basic_tokenize(text)
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

# Convert a sample to see the tokenization
sample_indices = text_to_indices(sample_text, vocab)
print(f"\nSample tokenization:")
print(f"Original: {sample_text[:100]}")
print(f"Tokens: {basic_tokenize(sample_text)[:20]}")
print(f"Indices: {sample_indices[:20]}")

print("\nDataset loaded successfully!")

  dataset = load_dataset("wikitext", "wikitext-2-v1",cache_dir="D:\Developer\LLM\FuggingFace-cache-model")


Using device: cpu
Loading WikiText-2 dataset from Hugging Face...
Train dataset size: 36718
Validation dataset size: 3760
Test dataset size: 4358

Looking for non-empty samples...
Found non-empty sample at index 1
First sample:  = Valkyria Chronicles III = 


Building vocabulary...
Non-empty lines: 23767
Total tokens: 1750345
Vocabulary size: 28710
Final vocabulary size (with special tokens): 28712
Most common words: [('the', 130768), ('of', 57030), ('unk', 54625), ('and', 50735), ('in', 45015), ('to', 39521), ('a', 36523), ('was', 21008), ('on', 15140), ('as', 15058)]

Sample tokenization:
Original:  = Valkyria Chronicles III = 

Tokens: ['valkyria', 'chronicles', 'iii']
Indices: [3767, 3830, 860]

Dataset loaded successfully!


In [7]:
# Data Preprocess & Init DataLoader

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

class WikiTextDataset(Dataset):
    def __init__(self, hf_dataset, vocab, sequence_length=50):
        """
        Custom Dataset for WikiText-2

        Args:
            hf_dataset: Hugging Face dataset split
            vocab: vocabulary dictionary
            sequence_length: length of input sequences
        """
        self.vocab = vocab
        self.sequence_length = sequence_length

        # Process all text and create sequences
        self.sequences = []
        self.targets = []

        print(f"Processing dataset with sequence length: {sequence_length}")

        # Concatenate all non-empty text and tokenize
        all_indices = []
        for example in hf_dataset:
            if example['text'].strip():  # Skip empty lines
                indices = text_to_indices(example['text'], vocab)
                all_indices.extend(indices)

        print(f"Total tokens in dataset: {len(all_indices)}")

        # Create sequences of specified length
        for i in range(len(all_indices) - sequence_length):
            # Input sequence
            seq = all_indices[i:i + sequence_length]
            # Target is the next token
            target = all_indices[i + sequence_length]

            self.sequences.append(seq)
            self.targets.append(target)

        print(f"Created {len(self.sequences)} sequences")

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.sequences[idx], dtype=torch.long),
            torch.tensor(self.targets[idx], dtype=torch.long)
        )

# Create datasets
sequence_length = 50
batch_size = 64

print("Creating training dataset...")
train_wiki_dataset = WikiTextDataset(train_dataset, vocab, sequence_length)

print("\nCreating validation dataset...")
val_wiki_dataset = WikiTextDataset(val_dataset, vocab, sequence_length)

print("\nCreating test dataset...")
test_wiki_dataset = WikiTextDataset(test_dataset, vocab, sequence_length)

# Create DataLoaders
train_loader = DataLoader(
    train_wiki_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0  # Set to 0 for compatibility
)

val_loader = DataLoader(
    val_wiki_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

test_loader = DataLoader(
    test_wiki_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

print(f"\nDataLoader Summary:")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")
print(f"Batch size: {batch_size}")
print(f"Sequence length: {sequence_length}")

# Test the DataLoader
for batch_idx, (sequences, targets) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print(f"  Sequences shape: {sequences.shape}")
    print(f"  Targets shape: {targets.shape}")
    print(f"  Sample sequence: {sequences[0][:10].tolist()}")
    print(f"  Sample target: {targets[0].item()}")

    # Convert back to words for verification
    sample_words = [idx_to_word[idx.item()] for idx in sequences[0]]
    target_word = idx_to_word[targets[0].item()]
    print(f"  Sample words: {sample_words}")
    print(f"  Target word: {target_word}")
    break  # Only show first batch

print("\nDataLoader created successfully!")

Creating training dataset...
Processing dataset with sequence length: 50
Total tokens in dataset: 1750345
Created 1750295 sequences

Creating validation dataset...
Processing dataset with sequence length: 50
Total tokens in dataset: 183606
Created 183556 sequences

Creating test dataset...
Processing dataset with sequence length: 50
Total tokens in dataset: 206381
Created 206331 sequences

DataLoader Summary:
Training batches: 27349
Validation batches: 2869
Test batches: 3224
Batch size: 64
Sequence length: 50
Batch 1:
  Sequences shape: torch.Size([64, 50])
  Targets shape: torch.Size([64])
  Sample sequence: [1635, 479, 3249, 40, 5913, 2468, 548, 91, 38, 2]
  Sample target: 3
  Sample words: ['organized', 'human', 'criminal', 'or', 'rebel', 'opposition', 'ten', 'years', 'after', 'the', 'official', 'end', 'of', 'the', 'zombie', 'war', 'millions', 'of', 'zombies', 'are', 'still', 'active', 'mainly', 'on', 'the', 'ocean', 'floor', 'or', 'on', 'snow', 'line', 'islands', 'a', 'democratic'