## FasText (Scractch) Experimental

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from collections import Counter, defaultdict
import numpy as np
import re
from typing import List, Dict, Tuple, Set

In [34]:
class SubwordTokenizer:
    def __init__(self, min_ngram: int = 3, max_ngram: int = 6):
        self.min_ngram = min_ngram
        self.max_ngram = max_ngram
        self.vocab = set()
        self.subword2idx = {'<PAD>': 0}  # Add padding token
        self.idx2subword = {0: '<PAD>'}
        self.word2subwords = defaultdict(list)
        
    def generate_subwords(self, word: str) -> List[str]:
        word = f"<{word}>"  # Add boundaries
        subwords = []
        
        # Add the word itself
        subwords.append(word)
        
        # Generate n-grams
        for n in range(self.min_ngram, self.max_ngram + 1):
            for i in range(len(word) - n + 1):
                subwords.append(word[i:i+n])
                
        return subwords
    
    def build_vocab(self, texts: List[str], min_freq: int = 5):
        # Count subword frequencies
        subword_counts = Counter()
        
        for text in texts:
            words = self._preprocess_text(text)
            for word in words:
                subwords = self.generate_subwords(word)
                subword_counts.update(subwords)
                self.word2subwords[word] = subwords
        
        # Filter by frequency and create vocabulary
        filtered_subwords = [sw for sw, count in subword_counts.items() 
                           if count >= min_freq]
        
        # Create mappings (starting from 1 since 0 is for padding)
        for idx, subword in enumerate(filtered_subwords, start=1):
            self.subword2idx[subword] = idx
            self.idx2subword[idx] = subword
            self.vocab.add(subword)
    
    def _preprocess_text(self, text: str) -> List[str]:
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.split()
    
    def encode_word(self, word: str) -> torch.Tensor:
        subwords = self.generate_subwords(word)
        indices = [self.subword2idx[sw] for sw in subwords 
                  if sw in self.subword2idx]
        if not indices:  # Handle unknown words
            return torch.tensor([0])  # Use padding index
        return torch.tensor(indices)

In [35]:
def collate_fn(batch):
    """Custom collate function to handle variable-length sequences"""
    # Separate target and context
    target_sequences = [item[0] for item in batch]
    context_sequences = [item[1] for item in batch]
    
    # Get lengths for packing
    target_lengths = torch.tensor([len(seq) for seq in target_sequences])
    context_lengths = torch.tensor([len(seq) for seq in context_sequences])
    
    # Pad sequences
    target_padded = pad_sequence(target_sequences, batch_first=True, padding_value=0)
    context_padded = pad_sequence(context_sequences, batch_first=True, padding_value=0)
    
    return (target_padded, target_lengths), (context_padded, context_lengths)

In [36]:
class FastTextDataset(Dataset):
    def __init__(self, texts: List[str], tokenizer: SubwordTokenizer, 
                 window_size: int = 5):
        self.tokenizer = tokenizer
        self.window_size = window_size
        self.data = []
        
        # Create training pairs
        for text in texts:
            words = tokenizer._preprocess_text(text)
            for i, target_word in enumerate(words):
                # Get context words within window
                context_indices = list(range(max(0, i - window_size), i)) + \
                                list(range(i + 1, min(len(words), i + window_size + 1)))
                
                target_subwords = tokenizer.encode_word(target_word)
                
                for j in context_indices:
                    context_subwords = tokenizer.encode_word(words[j])
                    self.data.append((target_subwords, context_subwords))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]


In [37]:
class FastTextModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(FastTextModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.output = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        # x shape: [batch_size, max_seq_len]
        # lengths shape: [batch_size]
        
        # Get embeddings
        embedded = self.embeddings(x)  # [batch_size, max_seq_len, embedding_dim]
        
        # Create mask for padding
        mask = (x != 0).float().unsqueeze(-1)  # [batch_size, max_seq_len, 1]
        
        # Apply mask and average
        masked_embedded = embedded * mask
        summed = masked_embedded.sum(dim=1)  # [batch_size, embedding_dim]
        averaged = summed / lengths.float().unsqueeze(1)  # [batch_size, embedding_dim]
        
        # Project to vocabulary size
        output = self.output(averaged)  # [batch_size, vocab_size]
        return output

In [38]:
def train_fasttext(model: FastTextModel, 
                  train_loader: DataLoader,
                  num_epochs: int,
                  learning_rate: float = 0.001,
                  device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        total_loss = 0
        model.train()
        
        for batch_idx, ((target_words, target_lengths), (context_words, context_lengths)) in enumerate(train_loader):
            target_words = target_words.to(device)
            context_words = context_words.to(device)
            target_lengths = target_lengths.to(device)
            context_lengths = context_lengths.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass
            output = model(target_words, target_lengths)
            
            # Calculate target for loss
            context_target = torch.argmax(model(context_words, context_lengths), dim=1)
            
            # Calculate loss
            loss = criterion(output, context_target)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


In [39]:
def fine_tune_fasttext(model: FastTextModel,
                      fine_tune_texts: List[str],
                      tokenizer: SubwordTokenizer,
                      num_epochs: int = 5,
                      learning_rate: float = 0.0001):
    """Fine-tune the model on domain-specific data"""
    
    # Create fine-tuning dataset
    fine_tune_dataset = FastTextDataset(fine_tune_texts, tokenizer)
    fine_tune_loader = DataLoader(
        fine_tune_dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn
    )
    
    # Use smaller learning rate for fine-tuning
    train_fasttext(model, fine_tune_loader, num_epochs, learning_rate)

In [40]:
def get_word_vector(model: FastTextModel,
                   word: str,
                   tokenizer: SubwordTokenizer,
                   device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    """Get the vector representation of a word"""
    
    model.eval()
    with torch.no_grad():
        subwords = tokenizer.encode_word(word).unsqueeze(0).to(device)
        lengths = torch.tensor([len(subwords[0])]).to(device)
        embedded = model.embeddings(subwords)
        mask = (subwords != 0).float().unsqueeze(-1)
        masked_embedded = embedded * mask
        return torch.sum(masked_embedded, dim=1) / lengths.float().unsqueeze(1)


In [41]:
def find_similar_products(model: FastTextModel,
                        query: str,
                        product_texts: List[str],
                        tokenizer: SubwordTokenizer,
                        top_k: int = 5,
                        device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    """Find similar products based on text similarity"""
    
    # Get query vector
    query_vector = get_word_vector(model, query, tokenizer, device)
    
    # Calculate similarities with all products
    similarities = []
    for product in product_texts:
        product_vector = get_word_vector(model, product, tokenizer, device)
        similarity = F.cosine_similarity(query_vector, product_vector)
        similarities.append((product, similarity.item()))
    
    # Sort by similarity and return top k
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]


In [42]:
sample_texts = [
    "Computer Networking: A Top-Down Approach",
    "Data Communications and Networking",
    "Networking All-in-One For Dummies",
    "Computer Networks",
    "The Linux Command Line",
    "Network Warrior",
    "Internetworking with TCP/IP",
    "TCP/IP Illustrated, Volume 1",
    "CompTIA Network+ Guide to Networks",
    "Mastering Bitcoin",
    "Hacking: The Art of Exploitation",
    "The Phoenix Project",
    "Artificial Intelligence: A Modern Approach",
    "Machine Learning Yearning",
    "Deep Learning",
    "Introduction to the Theory of Computation",
    "The Pragmatic Programmer",
    "Clean Code: A Handbook of Agile Software Craftsmanship",
    "Code Complete",
    "Structure and Interpretation of Computer Programs",
    "Designing Data-Intensive Applications",
    "Python Crash Course",
    "Learning Python",
    "Automate the Boring Stuff with Python",
    "Introduction to Algorithms",
    "Algorithms Unlocked",
    "Network Security Essentials",
    "Computer Security: Principles and Practice",
    "Applied Cryptography",
    "The C Programming Language",
    "Effective Java",
    "Java: The Complete Reference",
    "Programming Perl",
    "Head First Java",
    "Python Programming: An Introduction to Computer Science",
    "C++ Primer",
    "Effective C++",
    "Eloquent JavaScript",
    "Modern Operating Systems",
    "Operating System Concepts",
    "Computer Organization and Design",
    "Digital Design and Computer Architecture",
    "Microservices Patterns",
    "Docker Deep Dive",
    "Kubernetes Up & Running",
    "Building Microservices",
    "The Go Programming Language",
    "The Rust Programming Language",
    "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow",
    "Artificial Intelligence with Python",
    "Neural Networks and Deep Learning",
    "Blockchain Basics",
    "Cloud Computing: Concepts, Technology & Architecture",
    "Mastering Cloud Computing",
    "Cloud Native Patterns",
    "AWS Certified Solutions Architect Study Guide",
    "Kubernetes for Developers",
    "Introduction to Machine Learning with Python",
    "Hands-On Networking Fundamentals",
    "Networking Basics",
    "Wireless Communications: Principles and Practice",
    "Cisco CCNA 200-301 Official Cert Guide",
    "Networking Fundamentals",
    "Building Scalable Web Applications",
    "The Art of Scalability",
    "Pro Git",
    "Linux Kernel Development",
    "Shell Scripting: How to Automate Command Line Tasks",
    "Advanced Programming in the UNIX Environment",
    "Network Programmability and Automation",
    "Networking for Systems Administrators",
    "Computer Vision: Algorithms and Applications",
    "Digital Image Processing",
    "Natural Language Processing with Python",
    "Practical Packet Analysis",
    "Wireshark for Security Professionals",
    "Security Engineering",
    "Certified Ethical Hacker (CEH) Study Guide",
    "Linux Bible",
    "Learning Kali Linux",
    "Cisco Networking Essentials",
    "CompTIA Security+ Study Guide",
    "Linux Networking Cookbook",
    "Networking for Dummies",
    "Network Simulation Experiments Manual",
    "Cisco CCNP and CCIE Enterprise Core",
    "Mastering Python Networking",
    "Fundamentals of Database Systems",
    "SQL in 10 Minutes, Sams Teach Yourself",
    "Database System Concepts",
    "PostgreSQL: Up and Running",
    "MySQL High Availability",
    "The Definitive Guide to Django",
    "Fluent Python",
    "High Performance MySQL",
    "Network Management Fundamentals",
    "DNS and BIND",
    "Web Security for Developers",
    "Cybersecurity Essentials",
    "Cyber War: The Next Threat to National Security",
    "Cyber-Physical Systems",
    "Penetration Testing",
    "The Cyber Effect",
    "CISSP All-in-One Exam Guide",
    "AWS Certified Developer Official Study Guide",
    "CompTIA A+ Certification All-in-One Exam Guide",
    "Operating Systems: Three Easy Pieces",
    "The Algorithm Design Manual",
    "Computer Graphics: Principles and Practice",
    "Learning Computer Architecture with Raspberry Pi",
    "Embedded Systems: Introduction to ARM Cortex-M Microcontrollers",
    "Hands-On Embedded Programming with C++",
    "ARM System Developer's Guide",
    "Make: Electronics",
    "Programming Arduino: Getting Started with Sketches",
    "Raspberry Pi Cookbook",
    "Computer Networking Problems and Solutions",
    "Computer Science Distilled",
    "C Programming Absolute Beginner's Guide",
    "JavaScript: The Good Parts",
    "You Don't Know JS",
    "Learn C the Hard Way",
    "JavaScript and JQuery: Interactive Front-End Web Development",
    "Software Engineering at Google",
    "The Mythical Man-Month",
    "An Introduction to Statistical Learning",
    "Practical Data Science with R",
    "Machine Learning for Dummies",
    "Reinforcement Learning: An Introduction",
    "Deep Reinforcement Learning Hands-On",
    "Computer Vision with OpenCV",
    "Learning OpenCV 4 Computer Vision",
    "Hands-On GPU Programming with CUDA",
    "CUDA by Example",
    "Parallel Programming with OpenMP",
    "GPU Programming for Beginners",
    "Computer Systems: A Programmer's Perspective",
    "Systems Performance: Enterprise and the Cloud",
    "Computer Architecture: A Quantitative Approach",
    "Linux Administration Handbook",
    "Network Analysis with Wireshark",
    "Computer Networking: Principles, Protocols and Practice",
    "Networking for Game Developers",
    "Programming Massively Parallel Processors",
    "Learning Spark",
    "Spark: The Definitive Guide",
    "Kafka: The Definitive Guide",
    "Big Data: Principles and Best Practices",
    "Data Science from Scratch",
    "Artificial Intelligence by Example",
    "Software Architecture in Practice",
    "Mastering OpenCV with Practical Computer Vision Projects"
]

In [43]:
tokenizer = SubwordTokenizer()
tokenizer.build_vocab(sample_texts)
    

In [44]:
dataset = FastTextDataset(sample_texts, tokenizer)
dataloader = DataLoader(
        dataset,
        batch_size=32,
        shuffle=True,
        collate_fn=collate_fn  # Use custom collate function
    )
    

In [45]:
model = FastTextModel(
        vocab_size=len(tokenizer.subword2idx),
        embedding_dim=100
)

In [46]:
train_fasttext(model, dataloader, num_epochs=20)

Epoch [1/20], Loss: 6.2936
Epoch [2/20], Loss: 5.8324
Epoch [3/20], Loss: 4.8292
Epoch [4/20], Loss: 2.8538
Epoch [5/20], Loss: 1.2899
Epoch [6/20], Loss: 0.7737
Epoch [7/20], Loss: 0.6185
Epoch [8/20], Loss: 0.5645
Epoch [9/20], Loss: 0.5377
Epoch [10/20], Loss: 0.5219
Epoch [11/20], Loss: 0.5120
Epoch [12/20], Loss: 0.5033
Epoch [13/20], Loss: 0.4976
Epoch [14/20], Loss: 0.4926
Epoch [15/20], Loss: 0.4868
Epoch [16/20], Loss: 0.4827
Epoch [17/20], Loss: 0.4791
Epoch [18/20], Loss: 0.4753
Epoch [19/20], Loss: 0.4708
Epoch [20/20], Loss: 0.4666


In [74]:
domain_specific_texts = [
        "Networking for Dummies",
    "Network Simulation Experiments Manual",
    "Cisco CCNP and CCIE Enterprise Core",
    "Mastering Python Networking",
    "Fundamentals of Database Systems",
    "SQL in 10 Minutes, Sams Teach Yourself",
    "Database System Concepts",
    "PostgreSQL: Up and Running",
    "MySQL High Availability",
    "The Definitive Guide to Django",
    "Fluent Python",
    "High Performance MySQL",
    "Network Management Fundamentals",
    "DNS and BIND",
    ]

In [75]:
fine_tune_fasttext(model, domain_specific_texts, tokenizer)

Epoch [1/5], Loss: 1.0993
Epoch [2/5], Loss: 1.0991
Epoch [3/5], Loss: 1.0988
Epoch [4/5], Loss: 1.0985
Epoch [5/5], Loss: 1.0983


In [79]:
query = "DNS and BIDN"

In [80]:
similar_products = find_similar_products(model, query, sample_texts, tokenizer)

In [81]:
print("\nProducts similar to query:", query)
for product, similarity in similar_products:
    print(f"{product}: {similarity:.4f}")


Products similar to query: DNS and BIDN
DNS and BIND: 1.0000
Clean Code: A Handbook of Agile Software Craftsmanship: 0.5898
PostgreSQL: Up and Running: 0.5512
Cisco CCNP and CCIE Enterprise Core: 0.5367
Shell Scripting: How to Automate Command Line Tasks: 0.5272


## Model Save

In [69]:
def save_model(model: FastTextModel, 
                tokenizer: SubwordTokenizer,
                save_path: str):
    """Save both the model and tokenizer state"""
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'vocab_size': len(tokenizer.subword2idx),
        'embedding_dim': model.embeddings.embedding_dim,
        # Save tokenizer state
        'tokenizer_state': {
            'min_ngram': tokenizer.min_ngram,
            'max_ngram': tokenizer.max_ngram,
            'vocab': tokenizer.vocab,
            'subword2idx': tokenizer.subword2idx,
            'idx2subword': tokenizer.idx2subword,
            'word2subwords': dict(tokenizer.word2subwords)  # Convert defaultdict to dict
        }
    }
    torch.save(checkpoint, save_path)
    print(f"Model and tokenizer saved to {save_path}")

In [70]:
save_path = "fastext_ComputeBooks.pt"
save_model(model, tokenizer, save_path)

Model and tokenizer saved to fastext_ComputeBooks.pt
