# TF-IDF Training on ANTIQUE Dataset
## 3-Day Implementation - Day 1: TF-IDF Model Training

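This notebook fits a TF-IDF model on the ANTIQUE document collection and saves the resulting artifacts (vectorizer, TF-IDF matrix, inverted index, document mappings, document metadata, and training statistics) for use in the IR system.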

In [None]:
# Install required packages
!pip install ir-datasets nltk scikit-learn joblib pandas numpy tqdm textblob

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

In [None]:
import ir_datasets
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import html
import joblib
from tqdm import tqdm
import json
from collections import defaultdict
import pickle

In [None]:
# Enhanced Text Cleaning Service
import unicodedata
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

class EnhancedTextCleaner:
    """Clean raw text and optionally reduce it to normalised tokens.

    Two entry points are provided:

    * ``clean_text_basic`` - light cleaning (HTML, Unicode, URLs, e-mails,
      contractions, punctuation, case) that returns a cleaned string.
    * ``preprocess_for_tfidf`` - ``clean_text_basic`` followed by
      tokenisation, stop-word removal, optional spell correction,
      lemmatisation and stemming; returns a space-joined token string.
    """

    # Typographic punctuation that has no ASCII decomposition. Left alone,
    # normalize_unicode() deletes these characters outright, which turns
    # "don’t" into "dont" (so the contraction is never expanded) and glues
    # "well—known" into "wellknown". Mapping them to ASCII first avoids that.
    _PUNCTUATION_TABLE = str.maketrans({
        '\u2018': "'", '\u2019': "'", '\u02bc': "'",
        '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-',
    })

    def __init__(self, enable_spell_check=True):
        """Set up stemmer, lemmatizer, stop-word sets and the contraction map.

        Args:
            enable_spell_check: when True, ``preprocess_for_tfidf`` runs each
                token through TextBlob spell correction.
        """
        self.enable_spell_check = enable_spell_check
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Enhanced stopwords
        self.technical_stopwords = {
            'code', 'function', 'method', 'class', 'variable', 'return',
            'import', 'from', 'def', 'if', 'else', 'for', 'while', 'try'
        }

        # NOTE(review): these look chosen for an antiques-trading domain;
        # confirm they suit the ANTIQUE collection before relying on them.
        self.domain_stopwords = {
            'antique', 'vintage', 'old', 'item', 'piece', 'thing', 'stuff',
            'want', 'need', 'looking', 'find', 'search', 'help', 'please'
        }

        self.all_stopwords = self.stop_words.union(
            self.technical_stopwords
        ).union(self.domain_stopwords)

        # Spell check cache: token -> accepted correction (or the token itself)
        self.spell_check_cache = {}

        # Contractions. Whole-word forms come first so "won't"/"can't" are
        # handled before the generic "n't" suffix. Suffix fragments are the
        # entries whose expansion starts with a space.
        self.contractions = {
            "won't": "will not", "can't": "cannot", "n't": " not",
            "'re": " are", "'ve": " have", "'ll": " will",
            "'d": " would", "'m": " am", "it's": "it is"
        }

    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for ``word``.

        The word is tagged in isolation with ``pos_tag`` and the first letter
        of the tag is mapped to a WordNet constant. Unknown tags, and any
        tagging failure, fall back to ``wordnet.NOUN`` - the same best-effort
        behaviour as ``EnhancedTokenizer.get_wordnet_pos``.
        """
        try:
            tag = pos_tag([word])[0][1][0].upper()
        except Exception:
            return wordnet.NOUN
        tag_dict = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    def expand_contractions(self, text):
        """Expand the contractions listed in ``self.contractions``.

        Matching is case-insensitive and anchored so that only genuine
        contractions are rewritten:

        * suffix fragments (expansion starts with a space, e.g. ``'re``) must
          directly follow a word character and end at a word boundary, so a
          single-quoted word such as ``'real'`` is left untouched;
        * whole-word forms (e.g. ``won't``) must be delimited by word
          boundaries on both sides.
        """
        for contraction, expansion in self.contractions.items():
            escaped = re.escape(contraction)
            if expansion.startswith(' '):
                pattern = r'(?<=\w)' + escaped + r'\b'
            else:
                pattern = r'\b' + escaped + r'\b'
            text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
        return text

    def normalize_unicode(self, text):
        """Strip combining marks and drop any remaining non-ASCII characters."""
        text = unicodedata.normalize('NFD', text)
        # 'Mn' = non-spacing marks (accents separated out by NFD)
        text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')
        return text.encode('ascii', 'ignore').decode('ascii')

    def spell_check_word(self, word):
        """Return a spell-corrected form of ``word``, or ``word`` unchanged.

        Words shorter than four characters are never corrected. A TextBlob
        correction is accepted only when it differs from the input, the input
        is not an English stop word, the length changes by at most two and
        the two spellings share enough distinct letters. Results are cached.
        """
        if not self.enable_spell_check or len(word) < 4:
            return word

        if word in self.spell_check_cache:
            return self.spell_check_cache[word]

        try:
            blob = TextBlob(word)
            corrected = str(blob.correct())

            if (corrected != word and word not in self.stop_words and
                abs(len(corrected) - len(word)) <= 2 and
                len(set(corrected.lower()) & set(word.lower())) >= min(len(word), len(corrected)) * 0.6):
                self.spell_check_cache[word] = corrected
                return corrected
        except Exception:
            # Spell correction is best-effort: keep the original word on any
            # TextBlob failure, but let KeyboardInterrupt/SystemExit through.
            pass

        self.spell_check_cache[word] = word
        return word

    def clean_text_basic(self, text):
        """Return a lower-cased, lightly cleaned version of ``text``.

        Non-string or empty input yields ``""``. URLs and e-mail addresses
        are replaced by placeholder tokens (lower-cased with the rest of the
        text), and ``.``, ``!`` and ``?`` are the only punctuation kept.
        """
        if not text or not isinstance(text, str):
            return ""

        # HTML decoding and tag removal
        text = html.unescape(text)
        text = re.sub(r'<[^>]+>', ' ', text)

        # Map curly quotes / dashes to ASCII before the ASCII fold removes them
        text = text.translate(self._PUNCTUATION_TABLE)

        # Normalize Unicode
        text = self.normalize_unicode(text)

        # URLs and emails
        text = re.sub(r'https?://[^\s<>"]{2,}', ' URL ', text)
        text = re.sub(r'\S+@\S+', ' EMAIL ', text)

        # Expand contractions (must run while apostrophes are still present)
        text = self.expand_contractions(text)

        # Remove extra punctuation but preserve sentence boundaries
        text = re.sub(r'[^\w\s\.\!\?]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip().lower()

        return text

    def preprocess_for_tfidf(self, text):
        """Return ``text`` as a space-joined string of processed tokens.

        Tokens of two characters or fewer, non-alphabetic tokens and stop
        words are dropped. Each survivor is optionally spell-corrected, then
        lemmatised and stemmed; stems of two characters or fewer are dropped.
        """
        # Basic cleaning
        text = self.clean_text_basic(text)

        if not text.strip():
            return ""

        # Tokenize
        tokens = word_tokenize(text)

        # Process tokens
        processed_tokens = []
        for token in tokens:
            if len(token) <= 2 or not token.isalpha() or token in self.all_stopwords:
                continue

            # Spell checking
            if self.enable_spell_check:
                token = self.spell_check_word(token)

            # Lemmatization with POS
            pos = self.get_wordnet_pos(token)
            lemmatized = self.lemmatizer.lemmatize(token, pos)

            # Stemming
            stemmed = self.stemmer.stem(lemmatized)

            if len(stemmed) > 2:
                processed_tokens.append(stemmed)

        return ' '.join(processed_tokens)

    def batch_preprocess(self, texts):
        """Apply ``preprocess_for_tfidf`` to every text, with a progress bar."""
        return [self.preprocess_for_tfidf(text) for text in tqdm(texts, desc="Enhanced Preprocessing")]

# Enhanced Tokenizer for TF-IDF Vectorizer
class EnhancedTokenizer:
    """Callable tokenizer that turns a text into a list of processed tokens.

    Calling an instance tokenises the text, drops short/long, non-alphabetic
    and stop-word tokens, lower-cases the rest, and then optionally applies
    spell correction, lemmatisation and stemming (each controlled by a flag).

    NOTE(review): when an object holding this tokenizer is pickled, this class
    must be importable wherever it is loaded, and ``spell_check_cache`` is
    pickled with it - confirm that is acceptable for the serving code.
    """

    def __init__(self, enable_spell_check=True, enable_lemmatization=True, enable_stemming=True):
        """Store the feature flags and build stemmer, lemmatizer and stop words.

        Args:
            enable_spell_check: run TextBlob spell correction on each token.
            enable_lemmatization: lemmatise each token using its POS tag.
            enable_stemming: apply Porter stemming to each token.
        """
        self.enable_spell_check = enable_spell_check
        self.enable_lemmatization = enable_lemmatization
        self.enable_stemming = enable_stemming

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Enhanced stopwords
        self.technical_stopwords = {
            'code', 'function', 'method', 'class', 'variable', 'return',
            'import', 'from', 'def', 'if', 'else', 'for', 'while', 'try'
        }

        self.domain_stopwords = {
            'antique', 'vintage', 'old', 'item', 'piece', 'thing', 'stuff',
            'want', 'need', 'looking', 'find', 'search', 'help', 'please'
        }

        self.all_stopwords = self.stop_words.union(
            self.technical_stopwords
        ).union(self.domain_stopwords)

        # token -> accepted correction (or the token itself)
        self.spell_check_cache = {}

    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for ``word`` (``NOUN`` on failure)."""
        try:
            tag = pos_tag([word])[0][1][0].upper()
            tag_dict = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
            return tag_dict.get(tag, wordnet.NOUN)
        except Exception:
            # Best-effort: any tagging failure falls back to noun. Catching
            # Exception (not a bare except) keeps Ctrl-C working during fits.
            return wordnet.NOUN

    def spell_check_token(self, token):
        """Return a spell-corrected form of ``token``, or ``token`` unchanged.

        Tokens shorter than four characters are never corrected. A TextBlob
        correction is accepted only when it differs from the input, the input
        is not an English stop word, the length changes by at most two and
        the two spellings share enough distinct letters. Results are cached.
        """
        if not self.enable_spell_check or len(token) < 4:
            return token

        if token in self.spell_check_cache:
            return self.spell_check_cache[token]

        try:
            blob = TextBlob(token)
            corrected = str(blob.correct())

            if (corrected != token and token not in self.stop_words and
                abs(len(corrected) - len(token)) <= 2 and
                len(set(corrected.lower()) & set(token.lower())) >= min(len(token), len(corrected)) * 0.6):
                self.spell_check_cache[token] = corrected
                return corrected
        except Exception:
            # Spell correction is best-effort; keep the token on failure.
            pass

        self.spell_check_cache[token] = token
        return token

    def __call__(self, text):
        """Tokenise ``text`` and return the list of processed tokens.

        Non-string or empty input yields ``[]``. If ``word_tokenize`` fails
        the text is split on whitespace instead.
        """
        if not text or not isinstance(text, str):
            return []

        try:
            tokens = word_tokenize(text)
        except Exception:
            # Fall back to whitespace splitting when the NLTK tokenizer is
            # unavailable, without swallowing KeyboardInterrupt/SystemExit.
            tokens = text.split()

        processed_tokens = []
        for token in tokens:
            if (len(token) < 3 or len(token) > 50 or
                not token.isalpha() or
                token.lower() in self.all_stopwords):
                continue

            token = token.lower()

            # Spell checking
            if self.enable_spell_check:
                token = self.spell_check_token(token)

            # Lemmatization
            if self.enable_lemmatization:
                pos = self.get_wordnet_pos(token)
                token = self.lemmatizer.lemmatize(token, pos)

            # Stemming
            if self.enable_stemming:
                token = self.stemmer.stem(token)

            # Processing may shorten a token; re-apply the minimum length
            if len(token) >= 3:
                processed_tokens.append(token)

        return processed_tokens

In [None]:
# Load ANTIQUE dataset
# The bare "antique" id is the documents-only base dataset; queries and qrels
# are provided by its subsets. "antique/train" is loaded here because the
# cells below read training queries and qrels from the same handle.
print("Loading ANTIQUE dataset...")
dataset = ir_datasets.load("antique/train")

# Load documents: keep a doc_id -> text dict plus two parallel lists whose
# positions line up (doc_ids[i] is the id of doc_texts[i]).
print("Loading documents...")
docs = {}
doc_texts = []
doc_ids = []

for doc in tqdm(dataset.docs_iter(), desc="Loading documents"):
    docs[doc.doc_id] = doc.text
    doc_texts.append(doc.text)
    doc_ids.append(doc.doc_id)

print(f"Loaded {len(docs)} documents")

# Load training queries as query_id -> query text
print("Loading queries...")
train_queries = {}
for query in dataset.queries_iter():
    train_queries[query.query_id] = query.text

print(f"Loaded {len(train_queries)} queries")

# Load qrels for evaluation as query_id -> {doc_id: relevance}
print("Loading qrels...")
train_qrels = defaultdict(dict)
for qrel in dataset.qrels_iter():
    train_qrels[qrel.query_id][qrel.doc_id] = qrel.relevance

print(f"Loaded qrels for {len(train_qrels)} queries")

In [None]:
# Build the text cleaner and the tokenizer, both with every feature enabled
preprocessor = EnhancedTextCleaner(enable_spell_check=True)
enhanced_tokenizer = EnhancedTokenizer(
    enable_spell_check=True, enable_lemmatization=True, enable_stemming=True
)

# Run the basic cleaning pass over every document; a document that cleans
# down to nothing is recorded as an empty string so positions stay aligned
# with doc_ids / doc_texts.
print("Preprocessing documents with enhanced text processing...")
processed_doc_texts = []
for text in tqdm(doc_texts, desc="Enhanced preprocessing"):
    cleaned = preprocessor.clean_text_basic(text)
    processed_doc_texts.append(cleaned if cleaned.strip() else "")

# Keep only (doc_id, original_text, cleaned_text) triples with non-empty cleaned text
valid_docs = [
    triple
    for triple in zip(doc_ids, doc_texts, processed_doc_texts)
    if triple[2].strip()
]

print(f"Valid documents after preprocessing: {len(valid_docs)}")

# Split the surviving triples back into three parallel lists
valid_doc_ids = [kept_id for kept_id, _, _ in valid_docs]
valid_doc_texts = [kept_text for _, kept_text, _ in valid_docs]
valid_processed_texts = [kept_cleaned for _, _, kept_cleaned in valid_docs]

In [None]:
# Import Enhanced TF-IDF Service V2
import sys
sys.path.append('.')

# Enhanced TF-IDF Service V2 Implementation
class EnhancedTFIDFServiceV2:
    """Fit a TF-IDF model over a document collection and index the result.

    After ``train_enhanced_tfidf`` the instance holds the fitted vectorizer,
    the document-term matrix, id/row mappings, per-document metadata,
    an optional inverted index and a dictionary of training statistics.
    """

    def __init__(self, enable_spell_check=True, enable_lemmatization=True, enable_stemming=True):
        """Create the text cleaner and tokenizer; all trained state starts empty.

        Args:
            enable_spell_check: forwarded to the cleaner and the tokenizer.
            enable_lemmatization: forwarded to the tokenizer.
            enable_stemming: forwarded to the tokenizer.
        """
        self.text_cleaner = EnhancedTextCleaner(enable_spell_check=enable_spell_check)
        self.enhanced_tokenizer = EnhancedTokenizer(
            enable_spell_check=enable_spell_check,
            enable_lemmatization=enable_lemmatization,
            enable_stemming=enable_stemming
        )
        # Populated by train_enhanced_tfidf()
        self.vectorizer = None
        self.tfidf_matrix = None
        self.inverted_index = None
        self.doc_id_to_idx = None
        self.idx_to_doc_id = None
        self.document_metadata = None
        self.training_stats = None
        self.is_trained = False

    def create_optimized_vectorizer(self, **params):
        """Return a ``TfidfVectorizer`` built from the defaults below.

        Args:
            **params: keyword arguments that override or extend the defaults.
        """
        default_params = {
            'max_features': 100000,    # Large vocabulary for better coverage
            'min_df': 2,               # Remove very rare terms
            'max_df': 0.85,            # Remove very common terms
            'ngram_range': (1, 3),     # Include trigrams for better matching
            'sublinear_tf': True,      # Log scaling for TF
            'norm': 'l2',              # L2 normalization
            'smooth_idf': True,        # Smooth IDF weights
            'use_idf': True,           # Use IDF weighting
            'tokenizer': self.enhanced_tokenizer,
            'preprocessor': None,      # All preprocessing handled by tokenizer
            'lowercase': False,        # Handled by tokenizer
            'stop_words': None,        # Handled by tokenizer
            'token_pattern': None,     # Using custom tokenizer
        }
        default_params.update(params)
        return TfidfVectorizer(**default_params)

    def train_enhanced_tfidf(self, documents, doc_ids, vectorizer_params=None, build_inverted_index=True):
        """Clean the documents, fit the vectorizer and build all lookup data.

        Args:
            documents: raw document texts.
            doc_ids: identifiers, one per entry of ``documents``.
            vectorizer_params: optional overrides for
                ``create_optimized_vectorizer``.
            build_inverted_index: when True, also build the inverted index.

        Returns:
            The training statistics dictionary (also stored on the instance).

        Raises:
            ValueError: if ``documents`` and ``doc_ids`` differ in length, or
                if no document has any text left after cleaning.
        """
        # zip() would silently truncate and mis-pair ids with texts
        if len(documents) != len(doc_ids):
            raise ValueError(
                f"documents and doc_ids must have the same length "
                f"(got {len(documents)} and {len(doc_ids)})"
            )

        print(f"Training Enhanced TF-IDF V2 on {len(documents)} documents...")

        # Step 1: Clean text using enhanced cleaner
        print("Step 1: Enhanced text cleaning...")
        valid_docs = []
        for doc_id, doc_text in zip(doc_ids, documents):
            cleaned_text = self.text_cleaner.clean_text_basic(doc_text)
            if cleaned_text.strip():
                valid_docs.append((doc_id, doc_text, cleaned_text))

        print(f"Valid documents after cleaning: {len(valid_docs)}")
        if not valid_docs:
            raise ValueError("No documents contain text after cleaning; nothing to train on")

        # Step 2: Create enhanced vectorizer
        vectorizer_params = vectorizer_params or {}
        self.vectorizer = self.create_optimized_vectorizer(**vectorizer_params)

        # Step 3: Fit and transform with enhanced tokenization
        print("Step 2: Fitting TF-IDF with enhanced tokenization...")
        training_texts = [cleaned for _, _, cleaned in valid_docs]
        self.tfidf_matrix = self.vectorizer.fit_transform(training_texts)

        # Step 4: Create mappings. idx_to_doc_id is built straight from the
        # row order so every matrix row has an entry even if an id repeats
        # (inverting doc_id_to_idx would drop the earlier rows).
        valid_doc_ids = [doc_id for doc_id, _, _ in valid_docs]
        self.doc_id_to_idx = {doc_id: idx for idx, doc_id in enumerate(valid_doc_ids)}
        self.idx_to_doc_id = dict(enumerate(valid_doc_ids))

        # Step 5: Create metadata in a single pass over the valid documents
        self.document_metadata = {
            doc_id: {
                'original_text': original_text,
                'cleaned_text': cleaned_text,
                'index': self.doc_id_to_idx[doc_id]
            }
            for doc_id, original_text, cleaned_text in valid_docs
        }

        # Step 6: Build inverted index
        if build_inverted_index:
            print("Step 3: Building optimized inverted index...")
            self.inverted_index = self.build_optimized_inverted_index()

        # Step 7: Calculate statistics
        n_rows, n_cols = self.tfidf_matrix.shape
        self.training_stats = {
            'total_documents': len(documents),
            'valid_documents': len(valid_docs),
            'vocabulary_size': len(self.vectorizer.vocabulary_),
            'matrix_shape': self.tfidf_matrix.shape,
            'non_zero_entries': int(self.tfidf_matrix.nnz),
            'sparsity': float((1 - self.tfidf_matrix.nnz / (n_rows * n_cols)) * 100),
            'inverted_index_terms': len(self.inverted_index) if self.inverted_index else 0,
            'spell_check_enabled': self.enhanced_tokenizer.enable_spell_check,
            'lemmatization_enabled': self.enhanced_tokenizer.enable_lemmatization,
            'stemming_enabled': self.enhanced_tokenizer.enable_stemming
        }

        self.is_trained = True
        print("✓ Enhanced TF-IDF V2 training completed!")
        return self.training_stats

    def build_optimized_inverted_index(self):
        """Build ``term -> {postings, df, max_tfidf, avg_tfidf}`` from the matrix.

        ``postings`` is a list of ``(doc_id, tfidf_score)`` tuples sorted by
        score, highest first; ``df`` is the number of postings. Requires
        ``self.vectorizer``, ``self.tfidf_matrix`` and ``self.idx_to_doc_id``
        to be set.
        """
        feature_names = self.vectorizer.get_feature_names_out()
        coo_matrix = self.tfidf_matrix.tocoo()

        # Group the non-zero cells by term
        postings_by_term = defaultdict(list)
        for doc_idx, term_idx, tfidf_score in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
            postings_by_term[feature_names[term_idx]].append(
                (self.idx_to_doc_id[doc_idx], float(tfidf_score))
            )

        inverted_index = {}
        for term, postings in postings_by_term.items():
            scores = [score for _, score in postings]
            inverted_index[term] = {
                'postings': sorted(postings, key=lambda posting: posting[1], reverse=True),
                'df': len(scores),
                'max_tfidf': max(scores),
                'avg_tfidf': sum(scores) / len(scores),
            }

        return inverted_index

# Instantiate the service with spell check, lemmatization and stemming all on
enhanced_tfidf_service = EnhancedTFIDFServiceV2(
    enable_spell_check=True, enable_lemmatization=True, enable_stemming=True
)

# Fit on the original texts of the valid documents; the service performs its
# own cleaning pass and also builds its inverted index.
training_stats = enhanced_tfidf_service.train_enhanced_tfidf(
    documents=valid_doc_texts, doc_ids=valid_doc_ids, build_inverted_index=True
)

print("\n=== Enhanced Training Statistics ===")
for stat_name, stat_value in training_stats.items():
    print(f"{stat_name}: {stat_value}")

# Expose the trained components under the flat names used by later cells
tfidf_vectorizer, tfidf_matrix = (
    enhanced_tfidf_service.vectorizer,
    enhanced_tfidf_service.tfidf_matrix,
)
inverted_index = enhanced_tfidf_service.inverted_index
doc_id_to_idx, idx_to_doc_id = (
    enhanced_tfidf_service.doc_id_to_idx,
    enhanced_tfidf_service.idx_to_doc_id,
)
document_metadata = enhanced_tfidf_service.document_metadata

In [None]:
# Create inverted index for efficient retrieval
print("Building inverted index...")

def build_inverted_index(tfidf_matrix, vectorizer, doc_ids):
    """Return ``term -> [(doc_id, tfidf_score), ...]`` for a TF-IDF matrix.

    Each posting list is ordered by score, highest first. ``doc_ids`` is
    indexed by matrix row; ``vectorizer`` supplies the term for each column
    through ``get_feature_names_out()``.
    """
    vocabulary = vectorizer.get_feature_names_out()

    # COO form exposes the non-zero cells as parallel row/col/data arrays
    cells = tfidf_matrix.tocoo()
    cell_triples = zip(cells.row, cells.col, cells.data)

    postings_by_term = defaultdict(list)
    for row, col, weight in tqdm(cell_triples, total=cells.nnz, desc="Building inverted index"):
        postings_by_term[vocabulary[col]].append((doc_ids[row], float(weight)))

    # Order every posting list by descending TF-IDF score
    for postings in postings_by_term.values():
        postings.sort(key=lambda posting: posting[1], reverse=True)

    return dict(postings_by_term)

inverted_index = build_inverted_index(tfidf_matrix, tfidf_vectorizer, valid_doc_ids)
print(f"Inverted index built with {len(inverted_index)} terms")

In [None]:
# Map each document id to its position in valid_doc_ids, then invert that map
doc_id_to_idx = {}
for position, current_id in enumerate(valid_doc_ids):
    doc_id_to_idx[current_id] = position
idx_to_doc_id = {position: current_id for current_id, position in doc_id_to_idx.items()}

# Per-document metadata: original text, cleaned text and position
document_metadata = {}
for current_id, position in doc_id_to_idx.items():
    document_metadata[current_id] = {
        'original_text': valid_doc_texts[position],
        'processed_text': valid_processed_texts[position],
        'index': position
    }

print(f"Created metadata for {len(document_metadata)} documents")

In [None]:
# Persist every trained artifact to the working directory
print("Saving trained models and data...")

# Fitted vectorizer (joblib)
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer_antique.joblib')
print("✓ TF-IDF vectorizer saved")

# Sparse document-term matrix (joblib)
joblib.dump(tfidf_matrix, 'tfidf_matrix_antique.joblib')
print("✓ TF-IDF matrix saved")

# Inverted index (pickle)
with open('inverted_index_antique.pkl', 'wb') as index_file:
    pickle.dump(inverted_index, index_file)
print("✓ Inverted index saved")

# Id <-> position mappings (JSON; integer keys are written as strings)
mappings_payload = {
    'doc_id_to_idx': doc_id_to_idx,
    'idx_to_doc_id': idx_to_doc_id
}
with open('doc_mappings_antique.json', 'w') as mappings_file:
    json.dump(mappings_payload, mappings_file)
print("✓ Document mappings saved")

# Per-document metadata (JSON)
with open('document_metadata_antique.json', 'w') as metadata_file:
    json.dump(document_metadata, metadata_file)
print("✓ Document metadata saved")

# Summary statistics for the run
n_rows, n_cols = tfidf_matrix.shape[0], tfidf_matrix.shape[1]
stats = {
    'total_documents': len(doc_texts),
    'valid_documents': len(valid_docs),
    'vocabulary_size': len(tfidf_vectorizer.vocabulary_),
    'matrix_shape': tfidf_matrix.shape,
    'non_zero_entries': tfidf_matrix.nnz,
    # Percentage of matrix cells that are zero
    'sparsity': (1 - tfidf_matrix.nnz / (n_rows * n_cols)) * 100,
    'inverted_index_terms': len(inverted_index)
}

with open('training_statistics_antique.json', 'w') as stats_file:
    json.dump(stats, stats_file, indent=2)
print("✓ Training statistics saved")

print("\n=== Training Complete ===")
print(f"Documents processed: {stats['valid_documents']}")
print(f"Vocabulary size: {stats['vocabulary_size']}")
print(f"Matrix sparsity: {stats['sparsity']:.2f}%")
print(f"Inverted index terms: {stats['inverted_index_terms']}")

In [None]:
# Quick test of the trained model
print("\n=== Quick Test ===")

def test_tfidf_search(query, top_k=5):
    """Print the ``top_k`` documents most similar to ``query``.

    The query is cleaned with the notebook-level ``preprocessor``, vectorised
    with ``tfidf_vectorizer`` and compared to every row of ``tfidf_matrix``
    by cosine similarity. Nothing is returned; results are printed.

    Args:
        query: free-text query string.
        top_k: number of results to print; must be positive.
    """
    # Preprocess query with enhanced cleaning
    processed_query = preprocessor.clean_text_basic(query)
    print(f"Query: {query}")
    print(f"Processed: {processed_query}")

    if not processed_query.strip():
        print("Empty query after preprocessing")
        return

    # A slice of [-0:] selects the whole array, so top_k=0 would print every
    # document; negative values would behave just as unexpectedly.
    if top_k <= 0:
        print("top_k must be a positive integer")
        return

    # Transform query to TF-IDF
    query_vector = tfidf_vectorizer.transform([processed_query])

    # Calculate similarities against every document row
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_k positions and reverse them
    top_indices = similarities.argsort()[-top_k:][::-1]

    print(f"\nTop {top_k} results:")
    for rank, idx in enumerate(top_indices, start=1):
        doc_id = valid_doc_ids[idx]
        score = similarities[idx]
        full_text = valid_doc_texts[idx]
        # Only mark the preview as truncated when it actually was
        doc_text = full_text[:150] + ("..." if len(full_text) > 150 else "")
        print(f"{rank}. Doc {doc_id} (Score: {score:.4f})")
        print(f"   {doc_text}\n")

# Test with sample queries
test_queries = [
    "antique furniture restoration",
    "vintage jewelry appraisal",
    "old coins value"
]

for test_query in test_queries:
    test_tfidf_search(test_query, top_k=3)
    print("-" * 80)

In [None]:
# Download trained files
print("Downloading trained model files...")

# Artifacts written by the save cell, in download order
trained_files = [
    'tfidf_vectorizer_antique.joblib',
    'tfidf_matrix_antique.joblib',
    'inverted_index_antique.pkl',
    'doc_mappings_antique.json',
    'document_metadata_antique.json',
    'training_statistics_antique.json',
]

try:
    # Imported inside the try so the cell degrades gracefully when the
    # notebook runs outside Google Colab, where this module does not exist.
    from google.colab import files

    for trained_file in trained_files:
        files.download(trained_file)
    print("✓ All files downloaded successfully!")
except ImportError:
    print("google.colab is not available; files are saved in the working directory.")
except Exception as e:
    print(f"Download error: {e}")
    print("Files are saved in Colab session and can be downloaded manually.")