# Enhanced TF-IDF Training on ANTIQUE Dataset
## Advanced Text Processing with Spell Checking, Lemmatization, and Stemming

This notebook trains enhanced TF-IDF models with:
- Spell checking (conservative approach to preserve MAP, i.e. Mean Average Precision)
- Lemmatization with POS tagging
- Stemming for vocabulary reduction
- Optimized inverted index
- MAP-preserving preprocessing

The generated files will be used by the TF-IDF microservices.

In [None]:
# Install required packages
!pip install ir-datasets nltk scikit-learn joblib pandas numpy tqdm textblob

import nltk
# NLTK data used by the cells below: tokenizer models, stopword lists,
# WordNet (plus its multilingual index) and the POS tagger.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# recent NLTK releases look up for word_tokenize() and pos_tag(); the legacy
# names are kept so older releases keep working.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    nltk.download(resource)

In [None]:
import ir_datasets
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tag import pos_tag
import re
import html
import unicodedata
import joblib
from tqdm import tqdm
import json
from collections import defaultdict
import pickle
from textblob import TextBlob

In [None]:
# Enhanced Text Cleaning Service
class EnhancedTextCleaner:
    """Regex/Unicode based text cleaner with an optional TextBlob spell checker.

    ``clean_text_basic`` is the main entry point: it decodes HTML entities,
    strips tags, folds the text to ASCII, replaces URLs / e-mail addresses /
    prices / dates / 4-digit numbers with placeholder words, expands
    contractions, removes punctuation other than ``. ! ?`` and lower-cases
    the result.

    The stopword sets, stemmer and lemmatizer are created and stored on the
    instance, but ``clean_text_basic`` does not use them and does not call
    ``spell_check_word`` either.
    """

    # Typographic punctuation mapped to ASCII *before* the ASCII-only filter in
    # normalize_unicode(). Without this, "don’t" collapses to "dont" (so the
    # contraction is never expanded) and "a—b" / "a\u00a0b" are glued into "ab".
    _ASCII_PUNCTUATION = str.maketrans({
        '\u2018': "'", '\u2019': "'", '\u201a': "'", '\u201b': "'",
        '\u201c': '"', '\u201d': '"', '\u201e': '"',
        '\u2013': '-', '\u2014': '-', '\u2212': '-',
        '\u2026': '...',
        '\u00a0': ' ',
    })

    def __init__(self, enable_spell_check=True, language='english'):
        """Create a cleaner.

        Args:
            enable_spell_check: when False, ``spell_check_word`` returns its
                input unchanged.
            language: name passed to ``stopwords.words``.
        """
        self.enable_spell_check = enable_spell_check
        self.language = language
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words(language))

        # Enhanced stopwords for better IR performance
        self.technical_stopwords = {
            'code', 'function', 'method', 'class', 'variable', 'return',
            'import', 'from', 'def', 'if', 'else', 'for', 'while', 'try',
            'catch', 'finally', 'throw', 'throws', 'public', 'private',
            'protected', 'static', 'final', 'abstract', 'interface'
        }

        self.domain_stopwords = {
            'antique', 'vintage', 'old', 'item', 'piece', 'thing', 'stuff',
            'want', 'need', 'looking', 'find', 'search', 'help', 'please',
            'anyone', 'someone', 'know', 'tell', 'show', 'give', 'get',
            'would', 'could', 'should', 'might', 'maybe', 'perhaps'
        }

        self.all_stopwords = self.stop_words.union(
            self.technical_stopwords
        ).union(self.domain_stopwords)

        # Spell check cache for performance: word -> word returned for it
        # (unchanged words are cached too).
        self.spell_check_cache = {}

        # Contractions expansion. Order matters: the irregular whole-word
        # forms ("won't", "can't") must be handled before the generic "n't".
        self.contractions = {
            "won't": "will not", "can't": "cannot", "n't": " not",
            "'re": " are", "'ve": " have", "'ll": " will",
            "'d": " would", "'m": " am", "it's": "it is",
            "that's": "that is", "what's": "what is",
            "where's": "where is", "how's": "how is"
        }

        # Normalization patterns, applied in this order (DATE before YEAR so
        # that the year inside an ISO date is not replaced first).
        self.normalization_patterns = [
            (r'https?://[^\s<>"]{2,}', ' URL '),  # URLs
            (r'www\.[^\s<>"]{2,}', ' URL '),      # www URLs
            (r'\S+@\S+', ' EMAIL '),              # Email addresses
            (r'\$\d+(?:\.\d+)?', ' PRICE '),      # Prices
            (r'\d{4}-\d{2}-\d{2}', ' DATE '),     # Dates
            (r'\b\d{4}\b', ' YEAR '),             # Any standalone 4-digit number
        ]

    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for ``word`` tagged in isolation.

        Falls back to ``wordnet.NOUN`` for unmapped tags or if tagging fails.
        """
        try:
            tag = pos_tag([word])[0][1][0].upper()
            tag_dict = {
                'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV
            }
            return tag_dict.get(tag, wordnet.NOUN)
        except Exception:
            # Best effort: a tagging failure must not abort preprocessing, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return wordnet.NOUN

    def expand_contractions(self, text):
        """Expand the contractions listed in ``self.contractions``.

        Matching is case-insensitive; the replacement text is inserted as
        written in the mapping. Suffix forms (``n't``, ``'d``, ...) are only
        expanded when attached to the end of a word, so an opening quote such
        as ``'dog'`` is left alone; whole-word forms only match whole words.
        """
        for contraction, expansion in self.contractions.items():
            escaped = re.escape(contraction)
            is_suffix = contraction.startswith("'") or contraction == "n't"
            if is_suffix:
                pattern = r'(?<=\w)' + escaped + r'\b'
            else:
                pattern = r'\b' + escaped + r'\b'
            text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
        return text

    def normalize_unicode(self, text):
        """Fold ``text`` to ASCII.

        Typographic quotes, dashes, the ellipsis and the no-break space are
        mapped to ASCII equivalents, accents are stripped via NFD
        decomposition, and any remaining non-ASCII character is dropped.
        """
        text = text.translate(self._ASCII_PUNCTUATION)
        text = unicodedata.normalize('NFD', text)
        text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')
        return text.encode('ascii', 'ignore').decode('ascii')

    def apply_normalization_patterns(self, text):
        """Replace URLs, e-mails, prices, dates and 4-digit numbers with placeholders."""
        for pattern, replacement in self.normalization_patterns:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        return text

    def spell_check_word(self, word, conservative=True):
        """Return a spell-corrected form of ``word`` (or ``word`` itself).

        Words shorter than 4 characters are never corrected. With
        ``conservative=True`` a TextBlob correction is only accepted when its
        length differs by at most 2 and it shares at least 60% of the distinct
        characters (relative to the shorter word). NLTK stopwords are never
        corrected. Results are cached per word; the cache is not keyed by
        ``conservative``, so the first mode used for a word wins.
        """
        if not self.enable_spell_check or len(word) < 4:
            return word

        if word in self.spell_check_cache:
            return self.spell_check_cache[word]

        try:
            corrected = str(TextBlob(word).correct())
        except Exception:
            # Best effort: if the spell checker fails, keep the word as is.
            corrected = word

        accept = corrected != word and word not in self.stop_words
        if accept and conservative:
            shared_chars = len(set(corrected.lower()) & set(word.lower()))
            accept = (
                abs(len(corrected) - len(word)) <= 2 and
                shared_chars >= min(len(word), len(corrected)) * 0.6
            )

        result = corrected if accept else word
        self.spell_check_cache[word] = result
        return result

    def clean_text_basic(self, text):
        """Clean ``text`` and return it lower-cased; non-strings and empty input give ``""``."""
        if not text or not isinstance(text, str):
            return ""

        # HTML decoding and tag removal
        text = html.unescape(text)
        text = re.sub(r'<[^>]+>', ' ', text)

        # Normalize Unicode (must precede contraction expansion so that
        # typographic apostrophes are already ASCII)
        text = self.normalize_unicode(text)

        # Apply normalization patterns
        text = self.apply_normalization_patterns(text)

        # Expand contractions
        text = self.expand_contractions(text)

        # Remove extra punctuation but preserve sentence boundaries
        text = re.sub(r'[^\w\s\.!?]', ' ', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text.lower()

    def get_cleaning_statistics(self, original, cleaned):
        """Return size/reduction statistics for one original/cleaned text pair.

        ``spell_corrections`` is the number of cached words whose cached
        result differs from the word itself (the cache also stores words that
        were left unchanged, which are not corrections).
        """
        original_tokens = word_tokenize(original.lower()) if original else []
        cleaned_tokens = cleaned.split() if cleaned else []
        corrections = sum(
            1 for word, result in self.spell_check_cache.items() if result != word
        )

        return {
            'original_length': len(original) if original else 0,
            'cleaned_length': len(cleaned) if cleaned else 0,
            'original_tokens': len(original_tokens),
            'cleaned_tokens': len(cleaned_tokens),
            'token_reduction_ratio': 1 - (len(cleaned_tokens) / max(len(original_tokens), 1)),
            'char_reduction_ratio': 1 - (len(cleaned) / max(len(original), 1)) if original else 0,
            'spell_corrections': corrections
        }

print("✓ Enhanced Text Cleaner loaded")

In [None]:
# Enhanced Tokenizer for TF-IDF Vectorizer
class EnhancedTokenizer:
    """Callable tokenizer: filter, spell-check, lemmatize and stem each token.

    Instances are meant to be passed as ``tokenizer=`` to a
    ``TfidfVectorizer``. Each raw token is length/alphabetic/stopword
    filtered, lower-cased, optionally spell-corrected, lemmatized and stemmed.
    Stopwords are checked again after spell correction and after
    lemmatization, so a variant that normalizes to a stopword ("needs" ->
    "need") is dropped just like the stopword itself instead of being indexed.

    NOTE(review): ``domain_stopwords`` reads like an antiques-marketplace
    list; ANTIQUE is, as far as I know, a general question-answering
    collection -- confirm that words such as 'old', 'know' or 'help' should
    really be removed.
    NOTE(review): a vectorizer holding this tokenizer is pickled together
    with it (including the spell-check cache); whatever loads the artifact
    presumably needs this class importable under the same module path --
    verify against the consuming service.
    """

    def __init__(self, enable_spell_check=True, enable_lemmatization=True,
                 enable_stemming=True, language='english', min_token_length=3, max_token_length=50):
        """Create a tokenizer.

        Args:
            enable_spell_check: apply ``spell_check_token`` to each token.
            enable_lemmatization: apply WordNet lemmatization.
            enable_stemming: apply Porter stemming (after lemmatization).
            language: name passed to ``stopwords.words``.
            min_token_length: tokens shorter than this are dropped (checked on
                the raw token and again on the final token).
            max_token_length: raw tokens longer than this are dropped.
        """
        self.enable_spell_check = enable_spell_check
        self.enable_lemmatization = enable_lemmatization
        self.enable_stemming = enable_stemming
        self.language = language
        self.min_token_length = min_token_length
        self.max_token_length = max_token_length

        # Initialize NLTK components
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words(language))

        # Enhanced stopwords
        self.technical_stopwords = {
            'code', 'function', 'method', 'class', 'variable', 'return',
            'import', 'from', 'def', 'if', 'else', 'for', 'while', 'try',
            'catch', 'finally', 'throw', 'throws', 'public', 'private',
            'protected', 'static', 'final', 'abstract', 'interface'
        }

        self.domain_stopwords = {
            'antique', 'vintage', 'old', 'item', 'piece', 'thing', 'stuff',
            'want', 'need', 'looking', 'find', 'search', 'help', 'please',
            'anyone', 'someone', 'know', 'tell', 'show', 'give', 'get',
            'would', 'could', 'should', 'might', 'maybe', 'perhaps'
        }

        self.all_stopwords = self.stop_words.union(
            self.technical_stopwords
        ).union(self.domain_stopwords)

        # Spell check cache: token -> token returned for it (unchanged tokens
        # are cached too).
        self.spell_check_cache = {}

    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for ``word`` tagged in isolation.

        Falls back to ``wordnet.NOUN`` for unmapped tags or if tagging fails.
        """
        try:
            tag = pos_tag([word])[0][1][0].upper()
            tag_dict = {
                'J': wordnet.ADJ,
                'N': wordnet.NOUN,
                'V': wordnet.VERB,
                'R': wordnet.ADV
            }
            return tag_dict.get(tag, wordnet.NOUN)
        except Exception:
            # Best effort, but KeyboardInterrupt/SystemExit are not swallowed.
            return wordnet.NOUN

    def spell_check_token(self, token):
        """Return a conservatively spell-corrected ``token`` (or ``token`` itself).

        Tokens shorter than 4 characters and NLTK stopwords are never
        corrected. A TextBlob correction is accepted only when its length
        differs by at most 2 and it shares at least 60% of the distinct
        characters (relative to the shorter word). Results are cached.
        """
        if not self.enable_spell_check or len(token) < 4:
            return token

        if token in self.spell_check_cache:
            return self.spell_check_cache[token]

        try:
            corrected = str(TextBlob(token).correct())
        except Exception:
            # Best effort: if the spell checker fails, keep the token as is.
            corrected = token

        accept = corrected != token and token not in self.stop_words
        if accept:
            shared_chars = len(set(corrected.lower()) & set(token.lower()))
            accept = (
                abs(len(corrected) - len(token)) <= 2 and
                shared_chars >= min(len(token), len(corrected)) * 0.6
            )

        result = corrected if accept else token
        self.spell_check_cache[token] = result
        return result

    def process_token(self, token):
        """Run one raw token through the pipeline; return the term or ``None`` if dropped."""
        # Basic filtering on the raw token
        if (len(token) < self.min_token_length or
                len(token) > self.max_token_length or
                not token.isalpha()):
            return None

        # Convert to lowercase
        token = token.lower()
        if token in self.all_stopwords:
            return None

        # Apply spell checking; a misspelled stopword is a stopword once fixed
        if self.enable_spell_check:
            token = self.spell_check_token(token)
            if token in self.all_stopwords:
                return None

        # Apply lemmatization; an inflected stopword is a stopword once reduced
        if self.enable_lemmatization:
            pos = self.get_wordnet_pos(token)
            token = self.lemmatizer.lemmatize(token, pos)
            if token in self.all_stopwords:
                return None

        # Apply stemming
        if self.enable_stemming:
            token = self.stemmer.stem(token)

        # Final length check (stemming/lemmatization can shorten the token)
        if len(token) < self.min_token_length:
            return None

        return token

    def tokenize(self, text):
        """Split ``text`` into processed terms; non-strings and empty input give ``[]``."""
        if not text or not isinstance(text, str):
            return []

        try:
            raw_tokens = word_tokenize(text)
        except Exception:
            # Fall back to whitespace splitting if the NLTK tokenizer fails
            # (for example when its data is not installed).
            raw_tokens = text.split()

        # Process each token, keeping only the ones that survive the pipeline
        processed_tokens = []
        for token in raw_tokens:
            processed_token = self.process_token(token)
            if processed_token:
                processed_tokens.append(processed_token)

        return processed_tokens

    def __call__(self, text):
        """Make tokenizer callable for sklearn TfidfVectorizer."""
        return self.tokenize(text)

    def get_tokenizer_info(self):
        """Return the configuration plus the current stopword and cache sizes."""
        return {
            'spell_check_enabled': self.enable_spell_check,
            'lemmatization_enabled': self.enable_lemmatization,
            'stemming_enabled': self.enable_stemming,
            'language': self.language,
            'min_token_length': self.min_token_length,
            'max_token_length': self.max_token_length,
            'total_stopwords': len(self.all_stopwords),
            'spell_check_cache_size': len(self.spell_check_cache)
        }

print("✓ Enhanced Tokenizer loaded")

In [None]:
# Load ANTIQUE dataset
print("Loading ANTIQUE dataset...")
# The bare "antique" id only exposes the document collection; queries and
# qrels are provided by the split-specific ids. The training split is loaded
# here (it serves the same documents), matching the train_* names below.
dataset = ir_datasets.load("antique/train")

# Load documents: doc_texts and doc_ids are parallel lists in corpus order,
# docs maps doc_id -> raw text.
print("Loading documents...")
docs = {}
doc_texts = []
doc_ids = []

for doc in tqdm(dataset.docs_iter(), desc="Loading documents"):
    docs[doc.doc_id] = doc.text
    doc_texts.append(doc.text)
    doc_ids.append(doc.doc_id)

print(f"Loaded {len(docs)} documents")

# Load training queries: query_id -> query text
print("Loading queries...")
train_queries = {}
for query in dataset.queries_iter():
    train_queries[query.query_id] = query.text

print(f"Loaded {len(train_queries)} queries")

# Load qrels for evaluation: query_id -> {doc_id: relevance}
print("Loading qrels...")
train_qrels = defaultdict(dict)
for qrel in dataset.qrels_iter():
    train_qrels[qrel.query_id][qrel.doc_id] = qrel.relevance

print(f"Loaded qrels for {len(train_qrels)} queries")

In [None]:
# Initialize enhanced text cleaner
text_cleaner = EnhancedTextCleaner(enable_spell_check=True)

# Preprocess all documents with enhanced cleaning
print("Preprocessing documents with enhanced text cleaning...")
processed_doc_texts = []
cleaning_stats = []

for text in tqdm(doc_texts, desc="Enhanced preprocessing"):
    cleaned = text_cleaner.clean_text_basic(text)
    processed_doc_texts.append(cleaned)

    # Collect cleaning statistics for analysis (one dict per document,
    # including documents that end up empty)
    cleaning_stats.append(text_cleaner.get_cleaning_statistics(text, cleaned))

# Filter out documents whose cleaned text is empty
valid_docs = [(doc_id, doc_text, processed_text)
              for doc_id, doc_text, processed_text
              in zip(doc_ids, doc_texts, processed_doc_texts)
              if processed_text.strip()]

print(f"Valid documents after preprocessing: {len(valid_docs)}")

# Separate the valid data into parallel lists
valid_doc_ids = [item[0] for item in valid_docs]
valid_doc_texts = [item[1] for item in valid_docs]
valid_processed_texts = [item[2] for item in valid_docs]

# Calculate corpus-level preprocessing statistics
total_original_chars = sum(stats['original_length'] for stats in cleaning_stats)
total_cleaned_chars = sum(stats['cleaned_length'] for stats in cleaning_stats)
total_original_tokens = sum(stats['original_tokens'] for stats in cleaning_stats)
total_cleaned_tokens = sum(stats['cleaned_tokens'] for stats in cleaning_stats)

# Report a reduction of 0 instead of dividing by zero when the corpus is
# empty (or every document is empty), as the per-document statistics do.
total_char_reduction = (
    1 - (total_cleaned_chars / total_original_chars) if total_original_chars else 0.0
)
total_token_reduction = (
    1 - (total_cleaned_tokens / total_original_tokens) if total_original_tokens else 0.0
)

preprocessing_summary = {
    'total_documents': len(doc_texts),
    'valid_documents': len(valid_docs),
    'documents_filtered': len(doc_texts) - len(valid_docs),
    'total_char_reduction': total_char_reduction,
    'total_token_reduction': total_token_reduction,
    # The cleaner's cache dict itself (word -> result), not a count
    'spell_corrections': text_cleaner.spell_check_cache
}

print("\n=== Preprocessing Summary ===")
for key, value in preprocessing_summary.items():
    if key != 'spell_corrections':
        print(f"{key}: {value}")
print(f"spell_corrections_count: {len(preprocessing_summary['spell_corrections'])}")

In [None]:
# Tokenizer handed to the vectorizer: it does the filtering, lower-casing,
# spell checking, lemmatization and stemming of every token.
enhanced_tokenizer = EnhancedTokenizer(
    language='english',
    min_token_length=3,
    max_token_length=50,
    enable_spell_check=True,
    enable_lemmatization=True,
    enable_stemming=True,
)

# Vectorizer settings (tuned with a MAP > 0.4 target in mind)
print("Training Enhanced TF-IDF vectorizer...")

tfidf_settings = dict(
    max_features=100000,           # cap on vocabulary size
    min_df=2,                      # ignore terms seen in fewer than 2 documents
    max_df=0.85,                   # ignore terms seen in more than 85% of documents
    ngram_range=(1, 3),            # unigrams, bigrams and trigrams
    sublinear_tf=True,             # 1 + log(tf) term-frequency scaling
    norm='l2',                     # unit-length document vectors
    smooth_idf=True,               # smoothed IDF
    use_idf=True,                  # weight by IDF
    tokenizer=enhanced_tokenizer,  # custom tokenizer does all token processing
    preprocessor=None,             # no separate preprocessing step
    lowercase=False,               # the tokenizer lower-cases
    stop_words=None,               # the tokenizer removes stopwords
    token_pattern=None,            # unused with a custom tokenizer
)
enhanced_tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary/IDF weights and build the document-term matrix
print("Fitting Enhanced TF-IDF on documents...")
enhanced_tfidf_matrix = enhanced_tfidf_vectorizer.fit_transform(valid_processed_texts)

matrix_rows, matrix_cols = enhanced_tfidf_matrix.shape
matrix_nnz = enhanced_tfidf_matrix.nnz

print(f"Enhanced TF-IDF matrix shape: {enhanced_tfidf_matrix.shape}")
print(f"Vocabulary size: {len(enhanced_tfidf_vectorizer.vocabulary_)}")
print(f"Non-zero entries: {matrix_nnz}")
print(f"Sparsity: {(1 - matrix_nnz / (matrix_rows * matrix_cols)) * 100:.2f}%")
print(f"Average document length: {matrix_nnz / matrix_rows:.2f} features")

# Report the tokenizer configuration and cache size after fitting
tokenizer_info = enhanced_tokenizer.get_tokenizer_info()
print("\n=== Enhanced Tokenizer Info ===")
for key, value in tokenizer_info.items():
    print(f"{key}: {value}")

In [None]:
# Build optimized inverted index for enhanced search
print("Building optimized inverted index...")

def build_optimized_inverted_index(tfidf_matrix, vectorizer, doc_ids):
    """Turn a TF-IDF matrix into a term -> postings dictionary.

    Args:
        tfidf_matrix: sparse document-term matrix (must support ``tocoo()``).
        vectorizer: fitted vectorizer providing ``get_feature_names_out()``.
        doc_ids: document ids, indexed by matrix row.

    Returns:
        A plain dict mapping each term with at least one stored entry to
        ``{'postings': [(doc_id, score), ...] sorted by score descending,
        'df': number of postings, 'max_tfidf': ..., 'avg_tfidf': ...}``.
    """
    feature_names = vectorizer.get_feature_names_out()
    coo_matrix = tfidf_matrix.tocoo()

    # First pass: group (doc_id, score) pairs by term, in matrix order
    postings_by_term = defaultdict(list)
    entries = zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
    for doc_idx, term_idx, tfidf_score in tqdm(
        entries,
        total=coo_matrix.nnz,
        desc="Building optimized inverted index"
    ):
        posting = (doc_ids[doc_idx], float(tfidf_score))
        postings_by_term[feature_names[term_idx]].append(posting)

    # Second pass: per-term statistics, then postings ordered best-first
    inverted_index = {}
    for term, postings in postings_by_term.items():
        scores = [score for _, score in postings]
        inverted_index[term] = {
            'postings': sorted(postings, key=lambda posting: posting[1], reverse=True),
            'df': len(scores),
            'max_tfidf': max(scores),
            'avg_tfidf': sum(scores) / len(scores),
        }

    return inverted_index

enhanced_inverted_index = build_optimized_inverted_index(
    enhanced_tfidf_matrix,
    enhanced_tfidf_vectorizer,
    valid_doc_ids,
)

print(f"Optimized inverted index built with {len(enhanced_inverted_index)} terms")

# Posting-list length per term, computed once and reused for all statistics
postings_lengths = [len(entry['postings']) for entry in enhanced_inverted_index.values()]
avg_postings_per_term = np.mean(postings_lengths)
max_postings = max(postings_lengths)
min_postings = min(postings_lengths)

print(f"Average postings per term: {avg_postings_per_term:.2f}")
print(f"Max postings per term: {max_postings}")
print(f"Min postings per term: {min_postings}")

In [None]:
# Create document ID <-> matrix row index mappings
enhanced_doc_id_to_idx = {doc_id: idx for idx, doc_id in enumerate(valid_doc_ids)}
enhanced_idx_to_doc_id = {idx: doc_id for doc_id, idx in enhanced_doc_id_to_idx.items()}

# Create enhanced document metadata (keeps both raw and cleaned text per document)
enhanced_document_metadata = {
    doc_id: {
        'original_text': valid_doc_texts[idx],
        'processed_text': valid_processed_texts[idx],
        'index': idx,
        'original_length': len(valid_doc_texts[idx]),
        'processed_length': len(valid_processed_texts[idx]),
        'text_reduction_ratio': 1 - (len(valid_processed_texts[idx]) / max(len(valid_doc_texts[idx]), 1))
    }
    for doc_id, idx in enhanced_doc_id_to_idx.items()
}

print(f"Created enhanced metadata for {len(enhanced_document_metadata)} documents")

# Calculate comprehensive training statistics
feature_names = enhanced_tfidf_vectorizer.get_feature_names_out()
idf_scores = enhanced_tfidf_vectorizer.idf_

stats_rows, stats_cols = enhanced_tfidf_matrix.shape
stats_nnz = int(enhanced_tfidf_matrix.nnz)
documents_filtered = len(doc_texts) - len(valid_docs)

enhanced_training_stats = {
    # Document statistics
    'total_documents': len(doc_texts),
    'valid_documents': len(valid_docs),
    'documents_filtered': documents_filtered,
    'filter_rate_percent': (documents_filtered / max(len(doc_texts), 1)) * 100,

    # Text processing statistics
    'total_char_reduction_percent': preprocessing_summary['total_char_reduction'] * 100,
    'total_token_reduction_percent': preprocessing_summary['total_token_reduction'] * 100,
    # Size of the cleaner's spell-check cache
    'spell_corrections_count': len(preprocessing_summary['spell_corrections']),

    # TF-IDF matrix statistics
    'matrix_shape': enhanced_tfidf_matrix.shape,
    'vocabulary_size': len(feature_names),
    'non_zero_entries': stats_nnz,
    'sparsity_percent': float((1 - stats_nnz / (stats_rows * stats_cols)) * 100),
    'avg_doc_length': float(stats_nnz / stats_rows),

    # Vocabulary statistics
    'min_idf': float(np.min(idf_scores)),
    'max_idf': float(np.max(idf_scores)),
    'avg_idf': float(np.mean(idf_scores)),
    'median_idf': float(np.median(idf_scores)),

    # Inverted index statistics
    'inverted_index_terms': len(enhanced_inverted_index),
    'avg_postings_per_term': float(avg_postings_per_term),
    'max_postings_per_term': int(max_postings),
    'min_postings_per_term': int(min_postings),

    # Configuration
    # NOTE: get_params() includes the tokenizer object and a dtype class,
    # neither of which is JSON-serializable as is.
    'vectorizer_params': enhanced_tfidf_vectorizer.get_params(),
    'tokenizer_info': tokenizer_info,

    # Processing features enabled: read from the objects actually used for
    # training so the recorded flags cannot drift from the real configuration
    'spell_check_enabled': tokenizer_info['spell_check_enabled'],
    'lemmatization_enabled': tokenizer_info['lemmatization_enabled'],
    'stemming_enabled': tokenizer_info['stemming_enabled'],
    'ngram_range': enhanced_tfidf_vectorizer.ngram_range,
    'optimization_target': 'MAP_above_0.4'
}

print("\n=== Enhanced Training Statistics ===")
for key, value in enhanced_training_stats.items():
    if key not in ['vectorizer_params', 'tokenizer_info']:
        print(f"{key}: {value}")

In [None]:
# Save all enhanced trained models and data
print("Saving enhanced trained models and data...")

# joblib artifacts: the fitted vectorizer (which carries its tokenizer) and
# the sparse TF-IDF matrix
joblib_artifacts = (
    (enhanced_tfidf_vectorizer, 'enhanced_tfidf_vectorizer_antique.joblib',
     "✓ Enhanced TF-IDF vectorizer saved"),
    (enhanced_tfidf_matrix, 'enhanced_tfidf_matrix_antique.joblib',
     "✓ Enhanced TF-IDF matrix saved"),
)
for artifact, artifact_path, done_message in joblib_artifacts:
    joblib.dump(artifact, artifact_path)
    print(done_message)

# Inverted index as a pickle (only load pickles from trusted sources)
with open('enhanced_inverted_index_antique.pkl', 'wb') as index_file:
    pickle.dump(enhanced_inverted_index, index_file)
print("✓ Optimized inverted index saved")

# Document id <-> row index mappings. JSON object keys are always strings,
# so the integer keys of 'idx_to_doc_id' come back as strings when loaded.
doc_mappings = {
    'doc_id_to_idx': enhanced_doc_id_to_idx,
    'idx_to_doc_id': enhanced_idx_to_doc_id
}
with open('enhanced_doc_mappings_antique.json', 'w') as mappings_file:
    json.dump(doc_mappings, mappings_file)
print("✓ Enhanced document mappings saved")

# Per-document metadata (raw text, cleaned text and length statistics)
with open('enhanced_document_metadata_antique.json', 'w') as metadata_file:
    json.dump(enhanced_document_metadata, metadata_file)
print("✓ Enhanced document metadata saved")

# Save enhanced training statistics
def _to_jsonable(value):
    """Recursively convert ``value`` into something ``json.dump`` accepts.

    Dicts, lists, tuples and sets are converted element by element, numpy
    scalars become Python scalars, and anything else that JSON cannot
    represent (the tokenizer object and dtype class inside
    ``vectorizer_params``) is recorded as a descriptive string.
    """
    if isinstance(value, dict):
        return {
            (key if isinstance(key, str) else str(key)): _to_jsonable(item)
            for key, item in value.items()
        }
    if isinstance(value, (list, tuple, set)):
        return [_to_jsonable(item) for item in value]
    if isinstance(value, np.generic):
        return value.item()
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, type):
        return f"{value.__module__}.{value.__qualname__}"
    return repr(value)

# Convert before opening the file so a conversion error cannot leave a
# truncated statistics file behind.
serializable_stats = _to_jsonable(enhanced_training_stats)

with open('enhanced_training_statistics_antique.json', 'w') as f:
    json.dump(serializable_stats, f, indent=2)
print("✓ Enhanced training statistics saved")

# Save text cleaning configuration together with both spell-check caches.
# 'preprocessing_summary' also holds the cleaner's cache under
# 'spell_corrections', so that cache is written twice.
text_cleaner_config = {
    'enable_spell_check': text_cleaner.enable_spell_check,
    'language': text_cleaner.language,
    'total_stopwords': len(text_cleaner.all_stopwords),
    'technical_stopwords_count': len(text_cleaner.technical_stopwords),
    'domain_stopwords_count': len(text_cleaner.domain_stopwords)
}
cleaning_config = {
    'spell_check_cache': dict(text_cleaner.spell_check_cache),
    'tokenizer_spell_check_cache': dict(enhanced_tokenizer.spell_check_cache),
    'preprocessing_summary': preprocessing_summary,
    'text_cleaner_config': text_cleaner_config
}

with open('enhanced_text_cleaning_config_antique.json', 'w') as config_file:
    json.dump(cleaning_config, config_file, indent=2)
print("✓ Enhanced text cleaning configuration saved")

# Final summary of the training run
print("\n=== Enhanced Training Complete ===")
print(f"Documents processed: {enhanced_training_stats['valid_documents']}")
print(f"Vocabulary size: {enhanced_training_stats['vocabulary_size']}")
print(f"Matrix sparsity: {enhanced_training_stats['sparsity_percent']:.2f}%")
print(f"Inverted index terms: {enhanced_training_stats['inverted_index_terms']}")
print(f"Spell corrections applied: {enhanced_training_stats['spell_corrections_count']}")
print(f"Text reduction: {enhanced_training_stats['total_char_reduction_percent']:.2f}%")

for summary_line in (
    "\nOptimizations enabled:",
    "✓ Spell checking (conservative)",
    "✓ Lemmatization with POS tagging",
    "✓ Stemming for vocabulary reduction",
    "✓ Enhanced stopword filtering",
    "✓ Optimized inverted index with term statistics",
    "✓ MAP-preserving preprocessing",
):
    print(summary_line)

In [None]:
# Test enhanced search functionality
print("\n=== Testing Enhanced Search ===")

def test_enhanced_search(query, top_k=5):
    """Print search results for ``query`` from the matrix and from the inverted index.

    The query is cleaned with ``text_cleaner`` and searched two ways:
    cosine similarity against the full TF-IDF matrix, and a lookup of each
    query term in the inverted index whose top-10 postings are accumulated
    per document and ranked.

    Args:
        query: raw query text.
        top_k: number of results to print for each search.

    Returns:
        None; all output is printed.
    """
    # Clean query using the same process as the documents
    processed_query = text_cleaner.clean_text_basic(query)
    print(f"Query: {query}")
    print(f"Processed: {processed_query}")

    if not processed_query.strip():
        print("Empty query after preprocessing")
        return

    # Transform query to TF-IDF
    query_vector = enhanced_tfidf_vectorizer.transform([processed_query])

    # Test 1: Full matrix search
    similarities = cosine_similarity(query_vector, enhanced_tfidf_matrix).flatten()
    top_indices = similarities.argsort()[-top_k:][::-1]

    print(f"\nTop {top_k} results (Full Matrix Search):")
    for rank, idx in enumerate(top_indices, start=1):
        doc_id = valid_doc_ids[idx]
        score = similarities[idx]
        doc_text = valid_doc_texts[idx][:150] + "..."
        print(f"{rank}. Doc {doc_id} (Score: {score:.4f})")
        print(f"   {doc_text}\n")

    # Test 2: Inverted index search simulation
    query_terms = enhanced_tokenizer.tokenize(processed_query)
    print(f"Query terms after tokenization: {query_terms}")

    # Accumulate, per document, the TF-IDF scores of the matching query terms
    # (only the 10 best postings of each term are considered)
    term_doc_scores = defaultdict(float)

    for term in query_terms:
        term_data = enhanced_inverted_index.get(term)
        if term_data is None:
            continue
        print(f"Term '{term}': {term_data['df']} documents, max_tfidf: {term_data['max_tfidf']:.4f}")

        for doc_id, tfidf_score in term_data['postings'][:10]:  # Top 10 for this term
            term_doc_scores[doc_id] += tfidf_score

    print(f"Found {len(term_doc_scores)} candidate documents from inverted index")

    # Report the best candidates; previously the accumulated scores were
    # computed and then discarded.
    ranked_candidates = sorted(
        term_doc_scores.items(), key=lambda item: item[1], reverse=True
    )[:top_k]
    for rank, (doc_id, score) in enumerate(ranked_candidates, start=1):
        print(f"{rank}. Doc {doc_id} (Accumulated TF-IDF: {score:.4f})")

# Sample queries for a quick smoke test of both search paths.
# Note that 'antique', 'vintage' and 'old' are in the tokenizer's domain
# stopwords, so those words are dropped from the query terms.
test_queries = [
    "antique furniture restoration techniques",
    "vintage jewelry appraisal value",
    "old coins historical significance",
]

separator_line = "-" * 80
for test_query in test_queries:
    test_enhanced_search(test_query, top_k=3)
    print(separator_line)

In [None]:
# Download all enhanced trained files
import os

print("Downloading enhanced trained model files...")

try:
    # Imported inside the try block so that running outside Google Colab
    # (where this module does not exist) reaches the fallback listing below
    # instead of aborting the cell.
    from google.colab import files

    files.download('enhanced_tfidf_vectorizer_antique.joblib')
    files.download('enhanced_tfidf_matrix_antique.joblib')
    files.download('enhanced_inverted_index_antique.pkl')
    files.download('enhanced_doc_mappings_antique.json')
    files.download('enhanced_document_metadata_antique.json')
    files.download('enhanced_training_statistics_antique.json')
    files.download('enhanced_text_cleaning_config_antique.json')
    print("✓ All enhanced files downloaded successfully!")

    print("\n=== Files Generated ===")
    print("1. enhanced_tfidf_vectorizer_antique.joblib - Enhanced TF-IDF vectorizer")
    print("2. enhanced_tfidf_matrix_antique.joblib - TF-IDF matrix with enhanced features")
    print("3. enhanced_inverted_index_antique.pkl - Optimized inverted index with statistics")
    print("4. enhanced_doc_mappings_antique.json - Document ID mappings")
    print("5. enhanced_document_metadata_antique.json - Document metadata with processing stats")
    print("6. enhanced_training_statistics_antique.json - Comprehensive training statistics")
    print("7. enhanced_text_cleaning_config_antique.json - Text cleaning configuration and caches")

    print("\n=== Usage Instructions ===")
    print("1. Upload these files to your backend/models/ directory")
    print("2. Use the Enhanced TF-IDF Service to load the pre-trained models")
    print("3. The service will automatically use the optimized inverted index")
    print("4. Text cleaning configuration includes spell-check caches for consistency")

except Exception as e:
    # Best effort: report the problem and list what is available locally
    print(f"Download error: {e}")
    print("Files are saved in Colab session and can be downloaded manually.")
    print("\nAvailable files:")
    for filename in os.listdir('.'):
        if 'enhanced' in filename and filename.endswith(('.joblib', '.pkl', '.json')):
            print(f"  - {filename}")