# Quora Dataset TF-IDF Implementation with Advanced Text Cleaning

This notebook implements TF-IDF vectorization on the Quora dataset with:
- **Advanced custom text cleaning optimized for Quora question pairs**
- **Custom tokenization with semantic preservation**
- **Inverted index construction**
- **Model persistence using joblib**
- **Evaluation using MAP metric (target: ≥ 0.3)**

## Dataset Structure
- Documents: `/content/drive/MyDrive/downloads/docs.csv`
- Queries: `/content/drive/MyDrive/downloads/queries.csv`
- Relevance judgments: `/content/drive/MyDrive/downloads/qrels.csv`

## Key Optimizations for Quora
- Question-specific text preprocessing
- Preservation of question markers (what, how, why, etc.)
- Handling of duplicate question patterns
- Optimized n-gram features for question matching

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install nltk scikit-learn pandas numpy joblib tqdm

# Mount Google Drive so the paths under /content/drive/MyDrive used by the
# later cells (dataset CSVs, model output directory) are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import os
import joblib
import nltk
from collections import defaultdict, Counter
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize() load the 'punkt_tab' resource
# instead of the pickled 'punkt' models; without it tokenization raises
# LookupError. On older releases this download simply reports an unknown
# package and returns False, so requesting both is safe.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

print("Setup complete!")

## 2. Data Loading and Verification

In [None]:
# Define file paths
DATA_PATH = '/content/drive/MyDrive/downloads/'
DOCS_FILE = os.path.join(DATA_PATH, 'docs.csv')
QUERIES_FILE = os.path.join(DATA_PATH, 'queries.csv')
QRELS_FILE = os.path.join(DATA_PATH, 'qrels.csv')

# Verify files exist, reporting every file before deciding whether to stop
files_to_check = [DOCS_FILE, QUERIES_FILE, QRELS_FILE]
missing_files = []
for file_path in files_to_check:
    if os.path.exists(file_path):
        print(f"✓ Found: {file_path}")
    else:
        print(f"✗ Missing: {file_path}")
        missing_files.append(file_path)

# Fail fast with the complete list instead of letting read_csv fail on the
# first missing file only (same exception type read_csv would raise).
if missing_files:
    raise FileNotFoundError(
        "Required dataset file(s) not found: " + ", ".join(missing_files)
    )

# Load datasets
print("\nLoading datasets...")
docs_df = pd.read_csv(DOCS_FILE)
queries_df = pd.read_csv(QUERIES_FILE)
qrels_df = pd.read_csv(QRELS_FILE)

print(f"Documents: {len(docs_df)} rows")
print(f"Queries: {len(queries_df)} rows")
print(f"Qrels: {len(qrels_df)} rows")

# Display sample data (column names first: the preprocessing cell picks the
# id/text columns by name with positional fallbacks)
print("\nDocument columns:", docs_df.columns.tolist())
print("Query columns:", queries_df.columns.tolist())
print("Qrels columns:", qrels_df.columns.tolist())

print("\nSample document:")
print(docs_df.head(1))

print("\nSample query:")
print(queries_df.head(1))

print("\nSample qrel:")
print(qrels_df.head(1))

## 3. Advanced Text Cleaning for Quora Questions

In [None]:
class QuoraTextCleaner:
    """
    Text cleaner and tokenizer tuned for Quora question text.

    ``smart_clean_text`` normalizes a raw string: lower-casing, contraction
    expansion, question-phrase normalization, placeholder substitution for
    URLs / e-mails / numbers / repeated punctuation, and removal of other
    special characters. ``custom_tokenizer`` runs that cleaning, tokenizes
    with NLTK, drops stop words and lemmatizes.

    Attributes:
        stop_words (set): NLTK English stop words minus the question,
            negation and comparison words that must survive filtering.
        lemmatizer (WordNetLemmatizer): Lemmatizer applied to kept tokens.
        contractions (dict): Contraction -> expansion, applied in order.
        question_patterns (dict): Regex -> replacement, applied in order.
    """

    # Contraction table. Keys that start with an apostrophe, and "n't", are
    # suffix fragments (they only expand when glued to a preceding word);
    # every other key is a whole word. Order matters: whole-word special
    # cases ("won't", "can't") must run before the generic "n't" fragment.
    CONTRACTIONS = {
        "don't": "do not",
        "won't": "will not",
        "can't": "cannot",
        "n't": " not",
        "'re": " are",
        "'ve": " have",
        "'ll": " will",
        "'d": " would",
        "'m": " am",
        "what's": "what is",
        "that's": "that is",
        "there's": "there is",
        "here's": "here is",
        "where's": "where is",
        "how's": "how is"
    }

    # Question phrasings that are collapsed to one canonical form so that
    # paraphrased questions share terms.
    QUESTION_PATTERNS = {
        r'\bhow do i\b': 'how to',
        r'\bhow can i\b': 'how to',
        r'\bhow should i\b': 'how to',
        r'\bwhat is the best way to\b': 'how to',
        r'\bwhat are the ways to\b': 'how to',
        r'\bwhat are some\b': 'what are',
        r'\bwhat are the\b': 'what are'
    }

    def __init__(self):
        # Start from the NLTK English stop-word list ...
        self.stop_words = set(stopwords.words('english'))

        # ... and take these words OUT of it, i.e. keep them as tokens:
        # question words, modals/auxiliaries, negations, quantifiers and
        # comparatives carry meaning when matching questions.
        question_words = {
            'what', 'when', 'where', 'why', 'who', 'which', 'how',
            'can', 'could', 'would', 'should', 'will', 'shall',
            'do', 'does', 'did', 'is', 'are', 'was', 'were',
            'not', 'no', 'never', 'none', 'nothing', 'neither',
            'more', 'most', 'less', 'least', 'very', 'quite',
            'much', 'many', 'few', 'some', 'any', 'all',
            'best', 'better', 'good', 'bad', 'right', 'wrong'
        }
        self.stop_words = self.stop_words - question_words

        # Initialize lemmatizer
        self.lemmatizer = WordNetLemmatizer()

        # Per-instance copies so callers can still customize the tables
        self.contractions = dict(self.CONTRACTIONS)
        self.question_patterns = dict(self.QUESTION_PATTERNS)

    def _expand_contractions(self, text):
        """Expand contractions at word boundaries only.

        A plain substring replace would also rewrite quoted words
        (``'red'`` -> `` ared'``); anchoring suffix fragments to a preceding
        word character and every pattern to a trailing word boundary avoids
        that.
        """
        for contraction, expansion in self.contractions.items():
            is_suffix = contraction.startswith("'") or contraction == "n't"
            prefix = r"(?<=\w)" if is_suffix else r"\b"
            pattern = prefix + re.escape(contraction) + r"\b"
            # Function replacement: the expansion is inserted literally,
            # never interpreted as a regex replacement template.
            text = re.sub(pattern, lambda _match, _exp=expansion: _exp, text)
        return text

    def smart_clean_text(self, text):
        """
        Clean one piece of question text.

        Args:
            text (str): Input text to clean. Any non-string value (None,
                NaN, numbers, lists, ...) is treated as missing.

        Returns:
            str: Cleaned, whitespace-normalized text; "" for missing input.
                Placeholder words (URL, EMAIL, YEAR, DECIMAL, ORDINAL,
                NUMBER, EMPHASIS, MULTIQUEST, ELLIPSIS) are upper-case,
                everything else is lower-case.
        """
        # A str is never NA, so this single check covers None/NaN as well
        # and cannot raise on array-like input the way pd.isna() would.
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Typographic apostrophes inside a word ("what’s") become ASCII so
        # the contraction table applies; other curly quotes are stripped
        # with the rest of the special characters further down.
        text = re.sub(r"(?<=\w)[\u2018\u2019](?=\w)", "'", text)

        # Expand contractions
        text = self._expand_contractions(text)

        # Normalize question patterns
        for pattern, replacement in self.question_patterns.items():
            text = re.sub(pattern, replacement, text)

        # Remove or normalize specific patterns
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' URL ', text)
        # TLD class is letters only ('|' inside [...] is a literal pipe)
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', ' EMAIL ', text)
        text = re.sub(r'<.*?>', ' ', text)

        # Handle numbers more intelligently for questions.
        # Order matters: the generic NUMBER rule must run last.
        text = re.sub(r'\b(19|20)\d{2}\b', ' YEAR ', text)  # Years
        text = re.sub(r'\b\d+\.\d+\b', ' DECIMAL ', text)  # Decimals
        text = re.sub(r'\b\d+(?:st|nd|rd|th)\b', ' ORDINAL ', text)  # Ordinals
        text = re.sub(r'\b\d+\b', ' NUMBER ', text)  # Other numbers

        # Handle emphasis and punctuation (before punctuation is stripped)
        text = re.sub(r'[!]{2,}', ' EMPHASIS ', text)
        text = re.sub(r'[?]{2,}', ' MULTIQUEST ', text)
        text = re.sub(r'[.]{3,}', ' ELLIPSIS ', text)

        # Remove special characters but preserve hyphen, apostrophe, underscore
        text = re.sub(r'[^a-zA-Z0-9\s\-\'_]', ' ', text)

        # Handle hyphenated words carefully (important for compound terms):
        # "a-b" becomes "a b ab" so both the parts and the joined form match
        text = re.sub(r'\b(\w+)-(\w+)\b', r'\1 \2 \1\2', text)  # Keep both forms

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def custom_tokenizer(self, text):
        """
        Clean, tokenize, filter and lemmatize text.

        Tokens are lower-cased here, so the upper-case placeholders emitted
        by ``smart_clean_text`` produce the same terms whether the input is
        raw text or text that was already cleaned once (re-cleaning
        lower-cases the placeholders). Without this, a raw query yields
        ``YEAR`` while pre-cleaned documents yield ``year``.

        Args:
            text (str): Input text (raw or already cleaned)

        Returns:
            list: List of processed tokens
        """
        # Clean the text first
        cleaned_text = self.smart_clean_text(text)

        # Tokenize
        tokens = word_tokenize(cleaned_text)

        # Filter and lemmatize
        processed_tokens = []
        for token in tokens:
            # Unify case and drop stray edge apostrophes left by quotes or
            # possessives ("'s" -> "s", which the length check then removes)
            token = token.lower().strip("'")

            # Skip very short tokens or stopwords
            if len(token) < 2 or token in self.stop_words:
                continue

            # Skip tokens that are just underscores or dashes
            if re.match(r'^[_\-]+$', token):
                continue

            # Lemmatize
            processed_tokens.append(self.lemmatizer.lemmatize(token))

        return processed_tokens

# Initialize the text cleaner
# (single shared instance: later cells pass its methods to DataFrame.apply
# and to TfidfVectorizer as the tokenizer)
text_cleaner = QuoraTextCleaner()

# Test the cleaning function on a two-question sample that exercises the
# contraction table and the question-pattern normalization
sample_text = "What's the best way to learn machine learning? How can I improve my programming skills?"
cleaned_sample = text_cleaner.smart_clean_text(sample_text)
# custom_tokenizer cleans the raw text itself, so it is given sample_text
tokens_sample = text_cleaner.custom_tokenizer(sample_text)

print("Original text:", sample_text)
print("Cleaned text:", cleaned_sample)
print("Tokens:", tokens_sample)
print("\nQuora-optimized text cleaning functions ready!")

## 4. Data Preprocessing and Preparation

In [None]:
# Preprocess documents
print("Preprocessing documents...")

# Handle different possible column names for documents:
# prefer a named column, otherwise fall back to position
# (first column = id, second column = text)
doc_text_col = 'text' if 'text' in docs_df.columns else 'question' if 'question' in docs_df.columns else docs_df.columns[1]
doc_id_col = 'doc_id' if 'doc_id' in docs_df.columns else 'id' if 'id' in docs_df.columns else docs_df.columns[0]

docs_df['cleaned_text'] = docs_df[doc_text_col].apply(text_cleaner.smart_clean_text)
# IDs are compared as strings everywhere (documents, queries, qrels)
docs_df['doc_id'] = docs_df[doc_id_col].astype(str)

# Remove empty documents (missing/non-string text cleans to "")
docs_df = docs_df[docs_df['cleaned_text'].str.len() > 0]
print(f"Documents after cleaning: {len(docs_df)}")

# Preprocess queries
print("Preprocessing queries...")

# Handle different possible column names for queries (same strategy as above)
query_text_col = 'query' if 'query' in queries_df.columns else 'text' if 'text' in queries_df.columns else 'question' if 'question' in queries_df.columns else queries_df.columns[1]
query_id_col = 'query_id' if 'query_id' in queries_df.columns else 'id' if 'id' in queries_df.columns else queries_df.columns[0]

queries_df['cleaned_query'] = queries_df[query_text_col].apply(text_cleaner.smart_clean_text)
queries_df['query_id'] = queries_df[query_id_col].astype(str)

# Remove empty queries
queries_df = queries_df[queries_df['cleaned_query'].str.len() > 0]
print(f"Queries after cleaning: {len(queries_df)}")

# Prepare qrels: when the expected column names are absent, assume the first
# column holds query IDs and the second holds document IDs
qrels_columns = qrels_df.columns.tolist()
if 'query_id' not in qrels_columns:
    qrels_df['query_id'] = qrels_df[qrels_columns[0]].astype(str)
if 'doc_id' not in qrels_columns:
    qrels_df['doc_id'] = qrels_df[qrels_columns[1]].astype(str)
    
# Cast even pre-existing ID columns so they match the string IDs above
qrels_df['query_id'] = qrels_df['query_id'].astype(str)
qrels_df['doc_id'] = qrels_df['doc_id'].astype(str)

print("\nData preprocessing complete!")
print(f"Final dataset sizes:")
print(f"- Documents: {len(docs_df)}")
print(f"- Queries: {len(queries_df)}")
print(f"- Qrels: {len(qrels_df)}")

# Display sample of cleaned data
# (iloc, not loc: the filtered frames keep their original index labels)
print("\nSample cleaned document:")
print(f"Original: {docs_df.iloc[0][doc_text_col][:200]}...")
print(f"Cleaned: {docs_df.iloc[0]['cleaned_text'][:200]}...")

print("\nSample cleaned query:")
print(f"Original: {queries_df.iloc[0][query_text_col]}")
print(f"Cleaned: {queries_df.iloc[0]['cleaned_query']}")

## 5. TF-IDF Vectorization with Custom Preprocessing

In [None]:
# Create TF-IDF vectorizer with custom preprocessing
print("Creating TF-IDF vectorizer optimized for Quora questions...")

# NOTE: the tokenizer is a bound method of text_cleaner, so the fitted
# vectorizer holds a reference to that QuoraTextCleaner instance.
# custom_tokenizer runs smart_clean_text itself, so the already-cleaned
# document text below is cleaned a second time during fitting.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=None,  # We handle preprocessing ourselves
    tokenizer=text_cleaner.custom_tokenizer,  # Use our custom tokenizer
    token_pattern=None,  # Disable default tokenization
    lowercase=False,  # Already handled in custom tokenizer
    stop_words=None,  # Already handled in custom tokenizer
    max_features=12000,  # Optimized vocabulary size for questions
    min_df=1,  # Keep rare terms (important for specific questions)
    max_df=0.85,  # Remove very common terms
    ngram_range=(1, 2),  # Use unigrams and bigrams
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,  # Apply sublinear TF scaling
    norm='l2'  # L2 normalization
)

# Fit and transform documents
# (row i of tfidf_matrix corresponds to row i of the filtered docs_df)
print("Fitting TF-IDF vectorizer on documents...")
document_texts = docs_df['cleaned_text'].tolist()
tfidf_matrix = tfidf_vectorizer.fit_transform(document_texts)

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(tfidf_vectorizer.get_feature_names_out())}")
# Sparsity = share of matrix cells that are not stored (nnz = stored entries)
print(f"Matrix sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")

# Display sample features
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nSample features: {feature_names[:20]}")
# n-grams are joined with a space, so a space marks a multi-word feature
bigrams = [f for f in feature_names if ' ' in f]
print(f"Sample bigrams: {bigrams[:10]}")

# Show some question-specific terms
# (substring match, so e.g. "somewhat" also counts as containing "what")
question_terms = [f for f in feature_names if any(q in f for q in ['what', 'how', 'why', 'where', 'when'])]
print(f"Question-related terms: {question_terms[:15]}")

## 6. Inverted Index Construction

In [None]:
def build_inverted_index(tfidf_matrix, feature_names, doc_ids):
    """
    Turn a document-term TF-IDF matrix into a term -> postings mapping.

    Args:
        tfidf_matrix: Sparse TF-IDF matrix (rows = documents, columns = terms)
        feature_names: Term for each matrix column, indexable by column number
        doc_ids: Document ID for each matrix row, indexable by row number

    Returns:
        dict: ``{term: {doc_id: weight}}`` where each inner dict is ordered
        by descending weight; only strictly positive weights are kept
    """
    print("Building inverted index...")

    postings = defaultdict(dict)

    # COO exposes the stored entries as parallel (row, col, value) arrays
    entries = tfidf_matrix.tocoo()
    triples = zip(entries.row, entries.col, entries.data)

    for row, col, weight in tqdm(triples, total=entries.nnz, desc="Building index"):
        if not weight > 0:
            continue
        term = feature_names[col]
        doc_id = doc_ids[row]
        postings[term][doc_id] = float(weight)

    # Re-create every postings dict in best-first order
    return {
        term: dict(sorted(docs.items(), key=lambda item: item[1], reverse=True))
        for term, docs in postings.items()
    }

# Build inverted index
# doc_ids is positional: doc_ids[i] is the ID of row i of tfidf_matrix
doc_ids = docs_df['doc_id'].tolist()
inverted_index = build_inverted_index(tfidf_matrix, feature_names, doc_ids)

print(f"\nInverted index statistics:")
print(f"Number of terms: {len(inverted_index)}")
print(f"Average documents per term: {np.mean([len(docs) for docs in inverted_index.values()]):.2f}")

# Show most frequent terms (ranked by number of documents containing them)
most_frequent_terms = sorted(inverted_index.keys(), key=lambda x: len(inverted_index[x]), reverse=True)[:10]
print(f"Most frequent terms: {most_frequent_terms}")

# Display sample inverted index entries
# (first term in insertion order; assumes the index is non-empty)
sample_term = list(inverted_index.keys())[0]
print(f"\nSample inverted index entry for '{sample_term}':")
# Postings are stored best-first, so this shows the five top-weighted docs
sample_docs = dict(list(inverted_index[sample_term].items())[:5])
print(sample_docs)

## 7. Save Models and Data using Joblib

In [None]:
# Create output directory
output_dir = '/content/drive/MyDrive/quora_tfidf_models/'
os.makedirs(output_dir, exist_ok=True)

print("Saving models and data...")

# Save TF-IDF vectorizer
# NOTE: its tokenizer is a bound method of text_cleaner, so the saved file
# references the QuoraTextCleaner class; that class must be defined or
# importable wherever the file is loaded.
vectorizer_path = os.path.join(output_dir, 'tfidf_vectorizer.joblib')
joblib.dump(tfidf_vectorizer, vectorizer_path)
print(f"✓ Saved TF-IDF vectorizer to {vectorizer_path}")

# Save TF-IDF matrix
matrix_path = os.path.join(output_dir, 'tfidf_matrix.joblib')
joblib.dump(tfidf_matrix, matrix_path)
print(f"✓ Saved TF-IDF matrix to {matrix_path}")

# Save inverted index
index_path = os.path.join(output_dir, 'inverted_index.joblib')
joblib.dump(inverted_index, index_path)
print(f"✓ Saved inverted index to {index_path}")

# Save document mappings
# (doc_ids gives the matrix row order; the DataFrames are the cleaned,
# filtered versions produced by the preprocessing cell)
doc_mapping = {
    'doc_ids': doc_ids,
    'docs_df': docs_df,
    'queries_df': queries_df,
    'qrels_df': qrels_df
}
mapping_path = os.path.join(output_dir, 'document_mappings.joblib')
joblib.dump(doc_mapping, mapping_path)
print(f"✓ Saved document mappings to {mapping_path}")

# Save text cleaner
cleaner_path = os.path.join(output_dir, 'text_cleaner.joblib')
joblib.dump(text_cleaner, cleaner_path)
print(f"✓ Saved text cleaner to {cleaner_path}")

print(f"\nAll models saved successfully to {output_dir}")

# Display saved files
# (lists everything in output_dir, including files from earlier runs)
saved_files = os.listdir(output_dir)
print(f"\nSaved files:")
for file in saved_files:
    file_path = os.path.join(output_dir, file)
    file_size = os.path.getsize(file_path) / (1024 * 1024)  # MB
    print(f"- {file}: {file_size:.2f} MB")

## 8. Search Function Implementation

In [None]:
def search_documents(query, tfidf_vectorizer, tfidf_matrix, doc_ids, top_k=1000):
    """
    Rank documents against a query by TF-IDF cosine similarity.

    Args:
        query (str): Search query
        tfidf_vectorizer: Fitted TF-IDF vectorizer used to embed the query
        tfidf_matrix: Document TF-IDF matrix (one row per document)
        doc_ids (list): Document IDs, positionally aligned with the matrix rows
        top_k (int): Maximum number of results to return

    Returns:
        list: (doc_id, score) tuples, best first; documents whose similarity
        is not strictly positive are left out, so fewer than ``top_k``
        entries may come back
    """
    # Embed the query in the same vector space as the documents
    scores = cosine_similarity(tfidf_vectorizer.transform([query]), tfidf_matrix).flatten()

    # argsort is ascending: reverse it, then keep the first top_k positions
    ranked_rows = np.argsort(scores)[::-1][:top_k]

    return [(doc_ids[row], scores[row]) for row in ranked_rows if scores[row] > 0]

def search_with_inverted_index(query, inverted_index, tfidf_vectorizer, doc_ids, top_k=1000):
    """
    Rank documents by summing their indexed weights over the query's terms.

    Args:
        query (str): Search query
        inverted_index (dict): Mapping ``term -> {doc_id: weight}``
        tfidf_vectorizer: Fitted TF-IDF vectorizer; only its analyzer is used
        doc_ids (list): List of document IDs (accepted for signature parity
            with search_documents; not read here)
        top_k (int): Maximum number of results to return

    Returns:
        list: (doc_id, score) tuples ordered by descending score
    """
    # Same analysis pipeline (tokenizer + n-grams) the index was built with
    analyze = tfidf_vectorizer.build_analyzer()

    # A term repeated in the query contributes once per occurrence
    totals = defaultdict(float)
    for term in analyze(query):
        postings = inverted_index.get(term)
        if postings is None:
            continue
        for doc_id, weight in postings.items():
            totals[doc_id] += weight

    # Stable sort: equal scores keep first-seen order
    ranking = list(totals.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking[:top_k]

# Test search function
# (raw, uncleaned query: the vectorizer's tokenizer does the cleaning)
test_query = "How to learn programming efficiently?"
print(f"Testing search with query: '{test_query}'")

# Search using TF-IDF matrix (scores are cosine similarities)
results_tfidf = search_documents(test_query, tfidf_vectorizer, tfidf_matrix, doc_ids, top_k=5)
print(f"\nTop 5 results (TF-IDF):")
for i, (doc_id, score) in enumerate(results_tfidf, 1):
    print(f"{i}. Doc {doc_id}: {score:.4f}")

# Search using inverted index
# (scores are sums of indexed term weights, so they are not on the same
# scale as the cosine similarities above)
results_index = search_with_inverted_index(test_query, inverted_index, tfidf_vectorizer, doc_ids, top_k=5)
print(f"\nTop 5 results (Inverted Index):")
for i, (doc_id, score) in enumerate(results_index, 1):
    print(f"{i}. Doc {doc_id}: {score:.4f}")

print("\nSearch functions implemented successfully!")

## 9. Evaluation Implementation

In [None]:
def calculate_average_precision(retrieved_docs, relevant_docs):
    """
    Compute Average Precision (AP) for one ranked result list.

    Args:
        retrieved_docs (list): Retrieved document IDs, best first
        relevant_docs (set): IDs of the documents judged relevant

    Returns:
        float: Sum of precision@k over the ranks k holding a relevant
        document, divided by the total number of relevant documents;
        0.0 when there are no relevant documents or none was retrieved
    """
    if not relevant_docs:
        return 0.0

    # 1-based ranks at which a relevant document appears
    hit_ranks = [
        rank for rank, doc_id in enumerate(retrieved_docs, 1)
        if doc_id in relevant_docs
    ]
    if not hit_ranks:
        return 0.0

    # The i-th hit (1-based) found at rank k gives precision i / k
    precisions = [hits / rank for hits, rank in enumerate(hit_ranks, 1)]
    return sum(precisions) / len(relevant_docs)

def calculate_map(queries_df, qrels_df, search_function, **search_kwargs):
    """
    Compute Mean Average Precision (MAP) over the judged queries.

    Args:
        queries_df (pd.DataFrame): Queries; reads the 'query_id' and
            'cleaned_query' columns
        qrels_df (pd.DataFrame): Relevance judgments; reads the 'query_id'
            and 'doc_id' columns
        search_function (callable): Called as
            ``search_function(query_text, **search_kwargs)``; must return
            (doc_id, score) pairs in rank order
        **search_kwargs: Passed through to ``search_function`` unchanged

    Returns:
        tuple: (MAP score, list of per-query AP scores). Queries without any
        relevance judgment are skipped and do not count towards the mean.
    """
    # query_id -> set of relevant doc_ids
    relevant_by_query = qrels_df.groupby('query_id')['doc_id'].apply(set).to_dict()

    print("Calculating MAP...")

    ap_scores = []
    progress = tqdm(queries_df.iterrows(), total=len(queries_df), desc="Evaluating queries")
    for _, row in progress:
        query_id = row['query_id']
        query_text = row['cleaned_query']

        relevant_docs = relevant_by_query.get(query_id, set())
        if relevant_docs:
            ranked = search_function(query_text, **search_kwargs)
            ranked_ids = [doc_id for doc_id, _ in ranked]
            ap_scores.append(calculate_average_precision(ranked_ids, relevant_docs))

    # An empty score list would make np.mean return NaN
    map_score = np.mean(ap_scores) if ap_scores else 0.0

    return map_score, ap_scores

def evaluate_system(queries_df, qrels_df, tfidf_vectorizer, tfidf_matrix, inverted_index, doc_ids):
    """
    Evaluate both retrieval strategies with MAP.

    Args:
        queries_df (pd.DataFrame): DataFrame with queries
        qrels_df (pd.DataFrame): DataFrame with relevance judgments
        tfidf_vectorizer: Fitted TF-IDF vectorizer
        tfidf_matrix: Document TF-IDF matrix
        inverted_index (dict): Inverted index
        doc_ids (list): List of document IDs

    Returns:
        dict: ``{'tfidf_matrix': {...}, 'inverted_index': {...}}`` where each
        entry holds 'MAP', 'AP_scores' and 'num_queries'
    """
    # Arguments both search functions take
    shared_kwargs = {
        'tfidf_vectorizer': tfidf_vectorizer,
        'doc_ids': doc_ids,
        'top_k': 1000,
    }

    # (result key, progress message, search function, strategy-specific kwargs)
    strategies = (
        ('tfidf_matrix', "Evaluating TF-IDF matrix search...",
         search_documents, {'tfidf_matrix': tfidf_matrix}),
        ('inverted_index', "Evaluating inverted index search...",
         search_with_inverted_index, {'inverted_index': inverted_index}),
    )

    results = {}
    for key, message, search_fn, extra_kwargs in strategies:
        print(message)
        map_score, ap_scores = calculate_map(
            queries_df, qrels_df, search_fn, **shared_kwargs, **extra_kwargs
        )
        results[key] = {
            'MAP': map_score,
            'AP_scores': ap_scores,
            'num_queries': len(ap_scores)
        }

    return results

print("Evaluation functions implemented successfully!")

## 10. Run Comprehensive Evaluation

In [None]:
# Run comprehensive evaluation
print("Starting comprehensive evaluation...")
print("=" * 50)

evaluation_results = evaluate_system(
    queries_df, qrels_df, tfidf_vectorizer, tfidf_matrix, inverted_index, doc_ids
)

print("\n" + "=" * 50)
print("EVALUATION RESULTS")
print("=" * 50)

for method, results in evaluation_results.items():
    print(f"\n{method.upper()} SEARCH:")
    print(f"MAP Score: {results['MAP']:.4f}")
    print(f"Number of queries evaluated: {results['num_queries']}")
    # min()/max() raise on an empty list, which happens when no query has a
    # relevance judgment (e.g. query IDs and qrels IDs do not line up)
    if results['AP_scores']:
        print(f"Average Precision scores - Min: {min(results['AP_scores']):.4f}, Max: {max(results['AP_scores']):.4f}")
        print(f"Standard deviation: {np.std(results['AP_scores']):.4f}")
    else:
        print("No queries with relevance judgments were evaluated.")

    # Target is MAP >= 0.3 (inclusive), matching the final summary cell
    if results['MAP'] >= 0.3:
        print(f"✅ MAP ≥ 0.3 TARGET ACHIEVED! ({results['MAP']:.4f})")
    else:
        print(f"❌ MAP < 0.3 target not met ({results['MAP']:.4f})")

print("\n" + "=" * 50)
print("PERFORMANCE ANALYSIS")
print("=" * 50)

# Performance breakdown: how many queries clear each AP threshold
tfidf_ap_scores = evaluation_results['tfidf_matrix']['AP_scores']
index_ap_scores = evaluation_results['inverted_index']['AP_scores']

print(f"\nDetailed Performance Analysis:")
print(f"TF-IDF Matrix Search:")
print(f"  - Queries with AP > 0.5: {sum(1 for ap in tfidf_ap_scores if ap > 0.5)}")
print(f"  - Queries with AP > 0.3: {sum(1 for ap in tfidf_ap_scores if ap > 0.3)}")
print(f"  - Queries with AP > 0.1: {sum(1 for ap in tfidf_ap_scores if ap > 0.1)}")
print(f"  - Queries with AP = 0: {sum(1 for ap in tfidf_ap_scores if ap == 0)}")

print(f"\nInverted Index Search:")
print(f"  - Queries with AP > 0.5: {sum(1 for ap in index_ap_scores if ap > 0.5)}")
print(f"  - Queries with AP > 0.3: {sum(1 for ap in index_ap_scores if ap > 0.3)}")
print(f"  - Queries with AP > 0.1: {sum(1 for ap in index_ap_scores if ap > 0.1)}")
print(f"  - Queries with AP = 0: {sum(1 for ap in index_ap_scores if ap == 0)}")

# Save evaluation results
eval_results_path = os.path.join(output_dir, 'evaluation_results.joblib')
joblib.dump(evaluation_results, eval_results_path)
print(f"\n✓ Evaluation results saved to {eval_results_path}")

print("\n" + "=" * 50)
print("EVALUATION COMPLETE!")
print("=" * 50)

## 11. Optimization for Better MAP Performance

In [None]:
# If MAP is below the 0.3 target, retry with a wider TF-IDF configuration
current_map = evaluation_results['tfidf_matrix']['MAP']

print("PERFORMANCE OPTIMIZATION")
print("=" * 40)
print(f"\nCurrent MAP: {current_map:.4f}")
print(f"Target MAP: 0.3000")

if current_map < 0.3:
    print("\n🔧 IMPLEMENTING OPTIMIZATIONS...")

    # Strategy 1: Adjusted TF-IDF parameters
    print("\n1. Testing optimized TF-IDF parameters...")

    optimized_vectorizer = TfidfVectorizer(
        preprocessor=None,
        tokenizer=text_cleaner.custom_tokenizer,
        token_pattern=None,
        lowercase=False,
        stop_words=None,
        max_features=15000,  # Increased vocabulary
        min_df=1,  # Keep all terms
        max_df=0.8,  # More restrictive on common terms
        ngram_range=(1, 3),  # Include trigrams
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        norm='l2'
    )

    # Fit optimized vectorizer
    optimized_tfidf_matrix = optimized_vectorizer.fit_transform(document_texts)
    print(f"Optimized TF-IDF matrix shape: {optimized_tfidf_matrix.shape}")

    # Evaluate optimized system
    print("Evaluating optimized system...")
    optimized_map, optimized_ap_scores = calculate_map(
        queries_df, qrels_df, search_documents,
        tfidf_vectorizer=optimized_vectorizer,
        tfidf_matrix=optimized_tfidf_matrix,
        doc_ids=doc_ids,
        top_k=1000
    )

    print(f"\nOptimization Results:")
    print(f"Original MAP: {current_map:.4f}")
    print(f"Optimized MAP: {optimized_map:.4f}")
    print(f"Improvement: {optimized_map - current_map:.4f}")

    # Target is inclusive (MAP >= 0.3), matching the final summary cell
    target_met = optimized_map >= 0.3
    improved = optimized_map > current_map

    if target_met:
        print(f"\n🎉 SUCCESS! MAP ≥ 0.3 TARGET ACHIEVED!")

        # Save optimized models (separate files; the originals are kept)
        optimized_vectorizer_path = os.path.join(output_dir, 'optimized_tfidf_vectorizer.joblib')
        optimized_matrix_path = os.path.join(output_dir, 'optimized_tfidf_matrix.joblib')

        joblib.dump(optimized_vectorizer, optimized_vectorizer_path)
        joblib.dump(optimized_tfidf_matrix, optimized_matrix_path)

        print(f"✓ Saved optimized models to {output_dir}")

    elif improved:
        print(f"\n⚡ Improvement achieved but still below target.")
        print(f"\n📝 Additional strategies to try:")
        print(f"   - Query expansion using word similarity")
        print(f"   - Different text preprocessing approaches")
        print(f"   - BM25 scoring instead of TF-IDF")
        print(f"   - Learning-to-rank methods")

    else:
        print(f"\n⚠️ No improvement with parameter optimization.")
        print(f"Consider more advanced techniques.")

    if target_met or improved:
        # Promote the better model to be the main one for the later cells
        tfidf_vectorizer = optimized_vectorizer
        tfidf_matrix = optimized_tfidf_matrix
        current_map = optimized_map

        # The inverted index and feature list were derived from the previous
        # vectorizer; rebuild them so they describe the promoted model
        # (different vocabulary and n-gram range) instead of going stale.
        feature_names = tfidf_vectorizer.get_feature_names_out()
        inverted_index = build_inverted_index(tfidf_matrix, feature_names, doc_ids)

else:
    print(f"\n🎉 EXCELLENT! MAP ≥ 0.3 TARGET ACHIEVED!")
    print(f"The system is performing well with current configuration.")

print(f"\nFinal MAP Score: {current_map:.4f}")
print("\n" + "=" * 40)
print("OPTIMIZATION COMPLETE!")
print("=" * 40)

## 12. Sample Query Testing

In [None]:
# Test with sample queries to demonstrate the system
print("SAMPLE QUERY TESTING")
print("=" * 40)

# Select some sample queries (first five rows of the cleaned query frame)
sample_queries = queries_df.head(5)

for _, query_row in sample_queries.iterrows():
    query_id = query_row['query_id']
    original_query = query_row[query_text_col]
    cleaned_query = query_row['cleaned_query']
    
    print(f"\nQuery ID: {query_id}")
    print(f"Original: {original_query}")
    print(f"Cleaned: {cleaned_query}")
    
    # Get search results
    # (uses whichever vectorizer/matrix the optimization cell left in place)
    results = search_documents(cleaned_query, tfidf_vectorizer, tfidf_matrix, doc_ids, top_k=5)
    
    # Get relevant documents from qrels
    relevant_docs = set(qrels_df[qrels_df['query_id'] == query_id]['doc_id'].astype(str))
    
    print(f"Relevant documents: {len(relevant_docs)}")
    print(f"Top 5 search results:")
    
    for i, (doc_id, score) in enumerate(results[:5], 1):
        # ✓ / ✗ marks whether the qrels list this document for the query
        relevance = "✓" if doc_id in relevant_docs else "✗"
        print(f"  {i}. Doc {doc_id} ({relevance}): {score:.4f}")
        
        # Show snippet of the document (original text, truncated to 200 chars)
        if doc_id in docs_df['doc_id'].values:
            doc_text = docs_df[docs_df['doc_id'] == doc_id][doc_text_col].iloc[0]
            snippet = doc_text[:200] + "..." if len(doc_text) > 200 else doc_text
            print(f"     \"{snippet}\"")
    
    print("-" * 40)

print("\nSample query testing complete!")

## 13. Final Summary and Results

In [None]:
# Final report: prints dataset sizes, model configuration, the final MAP and
# the saved artifacts, all read from the notebook state left by earlier cells.
print("FINAL SUMMARY - QUORA TF-IDF IMPLEMENTATION")
print("=" * 60)

print(f"\n📊 DATASET STATISTICS:")
print(f"Documents processed: {len(docs_df)}")
print(f"Queries processed: {len(queries_df)}")
print(f"Relevance judgments: {len(qrels_df)}")

print(f"\n🔧 MODEL CONFIGURATION:")
print(f"TF-IDF Features: {tfidf_matrix.shape[1]}")
# Sparsity = share of matrix cells that are not stored (nnz = stored entries)
print(f"Matrix sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])) * 100:.2f}%")
print(f"Inverted index terms: {len(inverted_index)}")
print(f"N-gram range: {tfidf_vectorizer.ngram_range}")

print(f"\n📈 PERFORMANCE RESULTS:")
# current_map is set by the optimization cell
print(f"Final MAP Score: {current_map:.4f}")

print(f"\n💾 SAVED MODELS:")
saved_files = os.listdir(output_dir)
for file in saved_files:
    print(f"- {file}")

print(f"\n🎯 TARGET ACHIEVEMENT:")
# Inclusive comparison: a MAP of exactly 0.3 counts as meeting the target
if current_map >= 0.3:
    print(f"✅ SUCCESS! MAP score: {current_map:.4f} ≥ 0.3")
    print(f"🎉 Quora TF-IDF system meets performance requirements!")
else:
    print(f"❌ Target not fully met. MAP score: {current_map:.4f} < 0.3")
    print(f"⚡ Consider implementing advanced optimization techniques.")

print(f"\n🚀 SYSTEM FEATURES:")
print(f"✓ Advanced Quora-specific text cleaning")
print(f"✓ Custom tokenization with semantic preservation")
print(f"✓ Optimized TF-IDF vectorization")
print(f"✓ Efficient inverted index")
print(f"✓ Comprehensive MAP evaluation")
print(f"✓ Complete model persistence")

print(f"\n📂 SYSTEM READY FOR USE!")
print(f"All models saved to: {output_dir}")
print(f"\nTo use the system:")
print(f"1. Load models using joblib.load()")
print(f"2. Use search_documents() for new queries")
print(f"3. Inverted index provides faster term-based search")

print("\n" + "=" * 60)
print("QUORA TF-IDF IMPLEMENTATION COMPLETE!")
print("=" * 60)

## 14. Usage Example for Future Use

In [None]:
# Example code for loading and using the saved models
print("USAGE EXAMPLE FOR FUTURE USE")
print("=" * 40)

# The example is kept as a string so it can be both printed and written to a
# standalone .py file next to the saved models.
example_code = '''
# How to load and use the saved Quora TF-IDF models
#
# IMPORTANT: the saved vectorizer and text cleaner are pickles that refer to
# the QuoraTextCleaner class (the vectorizer's tokenizer is one of its bound
# methods). Define or import QuoraTextCleaner -- and download the NLTK
# resources it uses (stopwords, punkt / punkt_tab, wordnet, omw-1.4) --
# BEFORE calling joblib.load(), otherwise loading fails with
# "AttributeError: Can't get attribute 'QuoraTextCleaner'".
import joblib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load saved models
output_dir = '/content/drive/MyDrive/quora_tfidf_models/'

tfidf_vectorizer = joblib.load(output_dir + 'tfidf_vectorizer.joblib')
tfidf_matrix = joblib.load(output_dir + 'tfidf_matrix.joblib')
inverted_index = joblib.load(output_dir + 'inverted_index.joblib')
doc_mappings = joblib.load(output_dir + 'document_mappings.joblib')
text_cleaner = joblib.load(output_dir + 'text_cleaner.joblib')

# Extract document IDs
doc_ids = doc_mappings['doc_ids']

# Search function for new queries
def search_quora_questions(query, top_k=10):
    """Search for similar Quora questions"""
    # Transform query using the fitted vectorizer
    query_vector = tfidf_vectorizer.transform([query])
    
    # Calculate similarities
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # Get top results
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        if similarities[idx] > 0:
            results.append((doc_ids[idx], similarities[idx]))
    
    return results

# Example usage
query = "How to learn machine learning effectively?"
results = search_quora_questions(query)
print(f"Top results for '{query}':")
for doc_id, score in results[:5]:
    print(f"Doc {doc_id}: {score:.4f}")
'''

print("Copy and save this code for future use:")
print(example_code)

# Save the example code to a file
# (explicit encoding so the result does not depend on the platform default)
example_file_path = os.path.join(output_dir, 'quora_usage_example.py')
with open(example_file_path, 'w', encoding='utf-8') as f:
    f.write(example_code)

print(f"\n✓ Usage example saved to: {example_file_path}")
print("\nThis completes the Quora TF-IDF implementation!")