<a href="https://colab.research.google.com/github/yourusername/custom-search-engine/blob/main/backend/tfidf_quora_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF Training on Quora Dataset

This notebook trains TF-IDF models on the Quora dataset for use in the custom search engine.


## Installation and Setup

In [None]:
# Install required packages
!pip install scikit-learn numpy pandas matplotlib seaborn nltk joblib tqdm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from tqdm import tqdm
import json
import os
from typing import List, Dict, Any
import time

# Download the NLTK resources used by the preprocessing pipeline.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; 'punkt' is kept so older releases continue to work. Without
# 'punkt_tab', tokenization raises LookupError on a fresh Colab kernel.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Plot styling shared by every figure in the notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Data Loading and Preprocessing

In [None]:
# Load Quora dataset
# Adjust the path based on where you upload your Quora dataset
# Supported formats: CSV, JSON, JSONL (selected by file extension)

def load_quora_dataset(file_path):
    """
    Load a Quora dataset file into a pandas DataFrame.

    The format is chosen from the file extension (case-insensitive):
    ``.csv``, ``.json`` or ``.jsonl`` (one JSON object per line).
    Expected columns: id, text (or question/answer)

    Args:
        file_path: Path of the dataset file to read.

    Returns:
        pandas.DataFrame holding the file contents.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Compare on a lowercased copy so that e.g. 'DATA.CSV' is accepted too.
    lowered_path = str(file_path).lower()

    if lowered_path.endswith('.csv'):
        return pd.read_csv(file_path)

    if lowered_path.endswith('.json'):
        return pd.read_json(file_path)

    if lowered_path.endswith('.jsonl'):
        # Read explicitly as UTF-8 (the platform default encoding may differ)
        # and skip blank lines, which json.loads would otherwise reject.
        with open(file_path, 'r', encoding='utf-8') as f:
            records = [json.loads(line) for line in f if line.strip()]
        return pd.DataFrame(records)

    raise ValueError(
        f"Unsupported file format: {file_path!r} (expected .csv, .json or .jsonl)"
    )

# Upload your Quora dataset file to Colab
from google.colab import files
uploaded = files.upload()

# `uploaded` is keyed by file name. If nothing was uploaded (e.g. the dialog
# was dismissed) fail with an actionable message instead of a bare IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select your Quora dataset file."
    )

# Load the dataset from the first uploaded file
file_name = next(iter(uploaded))
quora_df = load_quora_dataset(file_name)

print(f"Dataset shape: {quora_df.shape}")
print("\nColumns:", quora_df.columns.tolist())
print("\nFirst few rows:")
quora_df.head()

In [None]:
# Data preprocessing and cleaning
class QuoraTextProcessor:
    """Text normalisation pipeline applied to documents and queries before TF-IDF.

    The pipeline lowercases the text, strips HTML tags and non-alphanumeric
    characters, tokenizes with NLTK, drops English stopwords and tokens shorter
    than two characters, and reduces each remaining token with WordNet
    lemmatization followed by Porter stemming.
    """

    def __init__(self):
        """Create the NLTK helpers and load the English stopword list."""
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Lowercase ``text`` and reduce it to space-separated ``[a-z0-9]`` runs.

        Args:
            text: Raw value from the dataset; non-strings are converted with ``str``.

        Returns:
            The cleaned string, or ``""`` when ``text`` is missing (``pd.isna``).
        """
        if pd.isna(text):
            return ""

        # Convert to string and lowercase
        text = str(text).lower()

        # Replace HTML tags with a space rather than deleting them, otherwise
        # the words on either side of a tag fuse ("foo<br>bar" -> "foobar").
        text = re.sub(r'<[^>]+>', ' ', text)

        # Remove special characters, keep only alphanumeric and spaces
        text = re.sub(r'[^a-z0-9\s]', ' ', text)

        # Collapse runs of whitespace left behind by the substitutions above
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def tokenize_and_process(self, text):
        """Tokenize cleaned text and normalise every kept token.

        Args:
            text: Output of ``clean_text``.

        Returns:
            List of stemmed tokens; empty when ``text`` is empty.
        """
        if not text:
            return []

        # Tokenize
        tokens = word_tokenize(text)

        processed_tokens = []
        for token in tokens:
            # Filter out short tokens and non-alphanumeric
            if len(token) < 2 or not token.isalnum():
                continue

            # Remove stopwords
            if token in self.stop_words:
                continue

            # Apply lemmatization then stemming
            lemmatized = self.lemmatizer.lemmatize(token)
            stemmed = self.stemmer.stem(lemmatized)
            processed_tokens.append(stemmed)

        return processed_tokens

    def process_text(self, text):
        """Run the full pipeline and return the processed tokens joined by spaces."""
        cleaned = self.clean_text(text)
        tokens = self.tokenize_and_process(cleaned)
        return ' '.join(tokens)

# Build the text processor shared by indexing and querying
processor = QuoraTextProcessor()

# Candidate names for the text column, in order of preference.
# Adjust these based on your dataset structure
text_columns = ['text', 'question', 'answer', 'content']
available_text_cols = [
    candidate for candidate in text_columns if candidate in quora_df.columns
]

if available_text_cols:
    # Take the first candidate that exists in the dataset
    text_col = available_text_cols[0]
else:
    # None of the usual names is present: ask the user which column to use
    print("Available columns:", quora_df.columns.tolist())
    text_col = input("Enter the name of the text column: ")

print(f"Using text column: {text_col}")

# Work on a fixed-seed random subset for faster experimentation
sample_size = min(10000, len(quora_df))  # Adjust as needed
quora_sample = quora_df.sample(n=sample_size, random_state=42).copy()

print(f"Processing {len(quora_sample)} documents...")
quora_sample['processed_text'] = quora_sample[text_col].apply(processor.process_text)

# Keep only the rows whose processed text is non-empty
has_processed_text = quora_sample['processed_text'].str.len() > 0
quora_sample = quora_sample[has_processed_text]
print(f"After filtering: {len(quora_sample)} documents")

## TF-IDF Model Training

In [None]:
# Settings passed straight through to TfidfVectorizer
tfidf_config = dict(
    max_features=10000,     # Limit vocabulary size
    ngram_range=(1, 2),     # Use unigrams and bigrams
    min_df=2,               # Ignore terms in less than 2 documents
    max_df=0.8,             # Ignore terms in more than 80% of documents
    sublinear_tf=True,      # Apply sublinear TF scaling
    use_idf=True,           # Enable IDF
    smooth_idf=True,        # Smooth IDF weights
    norm='l2',              # L2 normalization
)

print("Training TF-IDF vectorizer...")
print(f"Configuration: {tfidf_config}")

# Build the vectorizer from the configuration above
vectorizer = TfidfVectorizer(**tfidf_config)

# Fit on the processed documents and time the fit+transform step
start_time = time.time()
tfidf_matrix = vectorizer.fit_transform(quora_sample['processed_text'])
training_time = time.time() - start_time

# Density = stored (non-zero) entries divided by the total number of cells
n_matrix_rows, n_matrix_cols = tfidf_matrix.shape
matrix_density = tfidf_matrix.nnz / (n_matrix_rows * n_matrix_cols)

print(f"\nTraining completed in {training_time:.2f} seconds")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Matrix density: {matrix_density:.4f}")

In [None]:
# Inspect the learned vocabulary and its IDF weights
feature_names = vectorizer.get_feature_names_out()
idf_scores = vectorizer.idf_

# One row per term, highest IDF first
vocab_df = (
    pd.DataFrame({'term': feature_names, 'idf': idf_scores})
    .sort_values('idf', ascending=False)
)

print("Top 20 terms by IDF score (most discriminative):")
print(vocab_df.head(20))

print("\nBottom 20 terms by IDF score (most common):")
print(vocab_df.tail(20))

# IDF distribution: histogram on the left, box plot on the right
idf_fig, (idf_hist_ax, idf_box_ax) = plt.subplots(1, 2, figsize=(12, 6))

idf_hist_ax.hist(idf_scores, bins=50, alpha=0.7, edgecolor='black')
idf_hist_ax.set_xlabel('IDF Score')
idf_hist_ax.set_ylabel('Frequency')
idf_hist_ax.set_title('Distribution of IDF Scores')
idf_hist_ax.grid(True, alpha=0.3)

idf_box_ax.boxplot(idf_scores)
idf_box_ax.set_ylabel('IDF Score')
idf_box_ax.set_title('IDF Score Distribution (Box Plot)')
idf_box_ax.grid(True, alpha=0.3)

idf_fig.tight_layout()
plt.show()

# Summary statistics of the IDF weights
print(f"\nIDF Statistics:")
for stat_label, stat_fn in (
    ("Mean", np.mean),
    ("Median", np.median),
    ("Std", np.std),
    ("Min", np.min),
    ("Max", np.max),
):
    print(f"{stat_label}: {stat_fn(idf_scores):.3f}")

## Model Evaluation and Testing

In [None]:
# Create search function for testing
def search_documents(query, top_k=10, return_scores=True):
    """
    Search the sampled documents using the trained TF-IDF model.

    Uses the notebook-level objects ``processor``, ``vectorizer``,
    ``tfidf_matrix``, ``quora_sample`` and ``text_col``.

    Args:
        query: Free-text query; it goes through ``processor.process_text``
            before being vectorized.
        top_k: Maximum number of results to return. Values <= 0 return ``[]``.
        return_scores: When True (default) each result includes its cosine
            similarity under ``'score'``; when False that key is omitted.

    Returns:
        List of dicts ordered by decreasing similarity, containing only
        documents with similarity > 0. Each dict has ``'index'`` (row position
        in ``quora_sample`` / ``tfidf_matrix``), optionally ``'score'``,
        ``'original_text'``, ``'processed_text'`` and the remaining columns of
        ``quora_sample``.
    """
    # A negative top_k would otherwise slice "all but the last k" results.
    if top_k <= 0:
        return []

    # Process the query with the same pipeline used for the documents
    processed_query = processor.process_text(query)

    # Transform query to TF-IDF vector
    query_vector = vectorizer.transform([processed_query])

    # Calculate cosine similarity against every document row
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Row positions of the top-k most similar documents, best first
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []
    for idx in top_indices:
        if similarities[idx] > 0:  # Only include results with positive similarity
            doc_data = quora_sample.iloc[idx]
            result = {'index': idx}
            if return_scores:
                result['score'] = similarities[idx]
            result['original_text'] = doc_data[text_col]
            result['processed_text'] = doc_data['processed_text']

            # Add any additional metadata. setdefault keeps a dataset column
            # that happens to be named 'index' or 'score' from overwriting
            # the computed fields above.
            for col in quora_sample.columns:
                if col not in ['processed_text', text_col]:
                    result.setdefault(col, doc_data[col])

            results.append(result)

    return results

# Sample queries used to smoke-test the search function
test_queries = [
    "machine learning algorithms",
    "python programming",
    "data science",
    "artificial intelligence",
    "web development",
]

print("Testing search functionality with sample queries:")
print("=" * 60)

for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 40)

    # Show at most the three best matches per query
    results = search_documents(query, top_k=3)

    if not results:
        print("   No relevant results found.")

    for rank, hit in enumerate(results, start=1):
        print(f"{rank}. Score: {hit['score']:.4f}")
        print(f"   Text: {hit['original_text'][:100]}...")
        print()

    print("=" * 40)

In [None]:
# Analyze model performance and statistics
print("Model Performance Analysis")
print("=" * 50)

# Document length statistics (number of processed tokens per document)
doc_lengths = [len(text.split()) for text in quora_sample['processed_text']]
print(f"Document lengths (words):")
print(f"  Mean: {np.mean(doc_lengths):.1f}")
print(f"  Median: {np.median(doc_lengths):.1f}")
print(f"  Std: {np.std(doc_lengths):.1f}")
print(f"  Min: {np.min(doc_lengths)}")
print(f"  Max: {np.max(doc_lengths)}")

# TF-IDF matrix statistics
# The sparse matrix stores three arrays (values, column indices and row
# pointers); counting only `.data` under-reports its memory footprint.
tfidf_matrix_bytes = (
    tfidf_matrix.data.nbytes
    + tfidf_matrix.indices.nbytes
    + tfidf_matrix.indptr.nbytes
)
tfidf_matrix_cells = tfidf_matrix.shape[0] * tfidf_matrix.shape[1]

print(f"\nTF-IDF Matrix Statistics:")
print(f"  Shape: {tfidf_matrix.shape}")
print(f"  Non-zero elements: {tfidf_matrix.nnz:,}")
print(f"  Sparsity: {1 - (tfidf_matrix.nnz / tfidf_matrix_cells):.4f}")
print(f"  Memory usage: {tfidf_matrix_bytes / 1024 / 1024:.2f} MB")

# Vocabulary statistics (bigram features contain a space, unigrams do not)
print(f"\nVocabulary Statistics:")
print(f"  Total terms: {len(vectorizer.vocabulary_):,}")
print(f"  Unigrams: {sum(1 for term in feature_names if ' ' not in term):,}")
print(f"  Bigrams: {sum(1 for term in feature_names if ' ' in term):,}")

# Plot document length distribution
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.hist(doc_lengths, bins=50, alpha=0.7, edgecolor='black')
plt.xlabel('Document Length (words)')
plt.ylabel('Frequency')
plt.title('Distribution of Document Lengths')
plt.grid(True, alpha=0.3)

plt.subplot(2, 2, 2)
plt.boxplot(doc_lengths)
plt.ylabel('Document Length (words)')
plt.title('Document Length Distribution')
plt.grid(True, alpha=0.3)

# TF-IDF score distribution
plt.subplot(2, 2, 3)
sample_scores = tfidf_matrix.data[:10000]  # First 10,000 stored values, for performance
plt.hist(sample_scores, bins=50, alpha=0.7, edgecolor='black')
plt.xlabel('TF-IDF Score')
plt.ylabel('Frequency')
plt.title('Distribution of TF-IDF Scores')
plt.grid(True, alpha=0.3)

# Feature frequency: number of documents in which each feature is non-zero
plt.subplot(2, 2, 4)
feature_doc_counts = np.array((tfidf_matrix > 0).sum(axis=0)).flatten()
plt.hist(feature_doc_counts, bins=50, alpha=0.7, edgecolor='black')
plt.xlabel('Number of Documents')
plt.ylabel('Number of Features')
plt.title('Feature Document Frequency')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Model Saving and Export

In [None]:
# Prepare document metadata for saving.
# Entries are appended in quora_sample row order, the same order as the rows
# of tfidf_matrix, so entry i describes matrix row i.
document_metadata = []
for idx, row in quora_sample.iterrows():
    metadata = {
        'doc_id': str(idx),
        'raw_text': row[text_col],
        'processed_text': row['processed_text'],
        'length': len(row['processed_text'].split())
    }

    # Add any additional metadata columns. Never let a dataset column that is
    # itself called 'doc_id', 'raw_text' or 'length' overwrite the fields above.
    for col in quora_sample.columns:
        if col not in [text_col, 'processed_text'] and col not in metadata:
            metadata[col] = row[col]

    document_metadata.append(metadata)

print(f"Prepared metadata for {len(document_metadata)} documents")

# Save all model components
model_files = {
    'quora_tfidf_vectorizer.joblib': vectorizer,
    'quora_tfidf_matrix.joblib': tfidf_matrix,
    'quora_document_metadata.joblib': document_metadata
}

print("\nSaving model files...")
saved_model_files = []  # only the files that were actually written
for filename, data in model_files.items():
    try:
        joblib.dump(data, filename)
        file_size = os.path.getsize(filename) / 1024 / 1024  # MB
        print(f"✅ Saved {filename} ({file_size:.2f} MB)")
        saved_model_files.append(filename)
    except Exception as e:
        print(f"❌ Error saving {filename}: {e}")

# The sparse matrix stores values, column indices and row pointers; count all
# three arrays so the reported memory usage is not under-estimated.
tfidf_memory_mb = (
    tfidf_matrix.data.nbytes
    + tfidf_matrix.indices.nbytes
    + tfidf_matrix.indptr.nbytes
) / 1024 / 1024
tfidf_sparsity = 1 - (tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]))

# Create a summary report
summary_report = {
    'dataset_info': {
        'total_documents': len(quora_sample),
        'text_column': text_col,
        'avg_doc_length': np.mean(doc_lengths),
        'processing_time': training_time
    },
    'model_config': tfidf_config,
    'model_stats': {
        'vocabulary_size': len(vectorizer.vocabulary_),
        'matrix_shape': tfidf_matrix.shape,
        'sparsity': tfidf_sparsity,
        'memory_usage_mb': tfidf_memory_mb
    },
    'evaluation': {
        'test_queries': test_queries,
        'avg_results_per_query': np.mean([len(search_documents(q, top_k=10)) for q in test_queries])
    }
}

# Save summary report (default=str stringifies anything json cannot encode)
with open('quora_tfidf_training_report.json', 'w') as f:
    json.dump(summary_report, f, indent=2, default=str)

print("\n✅ Training complete! Summary report saved.")
if len(saved_model_files) < len(model_files):
    failed_model_files = [name for name in model_files if name not in saved_model_files]
    print(f"⚠️ Model files that could not be saved: {', '.join(failed_model_files)}")
print("\nFiles created:")
for filename in saved_model_files:
    print(f"  - {filename}")
print("  - quora_tfidf_training_report.json")

# Display summary
print("\n" + "="*50)
print("TRAINING SUMMARY")
print("="*50)
print(f"Documents processed: {len(quora_sample):,}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_):,}")
print(f"Training time: {training_time:.2f} seconds")
print(f"Matrix sparsity: {tfidf_sparsity:.4f}")
print(f"Memory usage: {tfidf_memory_mb:.2f} MB")

## Download Trained Models

In [None]:
# Download all model files
from google.colab import files

print("Downloading trained models...")

# Names of files whose download raised, so the final message is truthful
failed_downloads = []

# Download model files
for filename in model_files.keys():
    try:
        files.download(filename)
        print(f"✅ Downloaded {filename}")
    except Exception as e:
        failed_downloads.append(filename)
        print(f"❌ Error downloading {filename}: {e}")

# Download summary report
try:
    files.download('quora_tfidf_training_report.json')
    print("✅ Downloaded training report")
except Exception as e:
    failed_downloads.append('quora_tfidf_training_report.json')
    print(f"❌ Error downloading report: {e}")

# Only claim success when every download went through
if failed_downloads:
    print(f"\n⚠️ {len(failed_downloads)} file(s) could not be downloaded: {', '.join(failed_downloads)}")
else:
    print("\n✅ All files downloaded successfully!")
print("\nNext steps:")
print("1. Upload the .joblib files to your backend server")
print("2. Update the TF-IDF Quora service configuration")
print("3. Start the Quora TF-IDF service")
print("4. Test the search functionality")

## Final Testing and Verification

In [None]:
# Load saved models to verify they work correctly
print("Verifying saved models...")

# Set to True only when every step below succeeds, so the final banner
# cannot announce success after a failed verification.
verification_ok = False

try:
    # Load saved components.
    # NOTE: joblib files are pickle-based; only load files you created yourself.
    loaded_vectorizer = joblib.load('quora_tfidf_vectorizer.joblib')
    loaded_matrix = joblib.load('quora_tfidf_matrix.joblib')
    loaded_metadata = joblib.load('quora_document_metadata.joblib')

    print("✅ All models loaded successfully")

    # Results are looked up in the metadata by matrix row position, so the two
    # must have the same number of entries.
    if loaded_matrix.shape[0] != len(loaded_metadata):
        raise ValueError(
            f"matrix has {loaded_matrix.shape[0]} rows but metadata has "
            f"{len(loaded_metadata)} entries"
        )

    # Test with loaded models
    test_query = "machine learning"
    processed_query = processor.process_text(test_query)
    query_vector = loaded_vectorizer.transform([processed_query])
    similarities = cosine_similarity(query_vector, loaded_matrix).flatten()
    top_idx = np.argmax(similarities)

    print(f"\nTest query: '{test_query}'")
    print(f"Top result score: {similarities[top_idx]:.4f}")
    print(f"Top result text: {loaded_metadata[top_idx]['raw_text'][:100]}...")

    print("\n✅ Model verification successful!")
    verification_ok = True

except Exception as e:
    print(f"❌ Error verifying models: {e}")

print("\n" + "="*60)
if verification_ok:
    print("🎉 TF-IDF QUORA MODEL TRAINING COMPLETED SUCCESSFULLY! 🎉")
    print("="*60)
    print("\nYour models are ready for deployment in the search engine.")
    print("Remember to update the service configuration with the correct file paths.")
else:
    print("⚠️ MODEL VERIFICATION FAILED - fix the error above before deploying these files.")
    print("="*60)