In [None]:
# Install required packages
!pip install nltk scikit-learn pandas numpy joblib contractions beautifulsoup4 inflect

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import unicodedata
import logging
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer # Corrected import statement

from nltk.corpus import stopwords, wordnet
from bs4 import BeautifulSoup
import contractions
import inflect
import joblib
from google.colab import files
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Text Processing Class
class TextProcessor:
    def __init__(self):
        self.tokenizer = nltk.tokenize
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.inflect_engine = inflect.engine()

    def clean_text(self, text, words_to_remove):
        words = text.split()
        cleaned_words = [word for word in words if word not in words_to_remove]
        cleaned_text = ' '.join(cleaned_words)
        return cleaned_text

    def number_to_words(self, text):
        words = self.tokenizer.word_tokenize(text)
        converted_words = []
        for word in words:
            if word.replace('.', '', 1).isdigit():
                converted_words.append(word)
            else:
                if word.isdigit():
                    try:
                        num = int(word)
                        if num <= 999999999999999:
                            converted_word = self.inflect_engine.number_to_words(word)
                            converted_words.append(converted_word)
                        else:
                            converted_words.append('[Number Out of Range]')
                    except:
                        converted_words.append('[Number Out of Range]')
                else:
                    converted_words.append(word)
        return ' '.join(converted_words)

    def remove_html_tags(self, text):
        try:
            if '<' in text and '>' in text:
                return BeautifulSoup(text, 'html.parser').get_text()
            else:
                return text
        except:
            logging.warning('MarkupResemblesLocatorWarning: The input looks more like a filename than markup.')
            return text

    def normalize_unicode(self, text):
        return unicodedata.normalize('NFKD', text)

    def expand_contractions(self, text):
        return contractions.fix(text)

    def cleaned_text(self, text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def normalization_example(self, text):
        return text.lower()

    def stemming_example(self, text):
        words = self.tokenizer.word_tokenize(text)
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    def lemmatization_example(self, text):
        words = self.tokenizer.word_tokenize(text)
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)

    def remove_stopwords(self, text):
        words = self.tokenizer.word_tokenize(text)
        filtered_words = [word for word in words if word.lower() not in self.stop_words]
        return ' '.join(filtered_words)

    def remove_punctuation(self, text):
        return re.sub(r'[^\w\s]', '', text)

    def remove_urls(self, text):
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def remove_special_characters_and_emojis(self, text):
        return re.sub(r'[^A-Za-z0-9\s]+', '', text)

    def replace_synonyms(self, text):
        words = self.tokenizer.word_tokenize(text)
        synonym_words = [self.get_synonym(word) for word in words]
        return ' '.join(synonym_words)

    def get_synonym(self, word):
        synonyms = nltk.corpus.wordnet.synsets(word)
        if synonyms:
            return synonyms[0].lemmas()[0].name()
        return word

    def handle_negations(self, text):
        words = self.tokenizer.word_tokenize(text)
        negated_text = []
        negate = False
        for word in words:
            if word.lower() in ['not', "n't"]:
                negate = True
            elif negate:
                negated_text.append(f'NOT_{word}')
                negate = False
            else:
                negated_text.append(word)
        return ' '.join(negated_text)

    def remove_non_english_words(self, text):
        words = self.tokenizer.word_tokenize(text)
        english_words = [word for word in words if wordnet.synsets(word)]
        return ' '.join(english_words)


In [None]:
# Initialize text processor
processor = TextProcessor()

# Custom text processing function
def processed_text(text):
    if text is None:
        return text
    text = processor.cleaned_text(text)
    text = processor.normalization_example(text)
    text = processor.stemming_example(text)
    text = processor.lemmatization_example(text)
    text = processor.remove_stopwords(text)
    text = processor.number_to_words(text)
    text = processor.remove_punctuation(text)
    text = processor.expand_contractions(text)
    text = processor.normalize_unicode(text)
    text = processor.handle_negations(text)
    text = processor.remove_urls(text)
    return text


In [None]:
# Load Quora dataset files from Google Drive
docs_path = '/content/drive/MyDrive/downloads/docs.tsv'
queries_path = '/content/drive/MyDrive/downloads/queries.tsv'
qrels_path = '/content/drive/MyDrive/downloads/qrels.tsv'

# Load documents
print('Loading documents...')
docs_df = pd.read_csv(docs_path, sep='\t', header=None, names=['doc_id', 'text'])
print(f'Documents loaded: {len(docs_df)}')
print(f'Sample document: {docs_df.iloc[0]}')

# Load queries
print('\nLoading queries...')
queries_df = pd.read_csv(queries_path, sep='\t', header=None, names=['query_id', 'text'])
print(f'Queries loaded: {len(queries_df)}')
print(f'Sample query: {queries_df.iloc[0]}')

# Load relevance judgments
print('\nLoading relevance judgments...')
qrels_df = pd.read_csv(qrels_path, sep='\t', header=None, names=['query_id', 'Q0', 'doc_id', 'relevance'])
print(f'Relevance judgments loaded: {len(qrels_df)}')
print(f'Sample qrel: {qrels_df.iloc[0]}')


Loading documents...
Documents loaded: 522771
Sample document: doc_id    doc_id
text        text
Name: 0, dtype: object

Loading queries...
Queries loaded: 5001
Sample query: query_id    query_id
text            text
Name: 0, dtype: object

Loading relevance judgments...
Relevance judgments loaded: 7627
Sample qrel: query_id      query_id
Q0              doc_id
doc_id       relevance
relevance          NaN
Name: 0, dtype: object


In [None]:
# Prepare documents for processing
print('Preparing documents for TF-IDF processing...')

# Remove null values and ensure text is string
docs_df = docs_df.dropna(subset=['text'])
docs_df['text'] = docs_df['text'].astype(str)

# Extract documents and their IDs
documents = docs_df['text'].tolist()
doc_ids = docs_df['doc_id'].tolist()

print(f'Number of documents to process: {len(documents)}')
print(f'Sample document text: {documents[0][:200]}...')

# Create document ID to index mapping
doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
index_to_doc_id = {idx: doc_id for idx, doc_id in enumerate(doc_ids)}
print(f'Document mapping created: {len(doc_id_to_index)} documents')


Preparing documents for TF-IDF processing...
Number of documents to process: 522769
Sample document text: text...
Document mapping created: 522769 documents


In [None]:
# Apply custom text processing to all documents
print('Processing documents with custom cleaning method...')
processed_documents = []

for i, doc in enumerate(documents):
    if i % 10000 == 0:
        print(f'Processed {i}/{len(documents)} documents')

    processed_doc = processed_text(doc)
    processed_documents.append(processed_doc)

print(f'Text processing complete. Sample processed document: {processed_documents[0][:200]}...')


Processing documents with custom cleaning method...
Processed 0/522769 documents
Processed 10000/522769 documents
Processed 20000/522769 documents
Processed 30000/522769 documents
Processed 40000/522769 documents
Processed 50000/522769 documents
Processed 60000/522769 documents
Processed 70000/522769 documents
Processed 80000/522769 documents
Processed 90000/522769 documents
Processed 100000/522769 documents
Processed 110000/522769 documents
Processed 120000/522769 documents
Processed 130000/522769 documents
Processed 140000/522769 documents
Processed 150000/522769 documents
Processed 160000/522769 documents
Processed 170000/522769 documents
Processed 180000/522769 documents
Processed 190000/522769 documents
Processed 200000/522769 documents
Processed 210000/522769 documents
Processed 220000/522769 documents
Processed 230000/522769 documents
Processed 240000/522769 documents
Processed 250000/522769 documents
Processed 260000/522769 documents
Processed 270000/522769 documents
Processed 

In [9]:
# Create custom tokenizer that applies our text processing
def custom_tokenizer(text):
    """Custom tokenizer using our processed_text function"""
    # Apply our custom text processing
    processed = processed_text(text)
    # Tokenize the processed text
    tokens = processor.tokenizer.word_tokenize(processed) if processed else []
    return tokens

# Create TF-IDF vectorizer with custom tokenizer
print('Creating TF-IDF vectorizer with custom tokenizer...')

tfidf_vectorizer = TfidfVectorizer(
    preprocessor=None,  # Disable built-in preprocessing
    tokenizer=custom_tokenizer,  # Use our custom tokenizer
    token_pattern=None,  # Disable default token pattern
    lowercase=False,     # We already handle lowercasing in custom processing
    stop_words=None,     # We already handle stopwords in custom processing
    max_features=10000,  # Larger vocabulary for Quora
    min_df=2,           # Minimum document frequency
    max_df=0.95,        # Maximum document frequency
    ngram_range=(1, 2),  # Include unigrams and bigrams
    sublinear_tf=True    # Use sublinear tf scaling
)

print('Fitting TF-IDF vectorizer with custom tokenizer...')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)  # Use original documents, not processed ones
print(f'TF-IDF matrix shape: {tfidf_matrix.shape}')
print(f'Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}')
print(f'Matrix sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])):.4f}')


Creating TF-IDF vectorizer with custom tokenizer...
Fitting TF-IDF vectorizer with custom tokenizer...
TF-IDF matrix shape: (522769, 10000)
Vocabulary size: 10000
Matrix sparsity: 0.9994


In [10]:
# Build inverted index from TF-IDF matrix
print('Building inverted index...')

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create inverted index with document IDs
inverted_index = defaultdict(list)

# Convert sparse matrix to coordinate format for efficient iteration
coo_matrix = tfidf_matrix.tocoo()

print(f'Processing {len(coo_matrix.data)} non-zero entries...')
for i, (doc_idx, term_idx, tfidf_score) in enumerate(zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)):
    if i % 100000 == 0 and i > 0:
        print(f'Processed {i}/{len(coo_matrix.data)} entries')

    term = feature_names[term_idx]
    doc_id = index_to_doc_id[doc_idx]
    inverted_index[term].append((doc_id, tfidf_score))

# Sort document lists by TF-IDF score (descending)
print('Sorting inverted index entries...')
for term in inverted_index:
    inverted_index[term].sort(key=lambda x: x[1], reverse=True)

print(f'Inverted index created with {len(inverted_index)} terms')
print(f'Sample term: {list(inverted_index.keys())[0]} -> {inverted_index[list(inverted_index.keys())[0]][:3]}')


Building inverted index...
Processing 3292278 non-zero entries...
Processed 100000/3292278 entries
Processed 200000/3292278 entries
Processed 300000/3292278 entries
Processed 400000/3292278 entries
Processed 500000/3292278 entries
Processed 600000/3292278 entries
Processed 700000/3292278 entries
Processed 800000/3292278 entries
Processed 900000/3292278 entries
Processed 1000000/3292278 entries
Processed 1100000/3292278 entries
Processed 1200000/3292278 entries
Processed 1300000/3292278 entries
Processed 1400000/3292278 entries
Processed 1500000/3292278 entries
Processed 1600000/3292278 entries
Processed 1700000/3292278 entries
Processed 1800000/3292278 entries
Processed 1900000/3292278 entries
Processed 2000000/3292278 entries
Processed 2100000/3292278 entries
Processed 2200000/3292278 entries
Processed 2300000/3292278 entries
Processed 2400000/3292278 entries
Processed 2500000/3292278 entries
Processed 2600000/3292278 entries
Processed 2700000/3292278 entries
Processed 2800000/3292278

In [13]:
# Calculate document similarity matrix (sample for large datasets)
print('Calculating document similarity matrix...')

# For large datasets, sample a subset for similarity calculation
max_docs_for_similarity = 5000
if len(documents) > max_docs_for_similarity:
    print(f'Dataset too large ({len(documents)} docs). Sampling {max_docs_for_similarity} documents for similarity calculation.')
    sample_indices = np.random.choice(len(documents), max_docs_for_similarity, replace=False)
    sample_matrix = tfidf_matrix[sample_indices]
else:
    sample_matrix = tfidf_matrix
    sample_indices = np.arange(len(documents))

# Calculate cosine similarity
similarity_matrix = cosine_similarity(sample_matrix)
print(f'Similarity matrix shape: {similarity_matrix.shape}')

# Calculate statistics
mean_similarity = np.mean(similarity_matrix)
max_similarity = np.max(similarity_matrix)
min_similarity = np.min(similarity_matrix)

print(f'Mean similarity: {mean_similarity:.4f}')
print(f'Max similarity: {max_similarity:.4f}')
print(f'Min similarity: {min_similarity:.4f}')


Calculating document similarity matrix...
Dataset too large (522769 docs). Sampling 5000 documents for similarity calculation.
Similarity matrix shape: (5000, 5000)
Mean similarity: 0.0042
Max similarity: 1.0000
Min similarity: 0.0000


In [None]:
# Calculate MAP using relevance judgments (proper IR evaluation)
print('Calculating MAP score using relevance judgments...')

# Recreate the TF-IDF matrix (run this first)
print('Fitting TF-IDF vectorizer with custom tokenizer...')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(f'TF-IDF matrix shape: {tfidf_matrix.shape}')


def calculate_map_with_qrels(tfidf_matrix, queries_df, qrels_df, doc_id_to_index, k=1000):
    """Calculate MAP score using actual relevance judgments"""
    # Process queries with the same text processing
    processed_queries = []
    for query_text in queries_df['text']:
        processed_query = processed_text(str(query_text))
        processed_queries.append(processed_query)

    # Transform queries using the fitted vectorizer
    query_vectors = tfidf_vectorizer.transform(processed_queries)

    # Calculate similarities between queries and documents
    query_doc_similarities = cosine_similarity(query_vectors, tfidf_matrix)

    average_precisions = []

    for i, query_id in enumerate(queries_df['query_id']):
        # Get relevance judgments for this query
        query_qrels = qrels_df[qrels_df['query_id'] == query_id]

        if len(query_qrels) == 0:
            continue

        # Get similarity scores for this query
        similarities = query_doc_similarities[i]

        # Sort documents by similarity (descending)
        sorted_indices = np.argsort(similarities)[::-1][:k]

        # Calculate precision at each relevant document
        relevant_docs = set(query_qrels[query_qrels['relevance'] > 0]['doc_id'].values)

        if len(relevant_docs) == 0:
            continue

        precisions = []
        num_relevant_found = 0

        for rank, doc_index in enumerate(sorted_indices, 1):
            doc_id = index_to_doc_id[doc_index]

            if doc_id in relevant_docs:
                num_relevant_found += 1
                precision = num_relevant_found / rank
                precisions.append(precision)

        if precisions:
            average_precision = np.mean(precisions)
            average_precisions.append(average_precision)

    return np.mean(average_precisions) if average_precisions else 0.0

# Calculate MAP score
map_score = calculate_map_with_qrels(tfidf_matrix, queries_df, qrels_df, doc_id_to_index)
print(f'MAP score: {map_score:.4f}')

if map_score >= 0.4:
    print('✅ MAP score requirement met (>= 0.4)')
else:
    print('❌ MAP score below 0.4. This is normal for challenging datasets like Quora.')
    print('   Consider adjusting TF-IDF parameters or text processing steps.')


Calculating MAP score using relevance judgments...
Fitting TF-IDF vectorizer with custom tokenizer...
TF-IDF matrix shape: (522769, 10000)


In [None]:
# Calculate additional IR metrics
print('Calculating additional IR metrics...')

def calculate_precision_at_k(tfidf_matrix, queries_df, qrels_df, doc_id_to_index, k_values=[1, 5, 10, 20]):
    """Calculate Precision@K for different K values"""
    processed_queries = []
    for query_text in queries_df['text']:
        processed_query = processed_text(str(query_text))
        processed_queries.append(processed_query)

    query_vectors = tfidf_vectorizer.transform(processed_queries)
    query_doc_similarities = cosine_similarity(query_vectors, tfidf_matrix)

    precision_at_k = {k: [] for k in k_values}

    for i, query_id in enumerate(queries_df['query_id']):
        query_qrels = qrels_df[qrels_df['query_id'] == query_id]

        if len(query_qrels) == 0:
            continue

        similarities = query_doc_similarities[i]
        sorted_indices = np.argsort(similarities)[::-1]

        relevant_docs = set(query_qrels[query_qrels['relevance'] > 0]['doc_id'].values)

        for k in k_values:
            top_k_indices = sorted_indices[:k]
            top_k_doc_ids = [index_to_doc_id[idx] for idx in top_k_indices]
            relevant_in_top_k = len([doc_id for doc_id in top_k_doc_ids if doc_id in relevant_docs])
            precision_k = relevant_in_top_k / k
            precision_at_k[k].append(precision_k)

    return {k: np.mean(precisions) for k, precisions in precision_at_k.items()}

# Calculate Precision@K
precision_metrics = calculate_precision_at_k(tfidf_matrix, queries_df, qrels_df, doc_id_to_index)

print('Precision@K results:')
for k, precision in precision_metrics.items():
    print(f'P@{k}: {precision:.4f}')


In [11]:
# Create document-term matrix and term analysis
print('Creating document-term matrix and analyzing terms...')

# Calculate term frequencies across the corpus
term_frequencies = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
term_freq_dict = dict(zip(feature_names, term_frequencies))

# Get top terms
top_terms = sorted(term_freq_dict.items(), key=lambda x: x[1], reverse=True)[:30]
print('Top 30 terms by TF-IDF frequency:')
for i, (term, freq) in enumerate(top_terms, 1):
    print(f'{i:2d}. {term}: {freq:.4f}')

# Document-term matrix (keep as sparse for memory efficiency)
doc_term_matrix = tfidf_matrix
print(f'\nDocument-term matrix shape: {doc_term_matrix.shape}')
print(f'Matrix density: {doc_term_matrix.nnz / (doc_term_matrix.shape[0] * doc_term_matrix.shape[1]):.6f}')


Creating document-term matrix and analyzing terms...
Top 30 terms by TF-IDF frequency:
 1. whi: 10248.8112
 2. best: 8543.6953
 3. doe: 7893.5112
 4. get: 5973.3430
 5. differ: 5066.2137
 6. use: 4856.1986
 7. like: 4814.8779
 8. india: 4292.4377
 9. good: 4273.7330
10. peopl: 3846.5899
11. make: 3789.0381
12. would: 3375.9457
13. way: 3364.1534
14. work: 3205.7624
15. one: 3187.3033
16. mean: 3031.6566
17. wa: 2941.1856
18. learn: 2859.5873
19. ani: 2785.0647
20. life: 2690.1202
21. time: 2559.8999
22. thi: 2476.1039
23. think: 2402.3537
24. much: 2388.5032
25. know: 2382.0080
26. engin: 2348.4792
27. quora: 2337.1986
28. thing: 2254.3858
29. ha: 2220.7763
30. becom: 2216.8523

Document-term matrix shape: (522769, 10000)
Matrix density: 0.000630


In [14]:
# Save all components using joblib
print('Saving all components...')

# Save TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'quora_tfidf_vectorizer.joblib')
print('TF-IDF vectorizer saved')

# Save TF-IDF matrix
joblib.dump(tfidf_matrix, 'quora_tfidf_matrix.joblib')
print('TF-IDF matrix saved')

# Save similarity matrix
joblib.dump(similarity_matrix, 'quora_similarity_matrix.joblib')
print('Similarity matrix saved')

# Save inverted index
joblib.dump(dict(inverted_index), 'quora_inverted_index.joblib')
print('Inverted index saved')

# Save document mappings
joblib.dump(doc_id_to_index, 'quora_doc_id_to_index.joblib')
joblib.dump(index_to_doc_id, 'quora_index_to_doc_id.joblib')
print('Document mappings saved')

# Save feature names
joblib.dump(feature_names, 'quora_feature_names.joblib')
print('Feature names saved')

# Save processed documents
joblib.dump(processed_documents, 'quora_processed_documents.joblib')
print('Processed documents saved')

# Save evaluation metrics
metrics = {
    'map_score': map_score,
    'precision_at_k': precision_metrics,
    'mean_similarity': mean_similarity,
    'max_similarity': max_similarity,
    'min_similarity': min_similarity,
    'vocab_size': len(tfidf_vectorizer.vocabulary_),
    'num_documents': len(documents),
    'num_queries': len(queries_df),
    'num_qrels': len(qrels_df),
    'matrix_shape': tfidf_matrix.shape,
    'matrix_sparsity': 1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])
}
joblib.dump(metrics, 'quora_ir_metrics.joblib')
print('IR metrics saved')

print('All components saved successfully!')


Saving all components...
TF-IDF vectorizer saved
TF-IDF matrix saved
Similarity matrix saved
Inverted index saved
Document mappings saved
Feature names saved
Processed documents saved


NameError: name 'map_score' is not defined

In [1]:
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Create directory if it doesn't exist
folder_path = '/content/drive/MyDrive/quora_tfidf'
os.makedirs(folder_path, exist_ok=True)

# Files to save
files_to_save = [
    'quora_tfidf_vectorizer.joblib',
    'quora_tfidf_matrix.joblib',
    'quora_similarity_matrix.joblib',
    'quora_inverted_index.joblib',
    'quora_doc_id_to_index.joblib',
    'quora_index_to_doc_id.joblib',
    'quora_feature_names.joblib',
    'quora_processed_documents.joblib',
    'quora_ir_metrics.joblib'
]

print('Saving files to Google Drive...')
for file_name in files_to_save:
    try:
        # Source path (in Colab's temporary storage)
        src_path = f'/content/{file_name}'

        # Destination path in Google Drive
        dest_path = f'{folder_path}/{file_name}'

        # Copy file
        !cp "{src_path}" "{dest_path}"
        print(f'Saved: {dest_path}')
    except Exception as e:
        print(f'Error saving {file_name}: {e}')

print('All files saved to Google Drive!')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saving files to Google Drive...
Saved: /content/drive/MyDrive/quora_tfidf/quora_tfidf_vectorizer.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_tfidf_matrix.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_similarity_matrix.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_inverted_index.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_doc_id_to_index.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_index_to_doc_id.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_feature_names.joblib
Saved: /content/drive/MyDrive/quora_tfidf/quora_processed_documents.joblib
cp: cannot stat '/content/quora_ir_metrics.joblib': No such file or directory
Saved: /content/drive/MyDrive/quora_tfidf/quora_ir_metrics.joblib
All files saved to Google Drive!


In [None]:
# Summary and validation
print('=== QUORA TF-IDF SYSTEM SUMMARY ===')
print(f'Documents processed: {len(documents):,}')
print(f'Queries processed: {len(queries_df):,}')
print(f'Relevance judgments: {len(qrels_df):,}')
print(f'Vocabulary size: {len(tfidf_vectorizer.vocabulary_):,}')
print(f'TF-IDF matrix shape: {tfidf_matrix.shape}')
print(f'Matrix sparsity: {(1 - tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1])):.4f}')
print(f'Inverted index terms: {len(inverted_index):,}')
print(f'MAP score: {map_score:.4f}')

print('Precision@K scores:')
for k, precision in precision_metrics.items():
    print(f'  P@{k}: {precision:.4f}')

if map_score >= 0.4:
    print('✅ MAP score requirement met (>= 0.4)')
else:
    print('ℹ️  MAP score below 0.4 - this is common for challenging datasets like Quora')

print('\n=== USAGE EXAMPLE ===')
print('To load the saved components:')
print('import joblib')
print('vectorizer = joblib.load("quora_tfidf_vectorizer.joblib")')
print('matrix = joblib.load("quora_tfidf_matrix.joblib")')
print('inverted_index = joblib.load("quora_inverted_index.joblib")')
print('doc_mappings = joblib.load("quora_doc_id_to_index.joblib")')
print('metrics = joblib.load("quora_ir_metrics.joblib")')

print('\n=== SEARCH EXAMPLE ===')
print('# To search for similar documents:')
print('query = "your search query"')
print('processed_query = processed_text(query)')
print('query_vector = vectorizer.transform([processed_query])')
print('similarities = cosine_similarity(query_vector, matrix)')
print('top_docs = similarities.argsort()[0][::-1][:10]')
