# Fixed Quora Dataset Processing and Embedding Generation

This notebook fixes the issue where queries were being removed due to incorrect column identification.

**Key Fixes:**
1. **Correct Column Detection**: Properly identifies the text column containing actual questions
2. **Preserved Data**: Ensures no queries are lost during processing
3. **Smart Text Processing**: Preserves semantic information while cleaning
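4. **Optimized Embeddings**: Encodes texts with the `all-MiniLM-L6-v2` sentence-transformer and L2-normalizes the vectors, so inner-product search is equivalent to cosine similarity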


## Step 1: Install Required Packages

In [None]:
# Install required packages
!pip install --upgrade pip
!pip install sentence-transformers>=2.2.2
!pip install transformers>=4.21.0
!pip install torch>=1.13.0
!pip install pandas numpy scikit-learn
!pip install joblib nltk tqdm
!pip install faiss-cpu
!pip install beir
!pip install datasets
!pip install ir_datasets

print("\n[INFO] Packages installed! Please restart runtime if needed.")

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m135.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting beir
  Downloading beir-2.2.0-py3-none-any.whl.metadata (28 kB)
Collecting pytrec-eval-terrier (from beir)
  Downloading pytrec_eval_terri

## Step 2: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import joblib
import os
import warnings
import torch
import zipfile
from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from sentence_transformers import SentenceTransformer

# Silence library warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Select the compute device: CUDA when torch can see a GPU, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.1f} GB")

# Fetch the NLTK resources quietly, in the same order as before.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

print("✅ All packages imported successfully!")

Using device: cuda
GPU: Tesla T4
GPU Memory: 14.7 GB
✅ All packages imported successfully!


## Step 3: Load Dataset

In [None]:
# Resolve the dataset directory: Google Drive on Colab, a local folder otherwise.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/gdrive')
    base_path = '/content/gdrive/MyDrive/quora_datset'
else:
    # Local environment: the QUORA_DATA_DIR environment variable overrides the
    # original hard-coded default so the notebook can run on other machines.
    base_path = os.environ.get('QUORA_DATA_DIR', '/Users/raafatmhanna/Desktop/Quora')

# Load the dataset files
print("Loading dataset files...")

# Candidate file names per dataset, tried in order; the first one that exists
# and parses is used.
file_patterns = {
    'docs': ['docs.tsv', 'documents.tsv'],
    'queries': ['queries.tsv', 'questions.tsv', 'query.tsv'],
    'qrels': ['qrels.tsv', 'relevance.tsv', 'labels.tsv']
}

datasets = {}
for data_type, patterns in file_patterns.items():
    for pattern in patterns:
        file_path = os.path.join(base_path, pattern)
        if not os.path.exists(file_path):
            continue
        try:
            datasets[data_type] = pd.read_csv(file_path, sep='\t')
        except Exception as e:
            # Report the failure instead of hiding it, then try the next
            # candidate name for this dataset.
            print(f"⚠️ Could not read {file_path}: {e}")
            continue
        print(f"✅ Loaded {data_type}: {len(datasets[data_type])} rows from {pattern}")
        break

# Verify we have the necessary data (qrels is optional)
if 'docs' not in datasets or 'queries' not in datasets:
    print("❌ Error: Could not load required files")
    print("Please ensure docs.tsv and queries.tsv are in the correct location")
else:
    print("\n✅ Dataset loaded successfully!")
    print(f"Documents: {len(datasets['docs']):,} rows")
    print(f"Queries: {len(datasets['queries']):,} rows")
    if 'qrels' in datasets:
        print(f"Qrels: {len(datasets['qrels']):,} rows")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Loading dataset files...
✅ Loaded docs: 522770 rows from docs.tsv
✅ Loaded queries: 5000 rows from queries.tsv
✅ Loaded qrels: 7626 rows from qrels.tsv

✅ Dataset loaded successfully!
Documents: 522,770 rows
Queries: 5,000 rows
Qrels: 7,626 rows


## Step 4: Inspect Data Structure (Critical Fix)

In [None]:
# CRITICAL: look at the raw frames before deciding which column holds the text
queries_preview = datasets['queries']
docs_preview = datasets['docs']

print("=== INSPECTING QUERIES STRUCTURE ===")
print("\nQuery columns:", list(queries_preview.columns))
print("\nFirst 5 rows of queries:")
print(queries_preview.head())

print("\n=== IDENTIFYING TEXT COLUMNS ===")
# Show a few sample values per column and flag columns that look like free text.
for col in queries_preview.columns:
    print(f"\nColumn '{col}':")
    for i, val in enumerate(queries_preview[col].head(3).tolist()):
        rendered = str(val)
        if len(rendered) > 100:
            # Long values are truncated to 100 characters for display.
            print(f"  Row {i}: {rendered[:100]}...")
        else:
            print(f"  Row {i}: {val}")

    # A mean string length above 20 characters is treated as "probably text".
    mean_len = queries_preview[col].astype(str).str.len().mean()
    if mean_len > 20:
        print(f"  → Likely contains text (avg length: {mean_len:.1f})")

print("\n=== INSPECTING DOCUMENTS STRUCTURE ===")
print("\nDocument columns:", list(docs_preview.columns))
print("\nFirst 3 rows of documents:")
print(docs_preview.head(3))

=== INSPECTING QUERIES STRUCTURE ===

Query columns: ['query_id', 'text']

First 5 rows of queries:
   query_id                                               text
0       318                How does Quora look to a moderator?
1       378  How do I refuse to chose between different thi...
2       379  Did Ben Affleck shine more than Christian Bale...
3       399  What are the effects of demonitization of 500 ...
4       420                       Why creativity is important?

=== IDENTIFYING TEXT COLUMNS ===

Column 'query_id':
  Row 0: 318
  Row 1: 378
  Row 2: 379

Column 'text':
  Row 0: How does Quora look to a moderator?
  Row 1: How do I refuse to chose between different things to do in my life?
  Row 2: Did Ben Affleck shine more than Christian Bale as Batman?
  → Likely contains text (avg length: 51.5)

=== INSPECTING DOCUMENTS STRUCTURE ===

Document columns: ['doc_id', 'text']

First 3 rows of documents:
   doc_id                                               text
0       1  Wh

## Step 5: Smart Text Preprocessing with Correct Column Detection

In [None]:
def safe_clean_text(text):
    """
    Minimal, information-preserving cleaning for Quora question text.

    Removes URLs (``http(s)://...`` and ``www....``) and collapses every run
    of whitespace into a single space, trimming both ends.

    Args:
        text: The raw cell value. Strings are cleaned as-is. ``None`` and
            scalar missing values (NaN, ``pd.NA``, ``NaT``) yield ``""``.
            Any other non-string value is converted with ``str()`` before
            cleaning instead of being discarded.

    Returns:
        str: The cleaned text, or ``""`` for missing input.
    """
    if not isinstance(text, str):
        # pd.isna() returns an array for list-likes, so only call it on scalars.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        # Keep non-string content (e.g. a purely numeric cell) rather than
        # dropping it as empty.
        text = str(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Normalize whitespace
    return re.sub(r'\s+', ' ', text).strip()

def _mean_text_length(series):
    """Return the mean character length of a column's values after casting them to str."""
    return series.astype(str).str.len().mean()


def find_text_column(df, data_type='query', min_avg_length=20):
    """
    Pick the column of ``df`` that most plausibly holds the free-text content.

    Strategy, in order:
      1. The first column whose name contains a text-like keyword
         ('text', 'question', 'query', 'content', 'title', 'body') and whose
         mean string length exceeds ``min_avg_length``.
      2. Otherwise, the column with the largest mean string length, if that
         mean exceeds ``min_avg_length``.
      3. Otherwise, the second column (the first is assumed to be an ID), or
         the only column when there is just one.

    Args:
        df: DataFrame to inspect.
        data_type: Label used only in the progress messages.
        min_avg_length: Mean string length a column must exceed to count as
            text. Defaults to 20, the previously hard-coded threshold.

    Returns:
        The label of the selected column.

    Raises:
        ValueError: If ``df`` has no columns.
    """
    print(f"\nFinding text column for {data_type}...")

    if len(df.columns) == 0:
        raise ValueError(f"Cannot find a text column for {data_type}: DataFrame has no columns")

    # First, look for columns with common text-related names
    text_keywords = ['text', 'question', 'query', 'content', 'title', 'body']

    for col in df.columns:
        # Column labels are not guaranteed to be strings (e.g. integer labels
        # when a file is read without a header).
        col_lower = str(col).lower()

        # Only names suggesting text content are considered in this pass; this
        # also skips plain ID columns. A name such as 'query_id' still matches
        # a keyword but is rejected by the length check below.
        if not any(keyword in col_lower for keyword in text_keywords):
            continue

        # Verify it actually contains text
        avg_length = _mean_text_length(df[col])
        if avg_length > min_avg_length:
            print(f"  ✓ Found text column: '{col}' (avg length: {avg_length:.1f})")
            return col

    # If no column found by name, find the column with longest average text
    max_length = 0
    best_col = None

    for col in df.columns:
        try:
            avg_length = _mean_text_length(df[col])
        except Exception:
            # Best effort: a column that cannot be measured is skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            continue
        if avg_length > max_length:
            max_length = avg_length
            best_col = col

    # Compare against None explicitly: a column labelled 0 is falsy.
    if best_col is not None and max_length > min_avg_length:
        print(f"  ✓ Found text column by length: '{best_col}' (avg length: {max_length:.1f})")
        return best_col

    # Last resort - return the second column (first is usually ID)
    if len(df.columns) > 1:
        print(f"  ⚠️ Using fallback column: '{df.columns[1]}'")
        return df.columns[1]

    return df.columns[0]

# Process queries with correct column detection
print("=== PROCESSING QUERIES ===")
queries_df = datasets['queries'].copy()

# Find the actual text column
query_text_col = find_text_column(queries_df, 'query')

# Show sample of what we're processing
print("\nSample queries before cleaning:")
for i in range(min(3, len(queries_df))):
    # str() guards the preview against NaN / non-string cells, which cannot be
    # sliced and would otherwise abort the cell before any cleaning happens.
    print(f"  {i+1}: {str(queries_df[query_text_col].iloc[i])[:100]}...")

# Apply cleaning
queries_df['text_cleaned'] = queries_df[query_text_col].apply(safe_clean_text)

# Remove only truly empty entries
original_count = len(queries_df)
queries_df = queries_df[queries_df['text_cleaned'].str.len() > 0]
cleaned_count = len(queries_df)

print(f"\nQueries processed: {original_count} → {cleaned_count} (removed {original_count - cleaned_count})")

# Show sample after cleaning
print("\nSample queries after cleaning:")
for i in range(min(3, len(queries_df))):
    print(f"  {i+1}: {queries_df['text_cleaned'].iloc[i][:100]}...")

# Process documents
print("\n=== PROCESSING DOCUMENTS ===")
docs_df = datasets['docs'].copy()

# Find the actual text column for documents
doc_text_col = find_text_column(docs_df, 'document')

# Apply cleaning
docs_df['text_cleaned'] = docs_df[doc_text_col].apply(safe_clean_text)

# Remove only truly empty entries
original_count = len(docs_df)
docs_df = docs_df[docs_df['text_cleaned'].str.len() > 0]
cleaned_count = len(docs_df)

print(f"\nDocuments processed: {original_count} → {cleaned_count} (removed {original_count - cleaned_count})")

# Save cleaned data (written to the current working directory)
queries_df.to_csv('queries_cleaned.tsv', sep='\t', index=False)
docs_df.to_csv('docs_cleaned.tsv', sep='\t', index=False)
print("\n✅ Cleaned data saved!")

=== PROCESSING QUERIES ===

Finding text column for query...
  ✓ Found text column: 'text' (avg length: 51.5)

Sample queries before cleaning:
  1: How does Quora look to a moderator?...
  2: How do I refuse to chose between different things to do in my life?...
  3: Did Ben Affleck shine more than Christian Bale as Batman?...

Queries processed: 5000 → 5000 (removed 0)

Sample queries after cleaning:
  1: How does Quora look to a moderator?...
  2: How do I refuse to chose between different things to do in my life?...
  3: Did Ben Affleck shine more than Christian Bale as Batman?...

=== PROCESSING DOCUMENTS ===

Finding text column for document...
  ✓ Found text column: 'text' (avg length: 62.2)

Documents processed: 522770 → 522768 (removed 2)

✅ Cleaned data saved!


## Step 6: Generate Optimized Embeddings

In [None]:
# Load the sentence-embedding model onto the selected device
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
print(f"Loading model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME, device=device)

# Raise the truncation limit to 512 when the model exposes that attribute
if hasattr(model, 'max_seq_length'):
    model.max_seq_length = 512

print(f"Model loaded on {device}")
print(f"Max sequence length: {getattr(model, 'max_seq_length', 'default')}")

# Prepare texts for embedding
print("\nPreparing texts...")

# Documents: cleaned text plus the value of the first column, used as the ID
doc_texts = docs_df['text_cleaned'].tolist()
doc_ids = docs_df.iloc[:, 0].tolist()

# Queries: same layout as the documents
query_texts = queries_df['text_cleaned'].tolist()
query_ids = queries_df.iloc[:, 0].tolist()

print(f"\nReady to generate embeddings for:")
print(f"  - {len(doc_texts):,} documents")
print(f"  - {len(query_texts):,} queries")

# Generate embeddings with progress bar
def generate_embeddings_batch(texts, desc="Generating embeddings"):
    """
    Encode ``texts`` with the module-level ``model`` and return the embeddings.

    The batch size is chosen from the total memory of GPU 0: 32 below 8e9
    bytes, 64 below 16e9 bytes, 128 otherwise; 32 when CUDA is unavailable.
    Embeddings are requested as a NumPy array with ``normalize_embeddings=True``.

    Args:
        texts: The texts to encode.
        desc: Accepted for call compatibility; not used by the implementation.

    Returns:
        The value returned by ``model.encode`` for ``texts``.
    """
    # Start from the smallest size (CPU and GPUs under 8e9 bytes) and scale up.
    batch_size = 32
    if torch.cuda.is_available():
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        if total_bytes >= 16e9:
            batch_size = 128
        elif total_bytes >= 8e9:
            batch_size = 64

    print(f"Using batch size: {batch_size}")

    return model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True  # Important for better similarity
    )

# Encode the documents first, then the queries
print("\n=== GENERATING DOCUMENT EMBEDDINGS ===")
doc_embeddings = generate_embeddings_batch(doc_texts)

print("\n=== GENERATING QUERY EMBEDDINGS ===")
query_embeddings = generate_embeddings_batch(query_texts)

print(f"\n✅ Embeddings generated!")
print(f"Document embeddings shape: {doc_embeddings.shape}")
print(f"Query embeddings shape: {query_embeddings.shape}")

# Sanity check: the first vector of each matrix should have unit L2 norm
first_doc_norm = np.linalg.norm(doc_embeddings[0])
first_query_norm = np.linalg.norm(query_embeddings[0])
print(f"\nVerification:")
print(f"First doc embedding norm: {first_doc_norm:.3f} (should be ~1.0)")
print(f"First query embedding norm: {first_query_norm:.3f} (should be ~1.0)")

Loading model: sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded on cuda
Max sequence length: 512

Preparing texts...

Ready to generate embeddings for:
  - 522,768 documents
  - 5,000 queries

=== GENERATING DOCUMENT EMBEDDINGS ===
Using batch size: 64


Batches:   0%|          | 0/8169 [00:00<?, ?it/s]


=== GENERATING QUERY EMBEDDINGS ===
Using batch size: 64


Batches:   0%|          | 0/79 [00:00<?, ?it/s]


✅ Embeddings generated!
Document embeddings shape: (522768, 384)
Query embeddings shape: (5000, 384)

Verification:
First doc embedding norm: 1.000 (should be ~1.0)
First query embedding norm: 1.000 (should be ~1.0)


## Step 7: Evaluate Retrieval Performance

In [None]:
# Build FAISS index for efficient retrieval
print("Building FAISS index...")
index = faiss.IndexFlatIP(doc_embeddings.shape[1])  # Inner product for normalized vectors
index.add(doc_embeddings.astype(np.float32))
print(f"Index built with {index.ntotal} documents")

# Quick evaluation on sample queries
print("\n=== SAMPLE RETRIEVAL TEST ===")
n_samples = min(5, len(query_embeddings))
k = 5  # Top-k documents to retrieve

for i in range(n_samples):
    print(f"\nQuery {i+1}: {query_texts[i][:100]}...")

    # Search for similar documents
    scores, indices = index.search(query_embeddings[i:i+1].astype(np.float32), k)

    print(f"Top {k} retrieved documents:")
    for j, (score, idx) in enumerate(zip(scores[0], indices[0])):
        if idx < 0:
            # FAISS pads with -1 when fewer than k results exist; without this
            # guard doc_texts[-1] would silently print the last document.
            continue
        print(f"  {j+1}. Score: {score:.3f} - {doc_texts[idx][:80]}...")

# Calculate basic metrics
print("\n=== CALCULATING METRICS ===")

# Sample evaluation for efficiency. A fixed seed keeps the sampled queries,
# and therefore the reported statistics, reproducible across runs.
sample_size = min(100, len(query_embeddings))
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(query_embeddings), sample_size, replace=False)
sample_queries = query_embeddings[sample_indices]

# Calculate similarity statistics
print("Calculating similarity statistics...")
similarities = cosine_similarity(sample_queries, doc_embeddings)

print(f"\nSimilarity Statistics:")
print(f"  Mean: {np.mean(similarities):.4f}")
print(f"  Std: {np.std(similarities):.4f}")
print(f"  Max: {np.max(similarities):.4f}")
print(f"  Min: {np.min(similarities):.4f}")

# Calculate MAP if qrels available
if 'qrels' in datasets and datasets['qrels'] is not None:
    print("\n=== CALCULATING MAP SCORE ===")
    # Placeholder only: no MAP is computed in this cell.
    print("MAP calculation requires proper qrels format")
else:
    print("\n⚠️ No qrels file found for MAP calculation")

Building FAISS index...
Index built with 522768 documents

=== SAMPLE RETRIEVAL TEST ===

Query 1: How does Quora look to a moderator?...
Top 5 retrieved documents:
  1. Score: 0.725 - How does one become a Quora moderator?...
  2. Score: 0.686 - Who are the Quora Moderators?...
  3. Score: 0.680 - How is Quora moderated?...
  4. Score: 0.676 - What does the Quora website look like to members of Quora moderation?...
  5. Score: 0.675 - How does Quora Moderation work?...

Query 2: How do I refuse to chose between different things to do in my life?...
Top 5 retrieved documents:
  1. Score: 0.800 - How do I choose what to do with my life?...
  2. Score: 0.763 - How do you "DECIDE" what you want to do with your life?...
  3. Score: 0.744 - How can I decide what to do in with my life?...
  4. Score: 0.731 - How do I decide on what to do with my life?...
  5. Score: 0.699 - Why I'm not able to decide what my goal is & what to do in my life?...

Query 3: Did Ben Affleck shine more than Christ

In [None]:
# ====== CALCULATE MAP & MPR METRICS ======
# Metrics are computed over the top `top_k` retrieved documents per query:
#   * AP  = (sum of precision@rank at each relevant hit) / (number of relevant docs)
#   * MAP = mean AP over every evaluated query, including queries with no hit
#           (AP = 0), so misses lower the score instead of being ignored.
#   * P@k = (relevant docs in the top k) / k, recorded for every evaluated
#           query at every cutoff, whether or not the doc at rank k is relevant.
if 'qrels' in datasets and len(datasets['qrels']) > 0:
    print("\n=== CALCULATING RETRIEVAL METRICS ===")
    print("Preparing qrels data...")

    # Convert qrels to {query_id: {doc_id: relevance}} format, keeping only
    # positive judgments. Columns are read directly (not via iterrows) so IDs
    # keep their own dtype and stringify the same way as query_ids / doc_ids.
    qrels_df = datasets['qrels']
    qrels = defaultdict(dict)
    for qid, did, rel in zip(qrels_df['query_id'], qrels_df['doc_id'], qrels_df['relevance']):
        if int(rel) > 0:
            qrels[str(qid)][str(did)] = int(rel)

    # Create mappings from IDs to embedding indices
    query_id_to_idx = {str(qid): i for i, qid in enumerate(query_ids)}
    doc_id_to_idx = {str(did): i for i, did in enumerate(doc_ids)}

    # Evaluation parameters
    top_k = 100  # Maximum number of docs to retrieve per query
    rank_cutoffs = [5, 10, 20, 50, 100]  # For MPR calculation

    # Initialize metrics storage
    map_scores = []
    mpr_scores = {cutoff: [] for cutoff in rank_cutoffs}

    print(f"\nEvaluating on {len(qrels)} query-relevance pairs...")

    # Only queries that have both relevance judgments and an embedding are scored.
    eval_qids = [qid for qid in qrels if qid in query_id_to_idx]

    if eval_qids:
        # One batched search for all queries instead of one FAISS call per query.
        eval_rows = [query_id_to_idx[qid] for qid in eval_qids]
        eval_matrix = np.ascontiguousarray(query_embeddings[eval_rows], dtype=np.float32)
        _, all_indices = index.search(eval_matrix, top_k)
    else:
        all_indices = []

    for qid, retrieved_idx in tqdm(zip(eval_qids, all_indices),
                                   total=len(eval_qids),
                                   desc="Evaluating queries"):
        relevant_docs = qrels[qid]
        relevant_found = 0
        precision_sum = 0.0

        for rank, doc_idx in enumerate(retrieved_idx, 1):
            # doc_idx is -1 when the index holds fewer than top_k documents.
            if doc_idx >= 0 and str(doc_ids[doc_idx]) in relevant_docs:
                relevant_found += 1
                precision_sum += relevant_found / rank

            # Record precision at each cutoff regardless of whether the
            # document at that exact rank is relevant.
            if rank in mpr_scores:
                mpr_scores[rank].append(relevant_found / rank)

        # relevant_docs is never empty here: entries are only created for
        # positive judgments.
        map_scores.append(precision_sum / len(relevant_docs))

    # Calculate final metrics
    if map_scores:
        MAP = np.mean(map_scores)
        print(f"\nMean Average Precision (MAP): {MAP:.4f}")

        print("\nMean Precision at Rank (MPR):")
        for cutoff in sorted(mpr_scores.keys()):
            if mpr_scores[cutoff]:
                mpr = np.mean(mpr_scores[cutoff])
                print(f"  @{cutoff}: {mpr:.4f}")
            else:
                print(f"  @{cutoff}: No relevant docs found")

        # Save metrics to metadata.
        # NOTE(review): this cell never defines `metadata`, so the branch only
        # runs if an earlier-executed cell created it. The file is written to
        # the current working directory.
        if 'metadata' in locals():
            metadata['retrieval_metrics'] = {
                'MAP': MAP,
                'MPR': {k: np.mean(v) for k, v in mpr_scores.items() if v}
            }
            joblib.dump(metadata, 'embedding_metadata.joblib')
            print("\n✅ Metrics saved to metadata")
    else:
        print("\n⚠️ No relevant documents found for any query")
else:
    print("\n⚠️ No qrels data found - skipping MAP/MPR calculation")

print("\n=== EVALUATION COMPLETE ===")


=== CALCULATING RETRIEVAL METRICS ===
Preparing qrels data...

Evaluating on 5000 query-relevance pairs...


Evaluating queries: 100%|██████████| 5000/5000 [06:39<00:00, 12.50it/s]



Mean Average Precision (MAP): 0.8454

Mean Precision at Rank (MPR):
  @5: 0.5285
  @10: 0.3847
  @20: 0.2935
  @50: 0.1250
  @100: 0.1025

✅ Metrics saved to metadata

=== EVALUATION COMPLETE ===


## Step 8: Save Results

In [None]:
# Save to Google Drive when running on Colab; otherwise fall back to a local
# folder, so this cell also works in the local environment that the
# dataset-loading step supports.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    # Mount Google Drive
    drive.mount('/content/drive')
    save_dir = '/content/drive/MyDrive/Quora_Embeddings'  # Change this to your preferred path
    save_target = "Google Drive"
else:
    save_dir = os.path.abspath('Quora_Embeddings')
    save_target = "local disk"

# Create directory if it doesn't exist
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    print(f"Created directory: {save_dir}")
else:
    print(f"Directory already exists: {save_dir}")

print(f"\nSaving embeddings and metadata to {save_target}...")

# Save embeddings using joblib
joblib.dump(doc_embeddings, f'{save_dir}/doc_embeddings.joblib')
joblib.dump(query_embeddings, f'{save_dir}/query_embeddings.joblib')

# Save metadata describing how the embeddings were produced
metadata = {
    'model_name': MODEL_NAME,
    'embedding_dim': doc_embeddings.shape[1],
    'num_docs': len(doc_embeddings),
    'num_queries': len(query_embeddings),
    'doc_ids': doc_ids,
    'query_ids': query_ids,
    'normalized': True
}
joblib.dump(metadata, f'{save_dir}/embedding_metadata.joblib')

# Save cleaned texts with IDs using joblib
doc_data = {
    'doc_ids': doc_ids,
    'texts': doc_texts
}
joblib.dump(doc_data, f'{save_dir}/documents_final.joblib')

query_data = {
    'query_ids': query_ids,
    'texts': query_texts
}
joblib.dump(query_data, f'{save_dir}/queries_final.joblib')

# Create summary
summary = f"""
=== PROCESSING COMPLETE ===

Model: {MODEL_NAME}
Documents: {len(doc_embeddings):,}
Queries: {len(query_embeddings):,}
Embedding Dimension: {doc_embeddings.shape[1]}

Files Generated (all in joblib format):
- doc_embeddings.joblib: Document embeddings
- query_embeddings.joblib: Query embeddings
- embedding_metadata.joblib: Metadata
- documents_final.joblib: Cleaned documents with IDs
- queries_final.joblib: Cleaned queries with IDs

Saved to {save_target} at: {save_dir}

✅ All files saved successfully!
"""

print(summary)

# Save summary as text file
with open(f'{save_dir}/processing_summary.txt', 'w') as f:
    f.write(summary)

# Create zip file for easy download; each artifact is stored under its bare
# file name inside the archive.
print(f"\nCreating zip file in {save_target}...")
archive_members = [
    'doc_embeddings.joblib',
    'query_embeddings.joblib',
    'embedding_metadata.joblib',
    'documents_final.joblib',
    'queries_final.joblib',
    'processing_summary.txt',
]
with zipfile.ZipFile(f'{save_dir}/quora_embeddings_joblib.zip', 'w') as zipf:
    for member in archive_members:
        zipf.write(f'{save_dir}/{member}', member)

print(f"✅ Zip file created: {save_dir}/quora_embeddings_joblib.zip")
print(f"\n🎉 Processing complete! Files saved to your {save_target}.")

Mounted at /content/drive
Created directory: /content/drive/MyDrive/Quora_Embeddings

Saving embeddings and metadata to Google Drive...

=== PROCESSING COMPLETE ===

Model: sentence-transformers/all-MiniLM-L6-v2
Documents: 522,768
Queries: 5,000
Embedding Dimension: 384

Files Generated (all in joblib format):
- doc_embeddings.joblib: Document embeddings
- query_embeddings.joblib: Query embeddings
- embedding_metadata.joblib: Metadata
- documents_final.joblib: Cleaned documents with IDs
- queries_final.joblib: Cleaned queries with IDs

Saved to Google Drive at: /content/drive/MyDrive/Quora_Embeddings

✅ All files saved successfully!


Creating zip file in Google Drive...
✅ Zip file created: /content/drive/MyDrive/Quora_Embeddings/quora_embeddings_joblib.zip

🎉 Processing complete! Files saved to your Google Drive.


In [None]:
from google.colab import drive
from sentence_transformers import SentenceTransformer
import joblib
import os

# Mount Google Drive
drive.mount('/content/drive')

# Target directory inside Google Drive
save_dir = '/content/drive/MyDrive/Quora_Embeddings'  # Change this to your preferred path

# Make sure the target directory exists
if os.path.exists(save_dir):
    print(f"Directory already exists: {save_dir}")
else:
    os.makedirs(save_dir)
    print(f"Created directory: {save_dir}")

# 1. Persist the model itself; '/' in the model name is replaced so the name
#    maps to a single directory.
print("\nSaving the Sentence Transformer model...")
model_dir_name = MODEL_NAME.replace('/', '_')
model_save_path = save_dir + '/' + model_dir_name
model.save(model_save_path)
print(f"✅ Model saved to: {model_save_path}")

# 2-4. Assemble the metadata and cleaned-text payloads
metadata = dict(
    model_name=MODEL_NAME,
    model_path=model_save_path,
    embedding_dim=doc_embeddings.shape[1],
    num_docs=len(doc_embeddings),
    num_queries=len(query_embeddings),
    doc_ids=doc_ids,
    query_ids=query_ids,
    normalized=True,
)
doc_data = dict(doc_ids=doc_ids, texts=doc_texts)
query_data = dict(query_ids=query_ids, texts=query_texts)

# Dump every artifact with joblib: embeddings, then metadata, then the texts
print("\nSaving embeddings...")
for payload, file_name in (
    (doc_embeddings, 'doc_embeddings.joblib'),
    (query_embeddings, 'query_embeddings.joblib'),
    (metadata, 'embedding_metadata.joblib'),
    (doc_data, 'documents_final.joblib'),
    (query_data, 'queries_final.joblib'),
):
    joblib.dump(payload, save_dir + '/' + file_name)

# Human-readable summary of what was written
summary = f"""
=== PROCESSING COMPLETE ===

Model: {MODEL_NAME}
Model saved to: {model_save_path}
Documents: {len(doc_embeddings):,}
Queries: {len(query_embeddings):,}
Embedding Dimension: {doc_embeddings.shape[1]}

Files Generated:
- Model directory: {model_dir_name}/
- doc_embeddings.joblib: Document embeddings
- query_embeddings.joblib: Query embeddings
- embedding_metadata.joblib: Metadata
- documents_final.joblib: Cleaned documents
- queries_final.joblib: Cleaned queries

Saved to Google Drive at: {save_dir}

✅ All files saved successfully!
"""

print(summary)

# Write the summary next to the artifacts
with open(save_dir + '/processing_summary.txt', 'w') as f:
    f.write(summary)

print("\n🎉 Processing complete! Model and embeddings saved to your Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory already exists: /content/drive/MyDrive/Quora_Embeddings

Saving the Sentence Transformer model...
✅ Model saved to: /content/drive/MyDrive/Quora_Embeddings/sentence-transformers_all-MiniLM-L6-v2

Saving embeddings...

=== PROCESSING COMPLETE ===

Model: sentence-transformers/all-MiniLM-L6-v2
Model saved to: /content/drive/MyDrive/Quora_Embeddings/sentence-transformers_all-MiniLM-L6-v2
Documents: 522,768
Queries: 5,000
Embedding Dimension: 384

Files Generated:
- Model directory: sentence-transformers_all-MiniLM-L6-v2/
- doc_embeddings.joblib: Document embeddings
- query_embeddings.joblib: Query embeddings
- embedding_metadata.joblib: Metadata
- documents_final.joblib: Cleaned documents
- queries_final.joblib: Cleaned queries

Saved to Google Drive at: /content/drive/MyDrive/Quora_Embeddings

✅ All files saved successfully!


🎉 Processing complete! M