# Quora Dataset Processing and Embedding Generation

This notebook processes the Quora dataset by:
1. Uploading and extracting the dataset files
2. Applying text cleaning methods
3. Generating embeddings using paraphrase-mpnet-base-v2
4. Saving all processed files and models

**Steps Overview:**
- Step 1: Install required packages
- Step 2: Upload Quora dataset from Downloads folder
- Step 3: Extract and save docs, queries, qrels as separate files
- Step 4: Apply text cleaning methods
- Step 5: Generate embeddings using paraphrase-mpnet-base-v2
- Step 6: Save all files and models using joblib
- Step 7: Download all generated files as a single zip archive

## Step 1: Install Required Packages

In [None]:
!pip install sentence-transformers
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install joblib
!pip install nltk
!pip install tqdm

import pandas as pd
import numpy as np
import re
import string
import nltk
import joblib
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import zipfile
import tarfile
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

# Download the NLTK resources used by the text-cleaning step:
#   stopwords -> English stopword list
#   punkt     -> tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer tables that newer NLTK releases look up instead of punkt
#   wordnet   -> dictionary behind WordNetLemmatizer
# Both punkt variants are requested so the notebook runs regardless of the
# NLTK version that pip installed above.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

print("All packages installed successfully!")

## Step 2: Upload Quora Dataset

Upload your Quora dataset file from your Downloads folder. The system will automatically detect the file format and extract accordingly.

In [None]:
# Ask the user for the dataset file through the Colab upload widget.
print("Please upload your Quora dataset file from your Downloads folder:")
uploaded = files.upload()

# Guard against an empty result (e.g. the upload dialog was dismissed) so the
# cell stops with a readable message instead of an IndexError below.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select the Quora dataset file.")

# Get the uploaded file name (only the first uploaded file is processed)
uploaded_file = next(iter(uploaded))
print(f"Uploaded file: {uploaded_file}")

# Extract the file if it's compressed
# NOTE(review): extractall() writes the member paths stored in the archive as-is;
# only upload archives from a source you trust.
if uploaded_file.endswith('.zip'):
    with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
        zip_ref.extractall('quora_dataset')
    print("Zip file extracted successfully!")
elif uploaded_file.endswith(('.tar.gz', '.tgz')):
    with tarfile.open(uploaded_file, 'r:gz') as tar_ref:
        tar_ref.extractall('quora_dataset')
    print("Tar.gz file extracted successfully!")
else:
    # Move the file to quora_dataset directory
    os.makedirs('quora_dataset', exist_ok=True)
    os.rename(uploaded_file, f'quora_dataset/{uploaded_file}')
    print("File moved to quora_dataset directory!")

# List contents of the extracted directory as an indented tree.
# The loop variables are deliberately NOT called `files`: that name is the
# google.colab module used for upload/download, and rebinding it here would
# break files.upload() on a re-run and files.download() in the last step.
print("\nContents of quora_dataset directory:")
for dir_path, _dir_names, file_names in os.walk('quora_dataset'):
    level = dir_path.replace('quora_dataset', '').count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(dir_path)}/")
    subindent = ' ' * 2 * (level + 1)
    for file_name in file_names:
        print(f"{subindent}{file_name}")

## Step 3: Extract and Save Docs, Queries, and Qrels as Separate Files

This step identifies and loads the three main components of the Quora dataset:
- **docs**: The document collection
- **queries**: The search queries
- **qrels**: The relevance judgments (query-document pairs with relevance scores)

In [None]:
def find_files_by_pattern(directory, patterns):
    """Map each pattern name to a file under *directory* whose name matches it.

    Args:
        directory: Root directory that is walked recursively.
        patterns: Dict of ``{name: [substring, ...]}``. A file matches a name
            when its lowercased file name contains any of the substrings.

    Returns:
        Dict of ``{name: file_path}``. Each file is assigned to the first
        matching name only (in the dict's iteration order); when several
        files match the same name, the one visited last is kept.
    """
    no_match = object()
    matches = {}
    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            lowered = filename.lower()
            # First pattern group with a hit claims the file
            label = next(
                (
                    name
                    for name, keywords in patterns.items()
                    if any(keyword in lowered for keyword in keywords)
                ),
                no_match,
            )
            if label is not no_match:
                matches[label] = os.path.join(current_dir, filename)
    return matches

# Keyword lists used to recognise each dataset component from its file name
file_patterns = dict(
    docs=['corpus', 'documents', 'docs', 'collection'],
    queries=['queries', 'query', 'topics'],
    qrels=['qrels', 'relevance', 'judgments', 'rel'],
)

# Locate the component files inside the extracted dataset directory
found_files = find_files_by_pattern('quora_dataset', file_patterns)

# Report one "<component>: <path>" line per detected file
print("Found files:")
for file_type in found_files:
    print(f"{file_type}: {found_files[file_type]}")

# Load the files
def load_file(file_path):
    """Load a dataset file into a DataFrame, picking the parser by extension.

    The extension is compared case-insensitively:
    - ``.tsv``: tab-separated
    - ``.csv``: comma-separated
    - ``.json`` / ``.jsonl``: line-delimited JSON (one object per line)
    - anything else: tab-separated first, comma-separated as a fallback

    Args:
        file_path (str): Path of the file to read.

    Returns:
        pandas.DataFrame: The parsed file contents.
    """
    lowered = file_path.lower()
    if lowered.endswith('.tsv'):
        return pd.read_csv(file_path, sep='\t')
    if lowered.endswith('.csv'):
        return pd.read_csv(file_path)
    if lowered.endswith(('.json', '.jsonl')):
        return pd.read_json(file_path, lines=True)

    # Unknown extension: try to read as tab-separated by default.
    try:
        return pd.read_csv(file_path, sep='\t')
    except Exception:
        # Best-effort fallback; unlike a bare `except:` this still lets
        # KeyboardInterrupt/SystemExit stop the cell.
        return pd.read_csv(file_path)

# Fail fast with a readable message when a component file was not detected;
# otherwise the save step below would stop on a bare KeyError after the other
# files had already been loaded.
required_components = ('docs', 'queries', 'qrels')
missing_components = [name for name in required_components if name not in found_files]
if missing_components:
    raise FileNotFoundError(
        f"Could not find file(s) for: {', '.join(missing_components)}. "
        "Expected file names containing one of the patterns listed in file_patterns."
    )

# Load each file type and show its shape, columns and first rows
datasets = {}
for file_type, file_path in found_files.items():
    print(f"\nLoading {file_type} from {file_path}...")
    datasets[file_type] = load_file(file_path)
    print(f"Shape: {datasets[file_type].shape}")
    print(f"Columns: {list(datasets[file_type].columns)}")
    print(f"First few rows:")
    print(datasets[file_type].head())

# Save as separate TSV files with quora names
print("\nSaving files with Quora naming convention...")
datasets['docs'].to_csv('quora_docs.tsv', sep='\t', index=False)
datasets['queries'].to_csv('quora_queries.tsv', sep='\t', index=False)
datasets['qrels'].to_csv('quora_qrels.tsv', sep='\t', index=False)

print("Files saved successfully:")
print("- quora_docs.tsv")
print("- quora_queries.tsv")
print("- quora_qrels.tsv")

# Display summary statistics
print("\n=== DATASET SUMMARY ===")
print(f"Documents: {len(datasets['docs'])} entries")
print(f"Queries: {len(datasets['queries'])} entries")
print(f"Qrels: {len(datasets['qrels'])} entries")

## Step 4: Text Cleaning Methods

This step applies comprehensive text cleaning to both documents and queries:
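- Convert to lowercase
- Remove URLs and HTML tags
- Remove special characters and numbers
- Remove extra whitespace
- Remove stopwords and tokens of two characters or fewer
- Apply lemmatization
- Remove very short texts (fewer than 3 words after cleaning)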

In [None]:
# Shared resources for clean_text: the WordNet lemmatizer and the English
# stopword list (held in a set because it is only used for membership tests)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalize one text value for embedding.

    Steps, in order: lowercase, strip URLs and HTML tags, drop every character
    that is not an ASCII letter or whitespace, collapse whitespace, tokenize,
    remove stopwords and tokens of two characters or fewer, lemmatize.

    Args:
        text: Input value to clean. Anything that is not a ``str`` (None, NaN,
            numbers, lists, ...) is treated as missing.

    Returns:
        str: Cleaned text; the empty string for missing/non-string input.
    """
    # A type check alone covers None/NaN as well, and unlike `pd.isna(text) or ...`
    # it cannot raise "truth value of an array is ambiguous" on list/array cells.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs (`http\S+` already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and numbers, keep only alphabets and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords / very short tokens, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]

    # Join tokens back
    return ' '.join(tokens)

# Column names that are identifiers even though they may contain a text keyword
_ID_COLUMN_NAMES = {'id', 'qid', 'pid', 'did', 'docid', 'queryid'}


def _looks_like_id(column_name):
    """Return True when the column name denotes an identifier (e.g. ``query_id``)."""
    name = str(column_name).lower()
    return name in _ID_COLUMN_NAMES or name.endswith(('_id', '-id', ' id'))


def _select_text_columns(df, keywords):
    """Pick the free-text columns of *df*.

    A column qualifies when its lowercased name contains one of *keywords*,
    it is not an identifier column and it does not hold numeric data. Without
    the last two checks a column such as ``query_id`` matches the keyword
    ``query``; its cleaned value is always empty, so the short-text filter
    would later drop every row.

    Falls back to the last column when nothing qualifies.
    """
    selected = [
        col for col in df.columns
        if any(keyword in str(col).lower() for keyword in keywords)
        and not _looks_like_id(col)
        and not pd.api.types.is_numeric_dtype(df[col])
    ]
    return selected or [df.columns[-1]]  # Assume last column is text


def _add_cleaned_columns(df, text_columns):
    """Add a ``<col>_cleaned`` column to *df* (in place) for every text column."""
    for col in text_columns:
        print(f"Cleaning column: {col}")
        tqdm.pandas(desc=f"Cleaning {col}")
        df[f'{col}_cleaned'] = df[col].progress_apply(clean_text)


# Apply cleaning to documents
print("Cleaning documents...")
docs_df = datasets['docs'].copy()
text_columns_docs = _select_text_columns(
    docs_df, ['text', 'content', 'body', 'document', 'passage']
)
print(f"Document text columns identified: {text_columns_docs}")
_add_cleaned_columns(docs_df, text_columns_docs)

# Apply cleaning to queries
print("\nCleaning queries...")
queries_df = datasets['queries'].copy()
text_columns_queries = _select_text_columns(
    queries_df, ['text', 'query', 'question', 'title']
)
print(f"Query text columns identified: {text_columns_queries}")
_add_cleaned_columns(queries_df, text_columns_queries)

# Drop rows whose cleaned text has fewer than 3 words (checked for every
# cleaned column), remembering the sizes beforehand for the report below
print("\nFiltering out very short texts...")
original_docs_count, original_queries_count = len(docs_df), len(queries_df)

# Documents
for col in text_columns_docs:
    docs_df = docs_df.loc[docs_df[f'{col}_cleaned'].str.split().str.len().ge(3)]

# Queries
for col in text_columns_queries:
    queries_df = queries_df.loc[queries_df[f'{col}_cleaned'].str.split().str.len().ge(3)]

print(f"Documents: {original_docs_count} -> {len(docs_df)} (removed {original_docs_count - len(docs_df)} short texts)")
print(f"Queries: {original_queries_count} -> {len(queries_df)} (removed {original_queries_count - len(queries_df)} short texts)")

# Write the filtered frames out as TSV files
print("\nSaving cleaned datasets...")
queries_target = 'quora_queries_cleaned.tsv'
docs_df.to_csv('quora_docs_cleaned.tsv', sep='\t', index=False)
queries_df.to_csv(queries_target, sep='\t', index=False)

print(
    "Cleaned files saved:",
    "- quora_docs_cleaned.tsv",
    "- quora_queries_cleaned.tsv",
    sep="\n",
)

# Show up to three before/after pairs per text column as a sanity check
print("\n=== CLEANING EXAMPLES ===")

# Documents: both versions are truncated to 100 characters
print("\nDocument cleaning examples:")
for col in text_columns_docs:
    print(f"\nColumn: {col}")
    pairs = zip(docs_df[col].head(3), docs_df[f'{col}_cleaned'].head(3))
    for original, cleaned in pairs:
        print(f"Original: {original[:100]}...")
        print(f"Cleaned:  {cleaned[:100]}...")
        print("-" * 50)

# Queries: printed in full
print("\nQuery cleaning examples:")
for col in text_columns_queries:
    print(f"\nColumn: {col}")
    pairs = zip(queries_df[col].head(3), queries_df[f'{col}_cleaned'].head(3))
    for original, cleaned in pairs:
        print(f"Original: {original}")
        print(f"Cleaned:  {cleaned}")
        print("-" * 50)

## Step 5: Generate Embeddings using paraphrase-mpnet-base-v2

This step:
1. Loads the pre-trained paraphrase-mpnet-base-v2 model
2. Generates embeddings for cleaned documents and queries
3. Saves the embeddings and model for later use

In [None]:
def _texts_and_ids(df, text_columns):
    """Return ``(texts, ids)`` for the rows of *df*.

    ``texts`` joins the ``<col>_cleaned`` values of every text column with a
    space. ``ids`` are the values of the first column, unless that column is
    itself the first text column, in which case the row index is used.
    Column-wise access replaces a per-row ``iterrows()`` loop.
    """
    cleaned_columns = [df[f'{col}_cleaned'] for col in text_columns]
    texts = [' '.join(map(str, values)) for values in zip(*cleaned_columns)]
    first_column = df.columns[0]
    if first_column != text_columns[0]:
        ids = df[first_column].tolist()
    else:
        ids = df.index.tolist()
    return texts, ids


# Load the pre-trained model
print("Loading paraphrase-mpnet-base-v2 model...")
model = SentenceTransformer('paraphrase-mpnet-base-v2')
print("Model loaded successfully!")

# Prepare texts for embedding
print("\nPreparing texts for embedding...")

# For documents and queries, combine all cleaned text columns
doc_texts, doc_ids = _texts_and_ids(docs_df, text_columns_docs)
query_texts, query_ids = _texts_and_ids(queries_df, text_columns_queries)

# Stop here when cleaning/filtering left nothing to embed: the save step reads
# doc_embeddings.shape[1], which does not exist for an empty result.
if not doc_texts or not query_texts:
    raise ValueError(
        f"Nothing to embed: {len(doc_texts)} documents and {len(query_texts)} queries "
        "remain after cleaning. Check the detected text columns and the short-text filter."
    )

print(f"Prepared {len(doc_texts)} documents and {len(query_texts)} queries for embedding")

# Encode the cleaned document texts in batches of 32, returning a NumPy array
print("\nGenerating document embeddings...")
doc_embeddings = model.encode(
    doc_texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"Document embeddings shape: {doc_embeddings.shape}")

# Same settings for the cleaned query texts
print("\nGenerating query embeddings...")
query_embeddings = model.encode(
    query_texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"Query embeddings shape: {query_embeddings.shape}")

# Pair every ID and text with its embedding stored as a plain Python list
doc_embeddings_df = pd.DataFrame(dict(
    doc_id=doc_ids,
    text=doc_texts,
    embedding=doc_embeddings.tolist(),
))
query_embeddings_df = pd.DataFrame(dict(
    query_id=query_ids,
    text=query_texts,
    embedding=query_embeddings.tolist(),
))

print("\nEmbedding generation completed successfully!")
print(f"Document embeddings: {doc_embeddings_df.shape}")
print(f"Query embeddings: {query_embeddings_df.shape}")

## Step 6: Save All Files and Models using Joblib

This step saves:
- The pre-trained model (loaded as-is; no training happens in this notebook)
- Document and query embeddings as matrices
- All processed datasets
- Metadata for later use

In [None]:
# Persist the model first, then every artifact group with joblib
print("Saving model and embeddings...")

# SentenceTransformer writes itself into a directory
model.save('quora_paraphrase_mpnet_model')
print("Model saved to: quora_paraphrase_mpnet_model/")

# (heading printed after the group is written, [(object, target file), ...])
saved_groups = (
    ("Embedding matrices saved:", (
        (doc_embeddings, 'quora_doc_embeddings_matrix.joblib'),
        (query_embeddings, 'quora_query_embeddings_matrix.joblib'),
    )),
    ("Embedding dataframes saved:", (
        (doc_embeddings_df, 'quora_doc_embeddings_df.joblib'),
        (query_embeddings_df, 'quora_query_embeddings_df.joblib'),
    )),
    ("Processed datasets saved:", (
        (docs_df, 'quora_docs_processed.joblib'),
        (queries_df, 'quora_queries_processed.joblib'),
        (datasets['qrels'], 'quora_qrels.joblib'),
    )),
)
for heading, artifacts in saved_groups:
    for payload, target in artifacts:
        joblib.dump(payload, target)
    print(heading)
    for _, target in artifacts:
        print(f"- {target}")

# Bookkeeping needed to interpret the saved matrices later on
metadata = dict(
    model_name='paraphrase-mpnet-base-v2',
    embedding_dim=doc_embeddings.shape[1],
    num_documents=len(doc_texts),
    num_queries=len(query_texts),
    doc_text_columns=text_columns_docs,
    query_text_columns=text_columns_queries,
    doc_ids=doc_ids,
    query_ids=query_ids,
)

joblib.dump(metadata, 'quora_metadata.joblib')
print("Metadata saved: quora_metadata.joblib")

# Create a human-readable summary of everything this notebook produced.
# The model entry says "Pre-trained" because the model is only loaded and
# saved here, never trained.
summary = f"""
=== QUORA DATASET PROCESSING SUMMARY ===

Files Generated:
1. quora_docs.tsv - Original documents
2. quora_queries.tsv - Original queries
3. quora_qrels.tsv - Relevance judgments
4. quora_docs_cleaned.tsv - Cleaned documents
5. quora_queries_cleaned.tsv - Cleaned queries
6. quora_paraphrase_mpnet_model/ - Pre-trained model directory
7. quora_doc_embeddings_matrix.joblib - Document embeddings matrix
8. quora_query_embeddings_matrix.joblib - Query embeddings matrix
9. quora_doc_embeddings_df.joblib - Document embeddings with metadata
10. quora_query_embeddings_df.joblib - Query embeddings with metadata
11. quora_docs_processed.joblib - Processed documents dataframe
12. quora_queries_processed.joblib - Processed queries dataframe
13. quora_qrels.joblib - Relevance judgments dataframe
14. quora_metadata.joblib - Processing metadata

Dataset Statistics:
- Documents: {len(doc_texts)}
- Queries: {len(query_texts)}
- Qrels: {len(datasets['qrels'])}
- Embedding dimension: {doc_embeddings.shape[1]}
- Model: paraphrase-mpnet-base-v2

Text Columns Processed:
- Document columns: {text_columns_docs}
- Query columns: {text_columns_queries}

Processing Steps Applied:
1. Text cleaning (lowercase, remove special chars, stopwords, lemmatization)
2. Filtering (removed texts with <3 words)
3. Embedding generation using paraphrase-mpnet-base-v2
4. File saving with joblib for efficient loading

Next Steps:
- Use the evaluation notebook to assess embedding quality
- Target MAP score: >= 0.7
- All files are ready for download and further processing
"""

# Explicit encoding: the interpolated column names come from the dataset and
# are not guaranteed to be ASCII.
with open('quora_processing_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary)

print(summary)
print("\n=== PROCESSING COMPLETED SUCCESSFULLY ===")
print("All files have been saved and are ready for download!")
print("\nTo download files, run the next cell.")

## Step 7: Download Generated Files

Run this cell to download all generated files to your local machine.

In [None]:
# Create a zip file with all generated files
import zipfile
import os
# Re-imported so this cell does not depend on the name `files` still being
# bound to the Colab module when it runs.
from google.colab import files

print("Creating zip file with all generated files...")

# List of files to include in the zip
files_to_zip = [
    'quora_docs.tsv',
    'quora_queries.tsv',
    'quora_qrels.tsv',
    'quora_docs_cleaned.tsv',
    'quora_queries_cleaned.tsv',
    'quora_doc_embeddings_matrix.joblib',
    'quora_query_embeddings_matrix.joblib',
    'quora_doc_embeddings_df.joblib',
    'quora_query_embeddings_df.joblib',
    'quora_docs_processed.joblib',
    'quora_queries_processed.joblib',
    'quora_qrels.joblib',
    'quora_metadata.joblib',
    'quora_processing_summary.txt'
]

# Create zip file; entries that do not exist on disk are skipped silently
with zipfile.ZipFile('quora_processed_files.zip', 'w') as zipf:
    for file in files_to_zip:
        if os.path.exists(file):
            zipf.write(file)
            print(f"Added {file} to zip")

    # Add model directory, keeping paths relative to the working directory.
    # The walk variables must not be named `files`: rebinding that name to a
    # list of file names makes the files.download() call below fail.
    if os.path.exists('quora_paraphrase_mpnet_model'):
        for dir_path, _dir_names, file_names in os.walk('quora_paraphrase_mpnet_model'):
            for file_name in file_names:
                file_path = os.path.join(dir_path, file_name)
                arcname = os.path.relpath(file_path, '.')
                zipf.write(file_path, arcname)
        print("Added model directory to zip")

print("\nZip file created: quora_processed_files.zip")
print("Downloading...")

# Download the zip file
files.download('quora_processed_files.zip')

print("\nDownload completed!")
print("\nFiles included in the download:")
for file in files_to_zip:
    if os.path.exists(file):
        size = os.path.getsize(file) / (1024*1024)  # Size in MB
        print(f"- {file} ({size:.2f} MB)")

print("\nModel directory: quora_paraphrase_mpnet_model/")
print("\nAll files are now available in your Downloads folder!")