# Preprocessing Notebook

This notebook handles data preprocessing for the Cirq-RAG-Code-Assistant project.

## Purpose
- Fetch quantum code from GitHub repositories
- Load and clean knowledge base data
- Process Cirq code snippets
- Generate descriptions for code samples
- Prepare data for embedding generation
- Organize knowledge base structure

## Usage
Import preprocessing functions from `src.data` and use them to process your data.


## 1. Setup and Imports

Import the necessary modules for data fetching, preprocessing, and loading.


In [None]:
# Import data processing modules
from pathlib import Path
from src.data.fetcher import DatasetFetcher
from src.data.preprocessor import DataPreprocessor
from src.data.description_generator import DescriptionGenerator
from src.data.dataset_loader import DatasetLoader

# Set up paths: the dataset files used throughout this notebook live under DATA_DIR
DATA_DIR = Path("data/datasets")
# Create the directory (and any missing parents); no error if it already exists
DATA_DIR.mkdir(parents=True, exist_ok=True)

print("✅ Imports successful!")


## 2. Fetch Data from GitHub

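Fetch Cirq code samples from GitHub. The fetcher clones the source repositories into `repos/` if they are not already present, then writes the extracted samples to the dataset directory.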


In [None]:
# Initialize fetcher
fetcher = DatasetFetcher(
    repos_dir="repos",  # Directory to clone repositories
    output_dir=DATA_DIR,  # Output directory for extracted data
)

# Fetch code from all repositories
# Note: This will clone repositories if they don't exist
# Set force_clone=True to re-clone existing repositories
# The output filename is the one the loading and preprocessing cells read from DATA_DIR
output_file = fetcher.fetch_all(
    output_filename="quantum_code_samples_filtered.jsonl",
    force_clone=False,  # Set to True to re-clone
    min_code_length=50,
    max_code_length=50000,
)

print(f"✅ Data fetched and saved to: {output_file}")


## 3. Load and Inspect Dataset

Load the dataset and view statistics.


In [None]:
# Load dataset
dataset_path = DATA_DIR / "quantum_code_samples_filtered.jsonl"
loader = DatasetLoader(dataset_path)

# Print statistics
loader.print_stats()

# Get some sample entries
# NOTE(review): seed=42 presumably makes the selection reproducible - confirm in DatasetLoader
samples = loader.sample(3, seed=42)
print("\n📋 Sample entries:")
for i, entry in enumerate(samples, 1):
    # Look the code up once; it is used for both the length and the preview
    code = entry.get('code', '')
    print(f"\n--- Sample {i} ---")
    print(f"Framework: {entry.get('framework')}")
    print(f"File: {entry.get('file')}")
    print(f"Code length: {len(code)} characters")
    # Only the first 200 characters are shown to keep the output short
    print(f"Code preview: {code[:200]}...")


## 4. Preprocess Dataset

Clean and validate the dataset, remove duplicates, and extract metadata.


In [None]:
# Initialize preprocessor
# The code-length bounds match the values passed to fetch_all() in the fetch cell
preprocessor = DataPreprocessor(
    min_code_length=50,
    max_code_length=50000,
    min_lines=5,
    max_lines=1000,
    remove_duplicates=True,
    validate_syntax=True,
)

# Preprocess dataset: read the fetched samples, write the cleaned ones alongside them
input_file = DATA_DIR / "quantum_code_samples_filtered.jsonl"
output_file = DATA_DIR / "quantum_code_samples_preprocessed.jsonl"

stats = preprocessor.preprocess_dataset(
    input_path=input_file,
    output_path=output_file,
    add_metadata=True,
)

# stats is indexed by the 'processed' key for the summary line
print(f"✅ Preprocessing complete! Processed {stats['processed']} entries.")


## 5. Generate Descriptions

Add natural language descriptions to code samples.


In [None]:
# Initialize description generator
# Set use_ml=True to use ML-based summarization (requires transformers)
generator = DescriptionGenerator(
    use_ml=False,  # Set to True for ML-enhanced descriptions
    ml_model="facebook/bart-large-cnn",
    device="auto",  # "auto", "cpu", or "cuda"
)

# Generate descriptions: read the preprocessed samples, write the described dataset
input_file = DATA_DIR / "quantum_code_samples_preprocessed.jsonl"
output_file = DATA_DIR / "quantum_dataset_with_descriptions.jsonl"

desc_stats = generator.add_descriptions_to_dataset(
    input_path=input_file,
    output_path=output_file,
    use_ml=False,  # Override instance setting if needed
    batch_size=100,
)

# desc_stats is indexed by the 'processed' key for the summary line
print(f"✅ Descriptions generated! Processed {desc_stats['processed']} entries.")


## 6. Verify Final Dataset

Load and verify the final preprocessed dataset with descriptions.


In [None]:
# Load final dataset
final_dataset = DatasetLoader(DATA_DIR / "quantum_dataset_with_descriptions.jsonl")

# Print statistics
final_dataset.print_stats()

# View a sample entry with description
samples = final_dataset.sample(1, seed=42)
# Guard against an empty result so the indexing below cannot fail
if samples:
    entry = samples[0]
    print("\n📋 Sample entry with description:")
    print(f"Framework: {entry.get('framework')}")
    print(f"File: {entry.get('file')}")
    print("\nDescription:")
    print(entry.get('description', 'No description'))
    print("\nMetadata:")
    # Entries without a 'metadata' key simply show an empty metadata section
    if 'metadata' in entry:
        for key, value in entry['metadata'].items():
            print(f"  - {key}: {value}")


## 7. View Cirq Samples

View and analyze Cirq samples from the dataset.


In [None]:
# Get all Cirq samples (all entries should be Cirq)
cirq_samples = final_dataset.get_by_framework("Cirq")
print(f"Found {len(cirq_samples)} Cirq samples")

# View a Cirq sample
# Guard against an empty result so the indexing below cannot fail
if cirq_samples:
    sample = cirq_samples[0]
    print("\n📋 Cirq Sample:")
    print(f"File: {sample.get('file')}")
    # Description and code are truncated to keep the output short
    print(f"Description: {sample.get('description', 'No description')[:200]}...")
    print("\nCode preview:")
    print(sample.get('code', '')[:300] + "...")


## 8. Complete Pipeline

Run the complete preprocessing pipeline in one go.


In [None]:
# Complete preprocessing pipeline
def run_preprocessing_pipeline(
    fetch_data: bool = False,
    generate_descriptions: bool = True,
    use_ml: bool = False,
) -> None:
    """
    Run the complete data preprocessing pipeline.

    The steps run in order and hand data to each other through JSONL files
    under ``DATA_DIR``: fetch (optional) -> preprocess -> describe (optional).
    Preprocessing always runs, so the fetched-samples file must already exist
    when ``fetch_data`` is False.

    Args:
        fetch_data: Whether to fetch data from GitHub
        generate_descriptions: Whether to generate descriptions
        use_ml: Whether to use ML for description generation
    """
    # File names shared between steps; each step reads what the previous one wrote
    fetched_name = "quantum_code_samples_filtered.jsonl"
    fetched_path = DATA_DIR / fetched_name
    preprocessed_path = DATA_DIR / "quantum_code_samples_preprocessed.jsonl"
    described_path = DATA_DIR / "quantum_dataset_with_descriptions.jsonl"

    # Step 1: Fetch data (optional, if not already done)
    if fetch_data:
        print("Step 1: Fetching data from GitHub...")
        fetcher = DatasetFetcher(output_dir=DATA_DIR)
        # Name the output explicitly so step 2 reads exactly the file written
        # here, instead of relying on fetch_all()'s default filename
        fetcher.fetch_all(output_filename=fetched_name)

    # Step 2: Preprocess data
    print("\nStep 2: Preprocessing data...")
    preprocessor = DataPreprocessor()
    preprocessor.preprocess_dataset(
        input_path=fetched_path,
        output_path=preprocessed_path,
    )

    # Step 3: Generate descriptions
    if generate_descriptions:
        print("\nStep 3: Generating descriptions...")
        generator = DescriptionGenerator(use_ml=use_ml)
        generator.add_descriptions_to_dataset(
            input_path=preprocessed_path,
            output_path=described_path,
        )

    print("\n✅ Pipeline complete!")

# Uncomment to run the complete pipeline:
# run_preprocessing_pipeline(fetch_data=False, generate_descriptions=True, use_ml=False)
