# Data Preprocessing Sanity check
## Choose dataset, preprocessing method, and review results

This notebook provides a unified interface to:
1. Load data
2. Choose a preprocessing method (basic, **Llama**, or **Gemini API**)
3. **Test with a small sample first** (NUM_SAMPLES = 5)
4. Review and compare results
5. Run on full dataset when satisfied

In [1]:
# Python 3.8 compatibility shim for importlib.resources.files
import sys

if sys.version_info < (3, 9):
    try:
        import importlib.resources as _ir
        # Requires the backport package: `pip install importlib_resources`
        import importlib_resources as _backport
        if not hasattr(_ir, "files"):
            _ir.files = _backport.files
            print("Patched importlib.resources.files using importlib_resources.")
    except ImportError as e:
        print("WARNING: importlib_resources is not installed.")
        print("Install it in this environment with: pip install importlib_resources")
else:
    # On Python >= 3.9, this shim is not needed but harmless
    import importlib.resources as _ir

print("Python version:", sys.version)


Patched importlib.resources.files using importlib_resources.
Python version: 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]


In [2]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import pprint
import google.generativeai as genai
from dotenv import load_dotenv

# Add project root to path
import os
import sys
from pathlib import Path

# Find the project root by walking up until we see pyproject.toml
project_root = Path.cwd()
while not (project_root / "pyproject.toml").exists() and project_root.parent != project_root:
    project_root = project_root.parent

# Add repo root and src/ to sys.path
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))

print("Project root:", project_root)
print("Python path entry added:", project_root / "src")

# Import the preprocessing module
from mosaic.preprocessing.preprocessing import (
    load_data,
    basic_preprocess,
    preprocess_with_local_llama,  # For local Llama model
    preprocess_with_gemini_api,   # For Google Gemini API
    compare_cleaning_results
)


print("Preprocessing module loaded successfully")



Project root: c:\Users\andre\projects\MOSAIC
Python path entry added: c:\Users\andre\projects\MOSAIC\src


  from .autonotebook import tqdm as notebook_tqdm


Preprocessing module loaded successfully


## Configuration

In [3]:
notebook_dir = Path.cwd()
project_root = notebook_dir.parent.parent
DATA_DIR = project_root / "DATA"
print(f"DATA_DIR set to: {DATA_DIR}")
if not DATA_DIR.exists():
    print("WARNING: DATA folder not found. Check your folder structure.")
else:
    #print the available raw datasets in the DATA_DIR/raw folder
    RAW_DIR = DATA_DIR / "raw"
    PREPROC_DIR = DATA_DIR / "preprocessed"
    available_datasets = [f.name for f in RAW_DIR.glob("*.csv")]
    print(f"Available raw datasets: {available_datasets}")

DATA_DIR set to: c:\Users\andre\projects\MOSAIC\DATA
Available raw datasets: ['meditation_reflections.csv', 'trial_200_topics.csv', 'trial_cleaned_raw.csv']


In [6]:
DATASETS = {}

if not DATA_DIR.exists():
    print("WARNING: DATA folder not found. Check your folder structure.")
elif not RAW_DIR.exists():
    print(f"WARNING: 'raw' folder not found inside DATA. ({RAW_DIR})")
else:
    # Find all CSVs that end with '_raw.csv'
    raw_files = list(RAW_DIR.glob("*_raw.csv"))
    
    print(f"Available raw datasets: {[f.name for f in raw_files]}")

    for file_path in raw_files:
        filename = file_path.name
        
        # Extract the name (remove '_raw.csv' from the end)
        dataset_name = filename.rsplit('_raw.csv', 1)[0]
        
        # Build the dictionary entry dynamically
        DATASETS[dataset_name] = {
            'input': filename,
            'output_api': f"{dataset_name}_cleaned_API.csv",
            'output_local': f"{dataset_name}_cleaned_llama.csv"
        }

    print("\n--- Generated Configuration ---")
    pprint.pprint(DATASETS)

Available raw datasets: ['meditation_reflections_raw.csv', 'trial_cleaned_raw.csv']

--- Generated Configuration ---
{'meditation_reflections': {'input': 'meditation_reflections_raw.csv',
                            'output_api': 'meditation_reflections_cleaned_API.csv',
                            'output_local': 'meditation_reflections_cleaned_llama.csv'},
 'trial_cleaned': {'input': 'trial_cleaned_raw.csv',
                   'output_api': 'trial_cleaned_cleaned_API.csv',
                   'output_local': 'trial_cleaned_cleaned_llama.csv'}}


## Step 1: Select Dataset and Preprocessing Method

### WORKFLOW FOR COMPARING MODELS:
1. Set `NUM_SAMPLES = 5` (or 10) to test
2. Run each method on the same sample
3. Compare results in Step 4
4. Choose best method
5. Set `NUM_SAMPLES = None` to run on full data

In [7]:
# ============================================
#  SELECT DATASET AND METHOD
# ============================================

# Choose dataset (see above available)
DATASET_CHOICE = 'meditation_reflections'

# Choose preprocessing method:
#   'local_llama'  → Local Llama 3 model (needs GPU: Metal/CUDA)
#   'gemini_api'   → Google Gemini API (needs GOOGLE_API_KEY)
METHOD_CHOICE = 'gemini_api'

# FOR TESTING: Set to a number (5, 10, etc) or None for full dataset
# TIP: Test with 5-10 samples first to compare methods, then set to None
NUM_SAMPLES = 5

print(f"\n{'='*60}")
print(f"CONFIGURATION")
print(f"{'='*60}")
print(f"Dataset:        {DATASET_CHOICE}")
print(f"Method:         {METHOD_CHOICE}")
print(f"Samples:        {NUM_SAMPLES if NUM_SAMPLES else 'ALL (FULL DATASET)'}")
print(f"{'='*60}")

if NUM_SAMPLES:
    print(f"\nRunning in TEST mode with {NUM_SAMPLES} samples")
    print(f"  (Good for checking if everything works before full run)")
else:
    print(f"\nRunning on FULL DATASET")
    print(f"  (This may take a while)")


if METHOD_CHOICE == 'gemini_api':
    # Load .env file for API key
    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")
    
    if not api_key:
        print("Error: GOOGLE_API_KEY not found in .env file.")
        sys.exit(1)
    
    # Configure Gemini
    genai.configure(api_key=api_key)
    print("Google Gemini API configured successfully.")
    print(f"API Key found: {api_key[:5]}...*****")
    print("\n--- Available Gemini Models (generateContent) ---")
    
    # List Models
    try:
        found_any = False
        for m in genai.list_models():
            # Only show models that can generate text (chat models)
            if 'generateContent' in m.supported_generation_methods:
                print(f"  • {m.name}")
                found_any = True
        
        if not found_any:
            print("No models found. Check your API key permissions.")
            
    except Exception as e:
        print(f"Error listing models: {e}")


CONFIGURATION
Dataset:        meditation_reflections
Method:         gemini_api
Samples:        5

Running in TEST mode with 5 samples
  (Good for checking if everything works before full run)
Google Gemini API configured successfully.
API Key found: AIzaS...*****

--- Available Gemini Models (generateContent) ---
No models found. Check your API key permissions.


## Step 2: Load Data

In [8]:
# Build file paths
dataset_config = DATASETS[DATASET_CHOICE]
input_path = os.path.join(RAW_DIR, dataset_config['input'])

# Load data
df = load_data(input_path, text_column='reflection_answer', remove_na=True)

if df is not None:
    print(f"\nSuccessfully loaded {len(df)} records")
    print(f"DataFrame shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst 5 rows (preview):")
    display(df.head(5))
else:
    print("Failed to load data. Check file path and try again.")

Loaded 202 reports from meditation_reflections_raw.csv

Successfully loaded 202 records
DataFrame shape: (202, 1)
Columns: ['reflection_answer']

First 5 rows (preview):


Unnamed: 0,reflection_answer
0,I felt anchored in the physical. That groundin...
1,I noticed the quality of contact between my ha...
2,There was a sense of the body as alive and res...
3,The boundaries between self and environment bl...
4,I lost track of how long I had been sitting. T...


In [None]:
# Sentence-split only: cut each row into individual sentences, one row per sentence
# Run this cell INSTEAD of Step 3 to bypass Gemini/Llama
from mosaic.preprocessing.preprocessing import split_sentences

if df is None:
    print("No data loaded. Run Step 2 first.")
else:
    df_slice = df.head(NUM_SAMPLES).copy() if NUM_SAMPLES else df.copy()
    texts = df_slice["reflection_answer"].astype(str).tolist()
    sentences, doc_map = split_sentences(texts)
    filtered = [(s.strip(), doc_map[i]) for i, s in enumerate(sentences) if s.strip()]
    df_to_process = pd.DataFrame({
        "reflection_answer": [texts[i] for _, i in filtered],
        "cleaned_reflection": [s for s, _ in filtered],
    })
    print(f"Sentence-split complete: {len(df_slice)} rows -> {len(df_to_process)} sentences")
    print(f"Columns: {df_to_process.columns.tolist()}")

## Step 3: Run Preprocessing

In [9]:
if df is None:
    print("No data loaded. Run Step 2 first.")
else:
    df_to_process = df.head(NUM_SAMPLES).copy() if NUM_SAMPLES else df.copy()
    
    # ============================================
    # LOCAL LLAMA METHOD (using Llama 3)
    # ============================================
    if METHOD_CHOICE == 'local_llama':
        output_path = os.path.join(PREPROC_DIR, dataset_config['output_local'])
        if NUM_SAMPLES:
            output_path = output_path.replace('.csv', f'_{NUM_SAMPLES}_test.csv')
        
        df_to_process = preprocess_with_local_llama(
            csv_path=input_path,
            output_path=output_path,
            text_column='reflection_answer',
            num_samples=NUM_SAMPLES
        )
    
    # ============================================
    # GEMINI API METHOD
    # ============================================
    elif METHOD_CHOICE == 'gemini_api':
        print("Running GEMINI API preprocessing...")
        print(f"(Using Google Gemini with batch processing)\n")
        
        output_path = os.path.join(PREPROC_DIR, dataset_config['output_api'])
        if NUM_SAMPLES:
            output_path = output_path.replace('.csv', f'_{NUM_SAMPLES}_test.csv')
        
        df_to_process = preprocess_with_gemini_api(
            csv_path=input_path,
            output_path=output_path,
            text_column='reflection_answer',
            batch_size=20,
            num_samples=NUM_SAMPLES,
            model_name="gemini-2.5-flash-lite"
        )
        print(f"\nGemini API preprocessing complete")
    
    else:
        print(f"Unknown method: {METHOD_CHOICE}")
        print(f"Choose from: 'local_llama', 'gemini_api'")

Running GEMINI API preprocessing...
(Using Google Gemini with batch processing)

[OK] Using specified Gemini model: gemini-2.5-flash-lite

GEMINI API PREPROCESSING
Input file: meditation_reflections_raw.csv
Model: gemini-2.5-flash-lite
Delay between texts: 2s
Max text length: No limit (process all reports)
Processing: 5 reports

PRE-FLIGHT CHECK: Gemini API

MODEL:
  Selected model:        gemini-2.0-flash-lite (default)
  Rate limit:            10 requests/min, 1000/day

REPORT STATISTICS:
  Total reports:         5
  Total characters:      440
  Average length:        88 chars
  Longest report:        101 chars

API SETTINGS:
  Delay between calls:   2s

TIME ESTIMATES:
  Estimated time:        ~0.3 minutes

QUOTA ANALYSIS:
  Model limit:           10 requests/minute
  Your effective rate:   ~15.0 requests/minute


  RECOMMENDATIONS:
    - Use --delay 7 or higher
    - Errors will be marked and can be retried with --retry-failed

[OK] Ready to process 5 reports!


Processing 5 texts 

Gemini API: 100%|██████████| 5/5 [00:23<00:00,  4.64s/it]


SUMMARY
Total reports: 5
Successfully processed: 5
Errors: 0
Skipped (too long): 0
Output saved to: c:\Users\andre\projects\MOSAIC\DATA\preprocessed\meditation_reflections_cleaned_API_5_test.csv


Gemini API preprocessing complete





## Step 4: Review Results

### Summary

In [10]:
if df_to_process is not None and 'cleaned_reflection' in df_to_process.columns:
    print(f"\n{'='*80}")
    print(f"RESULTS SUMMARY")
    print(f"{'='*80}")
    print(f"Total processed: {len(df_to_process)}")
    print(f"Columns: {df_to_process.columns.tolist()}")
    print(f"\nDataFrame Info:")
    print(df_to_process.info())


RESULTS SUMMARY
Total processed: 5
Columns: ['reflection_answer', 'cleaned_reflection']

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reflection_answer   5 non-null      object
 1   cleaned_reflection  5 non-null      object
dtypes: object(2)
memory usage: 208.0+ bytes
None


### View Full Results Table

In [11]:
# Show all rows for review
print(f"\nAll {len(df_to_process)} processed records:")
display(df_to_process)


All 5 processed records:


Unnamed: 0,reflection_answer,cleaned_reflection
0,I felt anchored in the physical. That groundin...,I felt anchored in the physical. That groundin...
1,I noticed the quality of contact between my ha...,I noticed the quality of contact between my ha...
2,There was a sense of the body as alive and res...,There was a sense of the body as alive and res...
3,The boundaries between self and environment bl...,The boundaries between self and environment bl...
4,I lost track of how long I had been sitting. T...,I lost track of how long I had been sitting. T...


### Side-by-Side Comparison (Original vs Cleaned)

In [12]:
if 'reflection_answer' in df_to_process.columns and 'cleaned_reflection' in df_to_process.columns:
    print(f"\n{'='*80}")
    print("ORIGINAL vs CLEANED COMPARISON")
    print(f"{'='*80}\n")
    
    for i in range(min(5, len(df_to_process))):
        print(f"[Record {i+1}]")
        print(f"ORIGINAL:")
        original = str(df_to_process['reflection_answer'].iloc[i])
        print(f"  {original[:200]}..." if len(original) > 200 else f"  {original}")
        print(f"\nCLEANED:")
        cleaned = str(df_to_process['cleaned_reflection'].iloc[i])
        print(f"  {cleaned[:200]}..." if len(cleaned) > 200 else f"  {cleaned}")
        print(f"{'-'*80}\n")


ORIGINAL vs CLEANED COMPARISON

[Record 1]
ORIGINAL:
  I felt anchored in the physical. That grounding allowed the mind to settle.

CLEANED:
  I felt anchored in the physical. That grounding allowed the mind to settle.
--------------------------------------------------------------------------------

[Record 2]
ORIGINAL:
  I noticed the quality of contact between my hands. Were they touching? The sensation was ambiguous.

CLEANED:
  I noticed the quality of contact between my hands. Were they touching? The sensation was ambiguous.
--------------------------------------------------------------------------------

[Record 3]
ORIGINAL:
  There was a sense of the body as alive and responsive. It was not inert.

CLEANED:
  There was a sense of the body as alive and responsive. It was not inert.
--------------------------------------------------------------------------------

[Record 4]
ORIGINAL:
  The boundaries between self and environment blurred. I was not sure where I ended and the room 

### Statistics

In [13]:
if 'reflection_answer' in df_to_process.columns and 'cleaned_reflection' in df_to_process.columns:
    print("\n" + "="*80)
    print("TEXT LENGTH STATISTICS")
    print("="*80 + "\n")
    
    original_lengths = df_to_process['reflection_answer'].astype(str).str.len()
    cleaned_lengths = df_to_process['cleaned_reflection'].astype(str).str.len()
    
    print(f"Original texts:")
    print(f"  Mean length: {original_lengths.mean():.0f} characters")
    print(f"  Min length: {original_lengths.min()} characters")
    print(f"  Max length: {original_lengths.max()} characters")
    
    print(f"\nCleaned texts:")
    print(f"  Mean length: {cleaned_lengths.mean():.0f} characters")
    print(f"  Min length: {cleaned_lengths.min()} characters")
    print(f"  Max length: {cleaned_lengths.max()} characters")
    
    print(f"\nDifference (cleaned - original):")
    diff = (cleaned_lengths - original_lengths)
    print(f"  Mean: {diff.mean():.0f} characters")
    print(f"  Min: {diff.min()} characters")
    print(f"  Max: {diff.max()} characters")


TEXT LENGTH STATISTICS

Original texts:
  Mean length: 88 characters
  Min length: 72 characters
  Max length: 101 characters

Cleaned texts:
  Mean length: 88 characters
  Min length: 72 characters
  Max length: 101 characters

Difference (cleaned - original):
  Mean: 0 characters
  Min: 0 characters
  Max: 0 characters


## Step 5: Run Full Dataset (when satisfied)

After testing with `NUM_SAMPLES = 5` and reviewing results above:

1. **If results look good**: Uncomment code below
2. **Change `NUM_SAMPLES = 5` to `NUM_SAMPLES = None`**
3. **Re-run from Step 1 through Step 3** to process full dataset

In [11]:


print("\nTo run the full dataset:")
print("  1. Go to Step 1 (Select Dataset and Method)")
print("  2. Change NUM_SAMPLES = 5 to NUM_SAMPLES = None")
print("  3. Run cells in Steps 1 through 3 again")
print("  4. Review final results in Step 4")


To run the full dataset:
  1. Go to Step 1 (Select Dataset and Method)
  2. Change NUM_SAMPLES = 5 to NUM_SAMPLES = None
  3. Run cells in Steps 1 through 3 again
  4. Review final results in Step 4


### Run basic (basic preprocessing and divide into sentences)

In [None]:
df_to_process['cleaned_reflection']

KeyError: 'reflection_answer'

In [None]:

print("Running BASIC preprocessing...")
texts = df_to_process['cleaned_reflection'].tolist()
df_to_process = basic_preprocess(texts, split_into_sentences=True, min_words=2)
print(f"Basic preprocessing complete")

#show the first 5 rows after basic preprocessing
print(f"\nFirst 5 rows after BASIC preprocessing:")
display(df_to_process.head(5))
    

Running BASIC preprocessing...


KeyError: 'reflection_answer'