In [1]:
# --- ENVIRONMENT & LIBRARY SETUP ---
import os
import sys
import pandas as pd
import time
from tqdm.notebook import tqdm
import ast

print("=== TEST DATA PREPROCESSING SETUP ===")
print(f"Python version: {sys.version.split()[0]}")
print(f"Working directory: {os.getcwd()}")

# Validate Kaggle environment
assert '/kaggle/' in os.getcwd(), "This notebook must run in Kaggle environment!"
print("✓ Confirmed running in Kaggle environment")

# Install required packages
print("Installing OpenAI package...")
!pip install -q openai

from openai import OpenAI
print("✓ Libraries imported successfully")

=== TEST DATA PREPROCESSING SETUP ===
Python version: 3.11.11
Working directory: /kaggle/working
✓ Confirmed running in Kaggle environment
Installing OpenAI package...
✓ Libraries imported successfully


In [2]:
# --- API SETUP ---
print("Setting up OpenRouter API...")

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("OPENROUTER_API_KEY")
    if api_key:
        print("✓ OpenRouter API key loaded from Kaggle Secrets")
    else:
        raise ValueError("No API key found")
except Exception as e:
    print(f"❌ ERROR: Could not load API key: {e}")
    print("Please add OPENROUTER_API_KEY to Kaggle Secrets")
    raise

# Initialize OpenRouter client
openrouter_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)

print("✓ OpenRouter client initialized")

Setting up OpenRouter API...
✓ OpenRouter API key loaded from Kaggle Secrets
✓ OpenRouter client initialized


In [3]:
# --- DATA LOADING ---
print("Loading test data...")

# 🧪 TESTING MODE - Set to True for small scale testing
TESTING_MODE = True  # Change to False for full processing
TEST_SAMPLE_SIZE = 20  # Number of samples for testing

# Find dataset
kaggle_data_paths = [
    "/kaggle/input/data-of-multimodal-sarcasm-detection",
]

data_dir = None
for path in kaggle_data_paths:
    if os.path.exists(path):
        data_dir = path
        print(f"✓ Dataset found at: {data_dir}")
        break

if data_dir is None:
    print("❌ ERROR: Dataset not found!")
    print("Please add the multimodal sarcasm detection dataset to this notebook")
    raise FileNotFoundError("Dataset not found")

def load_test_data(filepath):
    """Load test data efficiently"""
    records = []
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Test file not found: {filepath}")
        
    print(f"Loading {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            try:
                data_list = ast.literal_eval(line.strip())
                if len(data_list) >= 3:
                    records.append({
                        'id': data_list[0], 
                        'text': data_list[1], 
                        'sarcasm': int(data_list[2])
                    })
            except (ValueError, SyntaxError, IndexError):
                continue
    
    print(f"✓ Loaded {len(records)} test records")
    return pd.DataFrame(records)

# Load test data
test_df = load_test_data(os.path.join(data_dir, 'text', 'test2.txt'))

# 🧪 Apply testing mode if enabled
if TESTING_MODE:
    print(f"\n🧪 TESTING MODE ENABLED")
    print(f"Limiting data to {TEST_SAMPLE_SIZE} samples for testing")
    
    # Take first TEST_SAMPLE_SIZE samples for quick testing
    original_size = len(test_df)
    test_df = test_df.head(TEST_SAMPLE_SIZE).copy()
    
    print(f"Data reduced: {original_size} → {len(test_df)} samples")
    print("⚠️  Remember to set TESTING_MODE = False for full processing")
else:
    print(f"\n🚀 FULL PROCESSING MODE")
    print(f"Processing all {len(test_df)} samples")

# Add image paths
image_folder = os.path.join(data_dir, 'dataset_image')
test_df['image_path'] = test_df['id'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))

# Clean data
initial_count = len(test_df)
test_df.dropna(subset=['text'], inplace=True)
test_df['sarcasm'] = test_df['sarcasm'].astype(int)
test_df.drop(test_df[~test_df['image_path'].apply(os.path.exists)].index, inplace=True)
final_count = len(test_df)

print(f"\nData cleaned: {initial_count} → {final_count} samples")
print(f"Sarcastic: {len(test_df[test_df['sarcasm']==1])}")
print(f"Non-sarcastic: {len(test_df[test_df['sarcasm']==0])}")

# Show estimated time based on current data size
estimated_time_minutes = (final_count * 8) / 60
if estimated_time_minutes < 60:
    print(f"⏱️  Estimated processing time: {estimated_time_minutes:.1f} minutes")
else:
    print(f"⏱️  Estimated processing time: {estimated_time_minutes/60:.1f} hours")

Loading test data...
✓ Dataset found at: /kaggle/input/data-of-multimodal-sarcasm-detection
Loading /kaggle/input/data-of-multimodal-sarcasm-detection/text/test2.txt...
✓ Loaded 2409 test records

🧪 TESTING MODE ENABLED
Limiting data to 20 samples for testing
Data reduced: 2409 → 20 samples
⚠️  Remember to set TESTING_MODE = False for full processing

Data cleaned: 20 → 20 samples
Sarcastic: 20
Non-sarcastic: 0
⏱️  Estimated processing time: 2.7 minutes


In [4]:
import os
import time
import pandas as pd

# --- CONFIGURATION ---
TESTING_MODE = True  # Set to True for testing mode

# --- LLM PREPROCESSING FUNCTION ---
def preprocess_with_llm(text):
    """Preprocess text with LLM"""
    if not isinstance(text, str) or not text.strip():
        return ""
        
    try:
        completion = openrouter_client.chat.completions.create(
            model="mistralai/mistral-nemo",
            messages=[
                {"role": "system", "content": "You are an expert text preprocessor for a machine learning model. Your task is to clean and standardize tweet text. Follow these rules strictly:\n1. Correct typos and grammatical errors.\n2. Expand internet slang and abbreviations into standard English (e.g., 'lol' becomes 'laughing out loud').\n3. Convert hashtags into meaningful phrases (e.g., '#nosleep' becomes 'no sleep').\n4. Remove any URLs and mentions like '<user>'.\n5. CRITICALLY IMPORTANT: Preserve the original tone, especially sarcasm or irony. Do not change the underlying meaning.\n6. Your output must ONLY be the final cleaned text, with no extra explanations or chat."},
                {"role": "user", "content": f"Please preprocess the following tweet: \"<user> OMG u kno what i mean?! today is going to be awesome! #nosleep #bestdayever\""},
                {"role": "assistant", "content": "Oh my god, you know what I mean?! Today is going to be awesome! No sleep. Best day ever."},
                {"role": "user", "content": f"Please preprocess the following tweet: \"{text}\""}
            ],
            temperature=0.1,
            max_tokens=150,
        )
        cleaned_text = completion.choices[0].message.content.strip()
        time.sleep(8)  # Rate limiting - 8 seconds between requests
        return cleaned_text
    except Exception as e:
        if "429" in str(e):  # Rate limit
            print(f"Rate limit hit, waiting 60 seconds...")
            time.sleep(60)
            return text  # Return original on rate limit
        else:
            print(f"Error processing text: {e}")
            return text  # Return original on other errors

print("✓ LLM preprocessing function ready")

# --- OUTPUT FILE SETUP ---

# Set output filenames based on mode
if TESTING_MODE:
    OUTPUT_CSV = "test_preprocessed_testing.csv"
    print(f"🧪 Testing mode: Output will be saved to '{OUTPUT_CSV}'")
else:
    OUTPUT_CSV = "test_preprocessed.csv"
    
    # 🔄 SMART RESUME: Check if testing file exists and merge it
    testing_csv = "test_preprocessed_testing.csv"
    if os.path.exists(testing_csv):
        print(f"🔄 Found testing results in '{testing_csv}'")
        print("Smart resume: Will include testing data in full processing")
        
        # Load existing testing data to avoid reprocessing
        existing_df = pd.read_csv(testing_csv)
        print(f"Found {len(existing_df)} preprocessed testing samples")
        
        # Mark testing IDs to skip
        testing_ids = set(existing_df['id'].astype(str))
        print(f"Will skip {len(testing_ids)} already processed samples")
    else:
        testing_ids = set()
        print("No testing data found - processing from scratch")
    
    print(f"🚀 Full mode: Final output will be saved to '{OUTPUT_CSV}'")

print(f"\nOutput file: {OUTPUT_CSV}")

# Resume capability
resume_data = []
processed_ids = set()

if os.path.exists(OUTPUT_CSV):
    print(f"📄 Found existing output file: {OUTPUT_CSV}")
    try:
        existing_df = pd.read_csv(OUTPUT_CSV)
        resume_data = existing_df.to_dict('records')
        processed_ids = set(existing_df['id'].astype(str))
        print(f"✓ Loaded {len(resume_data)} previously processed samples")
        print(f"Remaining to process: {len(test_df) - len(processed_ids)}")
    except Exception as e:
        print(f"⚠️  Error loading existing file: {e}")
        print("Starting fresh...")
        resume_data = []
        processed_ids = set()
else:
    print(f"📝 Creating new output file: {OUTPUT_CSV}")

# For full mode with smart resume, also skip testing IDs
if not TESTING_MODE and testing_ids:
    # Add testing data to resume data if not already there
    if testing_csv and os.path.exists(testing_csv):
        testing_df = pd.read_csv(testing_csv)
        for _, row in testing_df.iterrows():
            if str(row['id']) not in processed_ids:
                resume_data.append(row.to_dict())
                processed_ids.add(str(row['id']))
        print(f"Smart resume: Added {len(testing_df)} testing samples to skip list")

print(f"Total samples to skip: {len(processed_ids)}")
print(f"Samples remaining to process: {len(test_df) - len(processed_ids)}")

✓ LLM preprocessing function ready
🧪 Testing mode: Output will be saved to 'test_preprocessed_testing.csv'

Output file: test_preprocessed_testing.csv
📝 Creating new output file: test_preprocessed_testing.csv
Total samples to skip: 0
Samples remaining to process: 20


In [5]:
# --- BATCH PROCESSING WITH RESUME CAPABILITY ---
output_file = "/kaggle/working/test_processed_complete.csv"
save_frequency = 20  # Save progress every 25 samples

# --- PROCESSING CONFIGURATION ---

# 📊 Batch processing settings
if TESTING_MODE:
    BATCH_SIZE = 10          # Small batches for testing
    SAVE_FREQUENCY = 10      # Save every 5 samples in testing mode
else:
    BATCH_SIZE = 20         # Larger batches for efficiency in full mode
    SAVE_FREQUENCY = 20     # Save every 50 samples in full mode

print(f"⚙️  Batch size: {BATCH_SIZE}")
print(f"💾 Save frequency: every {SAVE_FREQUENCY} samples")
print(f"📊 Total samples to process: {len(test_df) - len(processed_ids)}")

# Check for existing progress
start_idx = 0
processed_data = []

if os.path.exists(output_file):
    try:
        existing_df = pd.read_csv(output_file)
        if 'processed_text' in existing_df.columns:
            processed_data = existing_df.to_dict('records')
            start_idx = len(processed_data)
            print(f"✓ Resuming from index {start_idx} ({start_idx}/{len(test_df)} completed)")
        else:
            print("Existing file found but invalid format, starting fresh")
    except Exception as e:
        print(f"Error reading existing file: {e}, starting fresh")

if start_idx == 0:
    print("Starting fresh preprocessing...")

# Calculate remaining work
remaining_samples = len(test_df) - start_idx
estimated_time_minutes = (remaining_samples * 10) / 60  # 8 seconds per sample
print(f"Remaining samples: {remaining_samples}")
print(f"Estimated time: {estimated_time_minutes:.1f} minutes")

if remaining_samples == 0:
    print("✓ All data already processed!")
else:
    # Process remaining data
    total_processed = len(processed_data)
    start_time = time.time()
    
    for i in tqdm(range(start_idx, len(test_df)), desc="Processing test data"):
        row = test_df.iloc[i]
        
        if str(row['id']) in processed_ids:
            continue
        
        # Create processed row
        processed_row = {
            'id': row['id'],
            'text': row['text'],
            'sarcasm': row['sarcasm'],
            'image_path': row['image_path']
        }
        
        # Process with LLM
        try:
            processed_text = preprocess_with_llm(row['text'])
            processed_row['processed_text'] = processed_text
        except Exception as e:
            print(f"Failed to process sample {i}: {e}")
            processed_row['processed_text'] = row['text']  # Use original
        
        processed_data.append(processed_row)
        total_processed += 1
        
        # Progress update
        if total_processed % 10 == 0:
            elapsed = time.time() - start_time
            rate = total_processed / elapsed if elapsed > 0 else 0
            remaining = len(test_df) - len(processed_ids) - (total_processed - len(processed_ids))
            eta = remaining / rate if rate > 0 else 0
            print(f"✓ Processed {total_processed}/{len(test_df)} | Rate: {rate:.1f}/min | ETA: {eta/60:.1f}min")
        
        # Save intermediate results
        if total_processed % SAVE_FREQUENCY == 0:
            temp_df = pd.DataFrame(processed_data)
            temp_df.to_csv(output_file, index=False)
            print(f"💾 Saved {len(temp_df)} samples to {output_file}")
            print(f"⏱️  Estimated remaining: {((len(test_df) - len(processed_data)) * 8) / 60:.1f} minutes")

# Final save
final_df = pd.DataFrame(processed_data)
final_df.to_csv(output_file, index=False)

elapsed_time = time.time() - start_time
print(f"\n🎉 TEST DATA PREPROCESSING COMPLETED!")
print(f"📄 Output file: {output_file}")
print(f"📊 Total samples processed: {len(final_df)}")
print(f"💾 File size: {os.path.getsize(output_file) / (1024*1024):.1f} MB")
print(f"⏰ Total time: {elapsed_time/60:.1f} minutes")
print(f"🚀 Average rate: {len(final_df)/(elapsed_time/60):.1f} samples/minute")
print("\n✅ Ready for download and use in training notebook!")

# Display sample results
print(f"\n📋 Sample of processed data:")
print(final_df[['id', 'sarcasm', 'text', 'processed_text']].head())

⚙️  Batch size: 10
💾 Save frequency: every 10 samples
📊 Total samples to process: 20
Starting fresh preprocessing...
Remaining samples: 20
Estimated time: 3.3 minutes


Processing test data:   0%|          | 0/20 [00:00<?, ?it/s]

✓ Processed 10/20 | Rate: 0.1/min | ETA: 1.6min
💾 Saved 10 samples to /kaggle/working/test_processed_complete.csv
⏱️  Estimated remaining: 1.3 minutes
✓ Processed 20/20 | Rate: 0.1/min | ETA: 0.0min
💾 Saved 20 samples to /kaggle/working/test_processed_complete.csv
⏱️  Estimated remaining: 0.0 minutes

🎉 TEST DATA PREPROCESSING COMPLETED!
📄 Output file: /kaggle/working/test_processed_complete.csv
📊 Total samples processed: 20
💾 File size: 0.0 MB
⏰ Total time: 3.3 minutes
🚀 Average rate: 6.0 samples/minute

✅ Ready for download and use in training notebook!

📋 Sample of processed data:
                   id  sarcasm  \
0  862902619928506372        1   
1  892551658487631873        1   
2  853143461360480256        1   
3  918423568823840768        1   
4  731617467718610944        1   

                                                text  \
0  i am guessing # netflix no longer lets you gra...   
1  it 's the insensitive strikeouts at suntrust p...   
2  following the path of the river c

In [6]:
# --- VERIFICATION & SUMMARY ---
print("=== FINAL VERIFICATION ===")

# Load and verify final file
final_df = pd.read_csv(output_file)

print(f"📊 Dataset Summary:")
print(f"   Total samples: {len(final_df)}")
print(f"   Sarcastic: {len(final_df[final_df['sarcasm']==1])}")
print(f"   Non-sarcastic: {len(final_df[final_df['sarcasm']==0])}")
print(f"   Missing processed_text: {final_df['processed_text'].isna().sum()}")

# Show sample of processed data
print(f"\n📝 Sample of processed data:")
for i in range(min(3, len(final_df))):
    row = final_df.iloc[i]
    print(f"\nSample {i+1}:")
    print(f"  Original:  {row['text'][:100]}...")
    print(f"  Processed: {row['processed_text'][:100]}...")
    print(f"  Label: {row['sarcasm']}")

print(f"\n🎯 MISSION ACCOMPLISHED - PERSON 3 TASK COMPLETE!")
print(f"📤 Download the file and share with team for training phase")

=== FINAL VERIFICATION ===
📊 Dataset Summary:
   Total samples: 20
   Sarcastic: 20
   Non-sarcastic: 0
   Missing processed_text: 0

📝 Sample of processed data:

Sample 1:
  Original:  i am guessing # netflix no longer lets you grab screens of movies . that & the new rating system is ...
  Processed: I'm guessing Netflix no longer allows you to take screenshots of movies. That, and the new rating sy...
  Label: 1

Sample 2:
  Original:  it 's the insensitive strikeouts at suntrust park .  # braves # chopchop...
  Processed: It's the insensitive strikeouts at SunTrust Park. Braves. Chop chop....
  Label: 1

Sample 3:
  Original:  following the path of the river calder , so ... grim ... up .... north ...
  Processed: Following the path of the River Calder, so grim up north...
  Label: 1

🎯 MISSION ACCOMPLISHED - PERSON 3 TASK COMPLETE!
📤 Download the file and share with team for training phase
