In [2]:
# --- ENVIRONMENT & LIBRARY SETUP ---
import os
import sys
import pandas as pd
import time
from tqdm.notebook import tqdm
import ast

print("=== VALIDATION DATA PREPROCESSING SETUP ===")
print(f"Python version: {sys.version.split()[0]}")
print(f"Working directory: {os.getcwd()}")

# Validate Kaggle environment
assert '/kaggle/' in os.getcwd(), "This notebook must run in Kaggle environment!"
print("✓ Confirmed running in Kaggle environment")

# Install required packages
print("Installing OpenAI package...")
!pip install -q openai

from openai import OpenAI
print("✓ Libraries imported successfully")

=== VALIDATION DATA PREPROCESSING SETUP ===
Python version: 3.11.11
Working directory: /kaggle/working
✓ Confirmed running in Kaggle environment
Installing OpenAI package...
✓ Libraries imported successfully


In [3]:
# --- API SETUP ---
print("Setting up OpenRouter API...")

try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("OPENROUTER_API_KEY")
    if api_key:
        print("✓ OpenRouter API key loaded from Kaggle Secrets")
    else:
        raise ValueError("No API key found")
except Exception as e:
    print(f"❌ ERROR: Could not load API key: {e}")
    print("Please add OPENROUTER_API_KEY to Kaggle Secrets")
    raise

# Initialize OpenRouter client
openrouter_client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)

print("✓ OpenRouter client initialized")

Setting up OpenRouter API...
✓ OpenRouter API key loaded from Kaggle Secrets
✓ OpenRouter client initialized


In [4]:
# --- DATA LOADING ---
print("Loading validation data...")

# 🧪 TESTING MODE - Set to True for small scale testing
TESTING_MODE = False  # Change to False for full processing
TEST_SAMPLE_SIZE = 10  # Number of samples for testing

# 📊 BATCH PROCESSING CONFIGURATION
# Set these parameters to process specific ranges of data
ENABLE_RANGE_PROCESSING = False  # Set to True to enable range processing
START_ROW = 0       # Start from this row (0-based index)
END_ROW = 500      # End at this row (exclusive, so this processes rows 0-999)

# Find dataset
kaggle_data_paths = [
    "/kaggle/input/data-of-multimodal-sarcasm-detection",
]

data_dir = None
for path in kaggle_data_paths:
    if os.path.exists(path):
        data_dir = path
        print(f"✓ Dataset found at: {data_dir}")
        break

if data_dir is None:
    print("❌ ERROR: Dataset not found!")
    print("Please add the multimodal sarcasm detection dataset to this notebook")
    raise FileNotFoundError("Dataset not found")

def load_validation_data(filepath):
    """Load validation data efficiently"""
    records = []
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Validation file not found: {filepath}")
        
    print(f"Loading {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            try:
                data_list = ast.literal_eval(line.strip())
                if len(data_list) >= 3:
                    records.append({
                        'id': data_list[0], 
                        'text': data_list[1], 
                        'sarcasm': int(data_list[2])
                    })
            except (ValueError, SyntaxError, IndexError):
                continue
    
    print(f"✓ Loaded {len(records)} validation records")
    return pd.DataFrame(records)

# Load validation data - try both valid.txt and valid2.txt
validation_files = ['valid2.txt']
val_df = None

for filename in validation_files:
    filepath = os.path.join(data_dir, 'text', filename)
    if os.path.exists(filepath):
        print(f"📁 Found validation file: {filename}")
        val_df = load_validation_data(filepath)
        break

if val_df is None:
    raise FileNotFoundError("No validation file found! Expected 'valid.txt' or 'valid2.txt'")

# 🧪 Apply testing mode if enabled
if TESTING_MODE:
    print(f"\n🧪 TESTING MODE ENABLED")
    print(f"Limiting data to {TEST_SAMPLE_SIZE} samples for testing")
    
    # Take first TEST_SAMPLE_SIZE samples for quick testing
    original_size = len(val_df)
    val_df = val_df.head(TEST_SAMPLE_SIZE).copy()
    
    print(f"Data reduced: {original_size} → {len(val_df)} samples")
    print("⚠️  Remember to set TESTING_MODE = False for full processing")
elif ENABLE_RANGE_PROCESSING:
    print(f"\n📊 RANGE PROCESSING MODE ENABLED")
    original_size = len(val_df)
    
    # Handle negative END_ROW (means process till end)
    actual_end_row = len(val_df) if END_ROW == -1 else min(END_ROW, len(val_df))
    actual_start_row = max(0, START_ROW)
    
    print(f"Processing rows {actual_start_row} to {actual_end_row-1}")
    print(f"Total samples in this batch: {actual_end_row - actual_start_row}")
    
    # Slice the dataframe to specified range
    val_df = val_df.iloc[actual_start_row:actual_end_row].copy()
    val_df.reset_index(drop=True, inplace=True)
    
    print(f"Data filtered: {original_size} → {len(val_df)} samples")
    print(f"📁 Output will include batch info: batch_{actual_start_row}_{actual_end_row}")
else:
    print(f"\n🚀 FULL PROCESSING MODE")
    print(f"Processing all {len(val_df)} samples")

# Add image paths
image_folder = os.path.join(data_dir, 'dataset_image')
val_df['image_path'] = val_df['id'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))

# Clean data with detailed logging
initial_count = len(val_df)
print(f"\n🧹 CLEANING DATA:")
print(f"Initial count: {initial_count}")

# Check for missing text
text_before = len(val_df)
val_df.dropna(subset=['text'], inplace=True)
text_after = len(val_df)
text_dropped = text_before - text_after
print(f"Dropped {text_dropped} rows with missing text ({text_dropped/text_before*100:.1f}%)")

# Convert sarcasm to int
val_df['sarcasm'] = val_df['sarcasm'].astype(int)

# Check for missing images
image_before = len(val_df)
missing_images = []
for idx, row in val_df.iterrows():
    if not os.path.exists(row['image_path']):
        missing_images.append(row['id'])

if missing_images:
    print(f"Found {len(missing_images)} missing image files")
    print(f"Sample missing images: {missing_images[:5]}...")
    
val_df.drop(val_df[~val_df['image_path'].apply(os.path.exists)].index, inplace=True)
image_after = len(val_df)
image_dropped = image_before - image_after
print(f"Dropped {image_dropped} rows with missing images ({image_dropped/image_before*100:.1f}%)")

final_count = len(val_df)

print(f"\nData cleaned: {initial_count} → {final_count} samples")
print(f"Sarcastic: {len(val_df[val_df['sarcasm']==1])}")
print(f"Non-sarcastic: {len(val_df[val_df['sarcasm']==0])}")

# Show estimated time based on current data size
estimated_time_minutes = (final_count * 8) / 60
if estimated_time_minutes < 60:
    print(f"⏱️  Estimated processing time: {estimated_time_minutes:.1f} minutes")
else:
    print(f"⏱️  Estimated processing time: {estimated_time_minutes/60:.1f} hours")

Loading validation data...
✓ Dataset found at: /kaggle/input/data-of-multimodal-sarcasm-detection
📁 Found validation file: valid2.txt
Loading /kaggle/input/data-of-multimodal-sarcasm-detection/text/valid2.txt...
✓ Loaded 2410 validation records

🧪 TESTING MODE ENABLED
Limiting data to 10 samples for testing
Data reduced: 2410 → 10 samples
⚠️  Remember to set TESTING_MODE = False for full processing

🧹 CLEANING DATA:
Initial count: 10
Dropped 0 rows with missing text (0.0%)
Dropped 0 rows with missing images (0.0%)

Data cleaned: 10 → 10 samples
Sarcastic: 10
Non-sarcastic: 0
⏱️  Estimated processing time: 1.3 minutes


In [5]:
# --- LLM PREPROCESSING FUNCTION ---
def preprocess_with_llm(text):
    """Preprocess text with LLM"""
    if not isinstance(text, str) or not text.strip():
        return ""
        
    try:
        completion = openrouter_client.chat.completions.create(
            model="mistralai/mistral-nemo",
            messages=[
                {"role": "system", "content": "You are an expert text preprocessor for a machine learning model. Your task is to clean and standardize tweet text. Follow these rules strictly:\n1. Correct typos and grammatical errors.\n2. Expand internet slang and abbreviations into standard English (e.g., 'lol' becomes 'laughing out loud').\n3. Convert hashtags into meaningful phrases (e.g., '#nosleep' becomes 'no sleep').\n4. Remove any URLs and mentions like '<user>'.\n5. CRITICALLY IMPORTANT: Preserve the original tone, especially sarcasm or irony. Do not change the underlying meaning.\n6. Your output must ONLY be the final cleaned text, with no extra explanations or chat."},
                {"role": "user", "content": f"Please preprocess the following tweet: \"<user> OMG u kno what i mean?! today is going to be awesome! #nosleep #bestdayever\""},
                {"role": "assistant", "content": "Oh my god, you know what I mean?! Today is going to be awesome! No sleep. Best day ever."},
                {"role": "user", "content": f"Please preprocess the following tweet: \"{text}\""}
            ],
            temperature=0.1,
            max_tokens=150,
        )
        cleaned_text = completion.choices[0].message.content.strip()
        time.sleep(4)  # Rate limiting - 8 seconds between requests
        return cleaned_text
    except Exception as e:
        if "429" in str(e):  # Rate limit
            print(f"Rate limit hit, waiting 60 seconds...")
            time.sleep(60)
            return text  # Return original on rate limit
        else:
            print(f"Error processing text: {e}")
            return text  # Return original on other errors

print("✓ LLM preprocessing function ready")

# --- OUTPUT FILE SETUP ---

# Set output filenames based on mode
if TESTING_MODE:
    OUTPUT_CSV = "validation_preprocessed_testing.csv"
    print(f"🧪 Testing mode: Output will be saved to '{OUTPUT_CSV}'")
else:
    OUTPUT_CSV = "validation_preprocessed.csv"
    print(f"🚀 Full mode: Final output will be saved to '{OUTPUT_CSV}'")

print(f"\nOutput file: {OUTPUT_CSV}")

✓ LLM preprocessing function ready
🧪 Testing mode: Output will be saved to 'validation_preprocessed_testing.csv'

Output file: validation_preprocessed_testing.csv


In [6]:
# --- BATCH PROCESSING WITH RESUME CAPABILITY ---

# Set output filename and save frequency based on mode
if TESTING_MODE:
    output_file = "/kaggle/working/validation_processed_TEST.csv"
    save_frequency = 5  # Save every 5 samples for testing
    print(f"🧪 Testing mode: Output will be saved as validation_processed_TEST.csv")
elif ENABLE_RANGE_PROCESSING:
    # Create filename with batch info
    actual_end_row = len(val_df) + START_ROW if END_ROW == -1 else END_ROW
    batch_name = f"batch_{START_ROW}_{actual_end_row}"
    output_file = f"/kaggle/working/validation_processed_{batch_name}.csv"
    save_frequency = 20  # Save every 20 samples for batch processing
    print(f"📊 Range mode: Output will be saved as validation_processed_{batch_name}.csv")
    print(f"📁 Processing samples {START_ROW} to {actual_end_row-1}")
else:
    output_file = "/kaggle/working/validation_processed_complete.csv"
    save_frequency = 20  # Save every 20 samples for full processing
    print(f"🚀 Full mode: Output will be saved as validation_processed_complete.csv")

# Check for existing progress
start_idx = 0
processed_data = []

# For full mode, check if we can resume from testing mode results
if not TESTING_MODE:
    test_file = "/kaggle/working/validation_processed_TEST.csv"
    if os.path.exists(test_file) and not os.path.exists(output_file):
        try:
            test_df = pd.read_csv(test_file)
            if 'processed_text' in test_df.columns and len(test_df) > 0:
                print(f"🔄 Found testing results ({len(test_df)} samples)")
                print(f"📋 These will be included in full processing to avoid reprocessing")
                processed_data = test_df.to_dict('records')
                start_idx = len(processed_data)
        except Exception as e:
            print(f"Could not load test results: {e}")

# Check for existing progress in main output file
if os.path.exists(output_file):
    try:
        existing_df = pd.read_csv(output_file)
        if 'processed_text' in existing_df.columns:
            # If we don't have any data yet, load from existing file
            if len(processed_data) == 0:
                processed_data = existing_df.to_dict('records')
                start_idx = len(processed_data)
            # If existing file has more data than our current progress, use it
            elif len(existing_df) > len(processed_data):
                processed_data = existing_df.to_dict('records')
                start_idx = len(processed_data)
            
            print(f"✓ Resuming from index {start_idx} ({start_idx}/{len(val_df)} completed)")
        else:
            print("Existing file found but invalid format, starting fresh")
    except Exception as e:
        print(f"Error reading existing file: {e}, starting fresh")

if start_idx == 0:
    print("Starting fresh preprocessing...")
elif start_idx > 0 and not TESTING_MODE:
    print(f"📈 Will continue processing from sample {start_idx+1} to {len(val_df)}")

# Calculate remaining work
remaining_samples = len(val_df) - start_idx
estimated_time_minutes = (remaining_samples * 8) / 60  # 8 seconds per sample
print(f"Remaining samples: {remaining_samples}")

if estimated_time_minutes < 60:
    print(f"Estimated time: {estimated_time_minutes:.1f} minutes")
else:
    print(f"Estimated time: {estimated_time_minutes/60:.1f} hours")

if remaining_samples == 0:
    print("✓ All data already processed!")
else:
    # Process remaining data
    total_processed = len(processed_data)
    start_time = time.time()
    
    for i in tqdm(range(start_idx, len(val_df)), desc="Processing validation data"):
        row = val_df.iloc[i]
        
        # Create processed row
        processed_row = {
            'id': row['id'],
            'text': row['text'],
            'sarcasm': row['sarcasm'],
            'image_path': row['image_path']
        }
        
        # Process with LLM
        try:
            processed_text = preprocess_with_llm(row['text'])
            processed_row['processed_text'] = processed_text
        except Exception as e:
            print(f"Failed to process sample {i}: {e}")
            processed_row['processed_text'] = row['text']  # Use original
        
        processed_data.append(processed_row)
        total_processed += 1
        
        # Progress update - UBAH ANGKA INI UNTUK MENGATUR FREKUENSI PROGRESS
        if total_processed % 10 == 0:  # ← Ganti 10 dengan angka lain (50, 100, dll)
            elapsed = time.time() - start_time
            rate = total_processed / elapsed if elapsed > 0 else 0
            remaining = len(val_df) - len(processed_data)
            eta = remaining / rate if rate > 0 else 0
            print(f"✓ Processed {total_processed}/{len(val_df)} | Rate: {rate:.1f}/min | ETA: {eta/60:.1f}min")
        
        # Save progress periodically
        if (i + 1) % save_frequency == 0:
            temp_df = pd.DataFrame(processed_data)
            temp_df.to_csv(output_file, index=False)
            print(f"\n💾 Progress saved: {len(processed_data)}/{len(val_df)} samples")
            
            remaining_after_save = len(val_df) - len(processed_data)
            time_remaining_minutes = (remaining_after_save * 8) / 60
            
            if time_remaining_minutes < 60:
                print(f"⏱️  Estimated remaining: {time_remaining_minutes:.1f} minutes")
            else:
                print(f"⏱️  Estimated remaining: {time_remaining_minutes/60:.1f} hours")

# Final save
final_df = pd.DataFrame(processed_data)
final_df.to_csv(output_file, index=False)

elapsed_time = time.time() - start_time
print(f"\n🎉 PREPROCESSING COMPLETED!")
print(f"📄 Output file: {output_file}")
print(f"📊 Total samples processed: {len(final_df)}")
print(f"💾 File size: {os.path.getsize(output_file) / (1024*1024):.2f} MB")
print(f"⏰ Total time: {elapsed_time/60:.1f} minutes")
print(f"🚀 Average rate: {len(final_df)/(elapsed_time/60):.1f} samples/minute")

if TESTING_MODE:
    print(f"\n🧪 TESTING COMPLETE!")
    print(f"✅ CSV file created successfully with {len(final_df)} samples")
    print(f"🔄 To process full data: Set TESTING_MODE = False in cell 4")
    print(f"📋 Your test results will be automatically included in full processing!")
elif ENABLE_RANGE_PROCESSING:
    actual_end_row = len(final_df) + START_ROW
    print(f"\n📊 BATCH PROCESSING COMPLETE!")
    print(f"✅ Batch {START_ROW}-{actual_end_row-1} processed successfully")
    print(f"📁 File: validation_processed_batch_{START_ROW}_{actual_end_row}.csv")
    print(f"🔄 To process next batch: Update START_ROW and END_ROW in cell 4")
    print(f"📋 After all batches: Use merge script to combine all batch files")
    print(f"\n💡 Next batch suggestion:")
    print(f"   START_ROW = {actual_end_row}")
    print(f"   END_ROW = {actual_end_row + 1000}  # or -1 for remaining data")
else:
    print(f"\n✅ Ready for download and use in training notebook!")

# Display sample results
print(f"\n📋 Sample of processed data:")
print(final_df[['id', 'sarcasm', 'text', 'processed_text']].head())

🧪 Testing mode: Output will be saved as validation_processed_TEST.csv
Starting fresh preprocessing...
Remaining samples: 10
Estimated time: 1.3 minutes


Processing validation data:   0%|          | 0/10 [00:00<?, ?it/s]


💾 Progress saved: 5/10 samples
⏱️  Estimated remaining: 0.7 minutes
✓ Processed 10/10 | Rate: 0.1/min | ETA: 0.0min

💾 Progress saved: 10/10 samples
⏱️  Estimated remaining: 0.0 minutes

🎉 PREPROCESSING COMPLETED!
📄 Output file: /kaggle/working/validation_processed_TEST.csv
📊 Total samples processed: 10
💾 File size: 0.00 MB
⏰ Total time: 1.6 minutes
🚀 Average rate: 6.3 samples/minute

🧪 TESTING COMPLETE!
✅ CSV file created successfully with 10 samples
🔄 To process full data: Set TESTING_MODE = False in cell 4
📋 Your test results will be automatically included in full processing!

📋 Sample of processed data:
                   id  sarcasm  \
0  915657464401580032        1   
1  854678856724340736        1   
2  904892917277274112        1   
3  855466461296504832        1   
4  927373534652805120        1   

                                                text  \
0  whew ... that extra <num> miles today to the g...   
1  " oh , good . now no one will know we 're here...   
2  how much

In [7]:
# --- VERIFICATION & SUMMARY ---
print("=== FINAL VERIFICATION ===")

# Load and verify final file
final_df = pd.read_csv(output_file)

print(f"📊 Dataset Summary:")
print(f"   Total samples: {len(final_df)}")
print(f"   Sarcastic: {len(final_df[final_df['sarcasm']==1])}")
print(f"   Non-sarcastic: {len(final_df[final_df['sarcasm']==0])}")
print(f"   Missing processed_text: {final_df['processed_text'].isna().sum()}")

# Show sample of processed data
print(f"\n📝 Sample of processed data:")
for i in range(min(3, len(final_df))):
    row = final_df.iloc[i]
    print(f"\nSample {i+1}:")
    print(f"  Original:  {row['text'][:100]}...")
    print(f"  Processed: {row['processed_text'][:100]}...")
    print(f"  Label: {row['sarcasm']}")

print(f"\n🎯 MISSION ACCOMPLISHED - PERSON 2 TASK COMPLETE!")
print(f"📤 Download the file and share with team for training phase")

=== FINAL VERIFICATION ===
📊 Dataset Summary:
   Total samples: 10
   Sarcastic: 10
   Non-sarcastic: 0
   Missing processed_text: 0

📝 Sample of processed data:

Sample 1:
  Original:  whew ... that extra <num> miles today to the grocery store & back wore me out .  so why are you usin...
  Processed: Whew... That extra 5 miles today to the grocery store and back wore me out. So why are you using a c...
  Label: 1

Sample 2:
  Original:  " oh , good . now no one will know we 're here . " # remingtonsteele # piercebrosnan  # delivery # j...
  Processed: Oh, good. Now no one will know we're here....
  Label: 1

Sample 3:
  Original:  how much of it you think is true ? has this become real ? well today 's food for brain . # thoughts ...
  Processed: How much of it do you think is true? Has this become real? Well, today's food for thought....
  Label: 1

🎯 MISSION ACCOMPLISHED - PERSON 2 TASK COMPLETE!
📤 Download the file and share with team for training phase
