# COVID Tweet Synthetic Data Generation

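This notebook generates synthetic fake tweets from real COVID-related tweets: it extracts the factual claims from each tweet, alters them, and asks an LLM to rewrite the tweet around the altered facts.
The process mirrors the one used for news articles, adapted to the shorter, more informal format of tweets.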

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
from datetime import datetime
import re

# Import custom modules
from src.data_generation.llm_client import create_llm_client, SyntheticDataResult
from src.utils.data_utils import load_config, load_raw_data, save_processed_data
from src.utils.text_preprocessing import TextPreprocessor

# Set up plotting
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-prefixed 'seaborn' name. Pick whichever this
# installation provides instead of failing with OSError on older versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Load configuration and environment
# `load_config` is a project helper imported from src.utils.data_utils. The
# resulting `config` is later indexed as config['llm'][<provider>][...].
config = load_config()

# NOTE(review): the .env file is loaded only after load_config() has run. If
# load_config() reads environment variables (e.g. API keys), this order would
# hide them from it — confirm, and call load_dotenv() first if so.
from dotenv import load_dotenv
load_dotenv()

print("Configuration and environment loaded.")

## Load COVID Tweet Data

In [None]:
# Load COVID tweets data
#
# Outcomes:
#   * load succeeds with rows   -> show a short overview of the dataset
#   * load succeeds but empty   -> fall back to four hand-written sample tweets
#   * anything raises           -> report the error and continue with an empty
#                                  DataFrame (later cells then skip their work)
try:
    tweets_df = load_raw_data("tweets")
    print(f"Loaded {len(tweets_df)} tweets")

    if len(tweets_df) > 0:
        print("\nDataset info:")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        tweets_df.info()
        print("\nFirst few rows:")
        print(tweets_df.head())
    else:
        print("No tweet data found. Creating sample data for demonstration...")

        # Create sample COVID tweets
        sample_tweets = [
            {
                'text': 'Breaking: New COVID variant found in 500 cases across London. Scientists say its 80% more contagious than Delta. Hospitals preparing for surge. #COVID19 #Health',
                'user': 'health_reporter',
                'date': '2024-01-15',
                'retweets': 1250,
                'likes': 3400
            },
            {
                'text': 'Just got my 4th COVID shot! Studies show 96% protection rate. Feel great and ready for winter season. Everyone should get vaccinated! 💉 #VaccinesWork',
                'user': 'medical_student',
                'date': '2024-01-12',
                'retweets': 89,
                'likes': 456
            },
            {
                'text': 'COVID deaths down 60% this month in our county. Mask mandate lifted yesterday. Local ICU capacity back to normal levels. Great news! 😷➡️😊',
                'user': 'county_health',
                'date': '2024-01-10',
                'retweets': 567,
                'likes': 1890
            },
            {
                'text': 'My grandmother, 85, recovered from COVID in just 5 days with new treatment. Doctors said its 90% effective for elderly patients. Amazing progress! ❤️',
                'user': 'grateful_family',
                'date': '2024-01-08',
                'retweets': 234,
                'likes': 892
            }
        ]

        tweets_df = pd.DataFrame(sample_tweets)
        print(f"Created sample dataset with {len(tweets_df)} tweets for demonstration.")

except Exception as e:
    # Notebook-level boundary: keep the session usable even if loading fails.
    print(f"Error loading data: {e}")
    tweets_df = pd.DataFrame()

## Tweet-Specific Processing Functions

In [None]:
def clean_tweet_text(text):
    """Clean tweet text while preserving hashtags and mentions for context.

    URLs (``http://``, ``https://`` and bare ``www.`` links) are removed and
    every run of whitespace is collapsed to a single space.

    Args:
        text: Raw tweet text. Non-string values, such as ``None`` or the
            ``float('nan')`` pandas uses for missing cells, are treated as
            empty text.

    Returns:
        The cleaned text with leading/trailing whitespace stripped; ``""``
        for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Require "://" after the scheme and a word boundary plus "." around
    # "www" so ordinary words that merely contain those letters (e.g.
    # "awwww") are left intact.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)

    # Clean extra whitespace (also closes the gaps left by removed URLs)
    return re.sub(r'\s+', ' ', text).strip()

def extract_tweet_facts(llm_client, tweet_text, num_facts=2):
    """Ask the LLM for the key verifiable claims contained in a tweet.

    Args:
        llm_client: Client wrapper. This function reads its ``client``,
            ``model_name``, ``temperature`` and ``max_tokens`` attributes and
            calls its ``_parse_facts_from_response`` method.
        tweet_text: Tweet text to analyse.
        num_facts: Number of facts requested in the prompt; also the maximum
            number of facts returned.

    Returns:
        The first ``num_facts`` items of the parsed facts, or an empty list
        when the request or the parsing raised (the error is printed).
    """
    prompt = f"""
    Extract exactly {num_facts} key factual claims from this COVID-related tweet. 
    Focus on specific, verifiable facts (numbers, percentages, locations, medical claims, etc.).
    Ignore opinions, emotions, and general statements. Return only the facts as a numbered list.
    
    Tweet: {tweet_text}
    
    Facts:
    """

    try:
        completions_api = llm_client.client.chat.completions
        completion = completions_api.create(
            model=llm_client.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=llm_client.temperature,
            max_tokens=llm_client.max_tokens,
        )

        # Only the first choice is used; its text is parsed by the client.
        reply_text = completion.choices[0].message.content.strip()
        parsed_facts = llm_client._parse_facts_from_response(reply_text)

        # The model may return more items than requested; cap the result.
        return parsed_facts[:num_facts]
    except Exception as e:
        print(f"Error extracting facts from tweet: {e}")
        return []

def generate_synthetic_tweet(llm_client, original_tweet, modified_facts):
    """Have the LLM rewrite a tweet so that it states the modified facts.

    Args:
        llm_client: Client wrapper. This function reads its ``client``,
            ``model_name``, ``temperature`` and ``max_tokens`` attributes.
        original_tweet: Text of the tweet to rewrite.
        modified_facts: Iterable of altered facts; each is listed in the
            prompt as a ``- `` bullet.

    Returns:
        The stripped text of the first completion choice. If the request
        raises, the error is printed and ``original_tweet`` is returned
        unchanged.
    """
    # One "- fact" bullet per line, joined with newlines.
    fact_bullets = "\n".join([f"- {fact}" for fact in modified_facts])

    prompt = f"""
    Rewrite this COVID-related tweet to incorporate the modified facts while maintaining:
    - Tweet-like language and tone
    - Similar length (under 280 characters)
    - Hashtags and emojis where appropriate
    - Believable but false information
    
    Original tweet: {original_tweet}
    
    Modified facts to incorporate:
    {fact_bullets}
    
    Rewritten tweet:
    """

    try:
        completions_api = llm_client.client.chat.completions
        completion = completions_api.create(
            model=llm_client.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=llm_client.temperature,
            max_tokens=llm_client.max_tokens,
        )
        rewritten = completion.choices[0].message.content
        return rewritten.strip()
    except Exception as e:
        print(f"Error generating synthetic tweet: {e}")
        return original_tweet

print("Tweet processing functions defined.")

## Initialize LLM Client and Process Tweets

In [None]:
# Initialize LLM client
# Provider key used to select the settings under config['llm'].
llm_provider = "openai"

try:
    # Look the provider's settings up once; a missing key is reported by the
    # handler below just like a client construction failure.
    provider_settings = config['llm'][llm_provider]
    llm_client = create_llm_client(
        provider=llm_provider,
        model_name=provider_settings['model'],
        temperature=provider_settings['temperature'],
        max_tokens=provider_settings['max_tokens'],
    )
except Exception as e:
    print(f"Error initializing LLM client: {e}")
    # Later cells check for None and skip generation.
    llm_client = None
else:
    print(f"Successfully initialized {llm_provider} client for tweets")

In [None]:
# Process tweets to generate synthetic versions
#
# For every tweet: clean the text, extract facts, modify them, and ask the LLM
# to rewrite the tweet around the modified facts. Each successful rewrite is
# stored as a SyntheticDataResult in `synthetic_tweet_results`.
if llm_client is not None and len(tweets_df) > 0:

    synthetic_tweet_results = []
    facts_per_tweet = 2  # Usually fewer facts per tweet due to length

    # Process all sample tweets
    sample_size = len(tweets_df)

    print(f"Processing {sample_size} tweets...")

    # Number tweets by position: the DataFrame index may be non-numeric or
    # non-contiguous, so it cannot be used for the "tweet N" progress message.
    progress = tqdm(tweets_df.iterrows(), total=sample_size)
    for position, (_, row) in enumerate(progress, start=1):
        try:
            # Cleaning happens inside the try so that a single malformed row
            # (e.g. a missing text cell) is reported and skipped instead of
            # aborting the whole run.
            tweet_text = clean_tweet_text(row['text'])

            print(f"\nProcessing tweet {position}: {tweet_text[:50]}...")

            if not tweet_text:
                print("  ⚠️ Empty tweet text, skipping...")
                continue

            # Step 1: Extract facts from tweet
            original_facts = extract_tweet_facts(llm_client, tweet_text, facts_per_tweet)

            if not original_facts:
                print("  ⚠️ No facts extracted, skipping...")
                continue

            print(f"  ✓ Extracted {len(original_facts)} facts")

            # Step 2: Modify facts
            modified_facts = llm_client.modify_facts(original_facts)
            print("  ✓ Modified facts")

            # Step 3: Generate synthetic tweet
            synthetic_tweet = generate_synthetic_tweet(llm_client, tweet_text, modified_facts)

            # generate_synthetic_tweet falls back to returning the original
            # text when the LLM call fails. Storing that (or an empty reply)
            # would later label a real/blank tweet as 'fake'.
            if not synthetic_tweet or synthetic_tweet == tweet_text:
                print("  ⚠️ Synthetic tweet generation failed, skipping...")
                continue

            print("  ✓ Generated synthetic tweet")

            # Store result
            result = SyntheticDataResult(
                original_text=tweet_text,
                original_facts=original_facts,
                modified_facts=modified_facts,
                synthetic_text=synthetic_tweet,
                generation_metadata={
                    'timestamp': datetime.now().isoformat(),
                    'model': llm_client.model_name,
                    'provider': llm_provider,
                    'type': 'tweet',
                    'original_user': row.get('user', 'unknown'),
                    'original_engagement': {
                        'retweets': row.get('retweets', 0),
                        'likes': row.get('likes', 0)
                    }
                }
            )

            synthetic_tweet_results.append(result)

        except Exception as e:
            # Per-tweet boundary: report and move on to the next tweet.
            print(f"  ✗ Error processing tweet: {e}")
            continue

    print(f"\nCompleted processing. Generated {len(synthetic_tweet_results)} synthetic tweets.")

else:
    print("Skipping synthetic tweet generation due to missing LLM client or data.")
    synthetic_tweet_results = []

## Results Display and Analysis

In [None]:
# Display synthetic tweet results
# Prints, for every generated pair: the original tweet, the extracted facts,
# the modified facts and the synthetic tweet (with character lengths).
if not synthetic_tweet_results:
    print("No synthetic tweet results to display.")
else:
    banner = "=" * 60

    print("SYNTHETIC TWEET GENERATION RESULTS")
    print(banner)

    for pair_number, result in enumerate(synthetic_tweet_results, start=1):
        print(f"\n🐦 TWEET PAIR {pair_number}")
        print("-" * 40)

        print("\n📱 ORIGINAL TWEET:")
        print(f"  {result.original_text}")
        print(f"  [Length: {len(result.original_text)} chars]")

        # Both fact lists are rendered the same way: heading, then a
        # 1-based numbered list.
        fact_sections = (
            ("\n🔍 EXTRACTED FACTS:", result.original_facts),
            ("\n🔄 MODIFIED FACTS:", result.modified_facts),
        )
        for heading, facts in fact_sections:
            print(heading)
            for number, fact in enumerate(facts, 1):
                print(f"  {number}. {fact}")

        print("\n🤖 SYNTHETIC TWEET:")
        print(f"  {result.synthetic_text}")
        print(f"  [Length: {len(result.synthetic_text)} chars]")

        print("\n" + banner)

In [None]:
# Analyze tweet characteristics
#
# Builds one row per original/synthetic pair (length, word, hashtag, mention
# and emoji counts plus the number of extracted facts), prints summary
# statistics and draws a 2x3 grid of comparison plots.
if synthetic_tweet_results:
    # Compile the patterns once instead of re-parsing them for every tweet.
    hashtag_pattern = re.compile(r'#\w+')
    mention_pattern = re.compile(r'@\w+')
    # Emoji character class. Besides the ranges used previously it also covers
    # U+2600-27BF and U+1F900-1FAFF, so that common emojis such as ❤, ➡ and 🤖
    # are no longer missed. Each matched code point counts as one emoji.
    emoji_pattern = re.compile(
        '['
        '\U0001F1C0-\U0001F1FF'  # flag (regional indicator) letters
        '\U0001F300-\U0001F6FF'  # pictographs, emoticons, transport symbols
        '\U0001F900-\U0001FAFF'  # supplemental symbols and pictographs
        '\u2600-\u27BF'          # miscellaneous symbols and dingbats
        ']'
    )

    # Create analysis DataFrame
    tweet_analysis = []

    for result in synthetic_tweet_results:
        original_text = result.original_text
        synthetic_text = result.synthetic_text

        tweet_analysis.append({
            'original_length': len(original_text),
            'synthetic_length': len(synthetic_text),
            'original_words': len(original_text.split()),
            'synthetic_words': len(synthetic_text.split()),
            'original_hashtags': len(hashtag_pattern.findall(original_text)),
            'synthetic_hashtags': len(hashtag_pattern.findall(synthetic_text)),
            'original_mentions': len(mention_pattern.findall(original_text)),
            'synthetic_mentions': len(mention_pattern.findall(synthetic_text)),
            'original_emojis': len(emoji_pattern.findall(original_text)),
            'synthetic_emojis': len(emoji_pattern.findall(synthetic_text)),
            'facts_count': len(result.original_facts)
        })

    analysis_df = pd.DataFrame(tweet_analysis)

    print("TWEET ANALYSIS STATISTICS")
    print("=" * 30)
    print(analysis_df.describe())

    # Visualizations
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    fig.suptitle('Synthetic Tweet Analysis', fontsize=16)

    # Length comparison (dashed diagonal = identical length, dotted lines =
    # 280-character limit on either axis)
    axes[0, 0].scatter(analysis_df['original_length'], analysis_df['synthetic_length'])
    axes[0, 0].plot([0, 280], [0, 280], 'r--', alpha=0.5)  # Twitter character limit line
    axes[0, 0].axhline(y=280, color='r', linestyle=':', alpha=0.5, label='Twitter limit')
    axes[0, 0].axvline(x=280, color='r', linestyle=':', alpha=0.5)
    axes[0, 0].set_xlabel('Original Tweet Length')
    axes[0, 0].set_ylabel('Synthetic Tweet Length')
    axes[0, 0].set_title('Character Count Comparison')
    axes[0, 0].legend()

    # Word count, hashtag and emoji panels share one layout: a scatter of
    # original vs. synthetic counts with a dashed "no change" diagonal.
    count_panels = (
        (axes[0, 1], 'words', 'Word Count', 'Word Count Comparison'),
        (axes[0, 2], 'hashtags', 'Hashtags', 'Hashtag Preservation'),
        (axes[1, 2], 'emojis', 'Emojis', 'Emoji Preservation'),
    )
    for ax, column, axis_label, title in count_panels:
        original_counts = analysis_df[f'original_{column}']
        diagonal_end = original_counts.max()
        ax.scatter(original_counts, analysis_df[f'synthetic_{column}'])
        ax.plot([0, diagonal_end], [0, diagonal_end], 'r--', alpha=0.5)
        ax.set_xlabel(f'Original {axis_label}')
        ax.set_ylabel(f'Synthetic {axis_label}')
        ax.set_title(title)

    # Length distribution
    axes[1, 0].hist([analysis_df['original_length'], analysis_df['synthetic_length']],
                    bins=10, alpha=0.7, label=['Original', 'Synthetic'], edgecolor='black')
    axes[1, 0].axvline(x=280, color='r', linestyle='--', alpha=0.5, label='Twitter limit')
    axes[1, 0].set_xlabel('Character Count')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('Length Distribution')
    axes[1, 0].legend()

    # Facts extracted
    axes[1, 1].hist(analysis_df['facts_count'], bins=5, alpha=0.7, edgecolor='black')
    axes[1, 1].set_xlabel('Number of Facts Extracted')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('Facts per Tweet')

    plt.tight_layout()
    plt.show()

else:
    print("No tweet data available for analysis.")

## Save Synthetic Tweet Data

In [None]:
# Save synthetic tweet data
#
# Writes two artefacts that share one timestamp:
#   * a CSV (via save_processed_data) with one 'real' and one 'fake' row per
#     generated pair, and
#   * a JSON file with the full per-pair details.
if synthetic_tweet_results:
    # Create dataset with original and synthetic tweet pairs
    synthetic_tweet_dataset = []

    for result in synthetic_tweet_results:
        # Each pair contributes the original (real) tweet followed by the
        # synthetic (fake) one; only the fake row carries generation metadata.
        pair_rows = (
            (result.original_text, 'real', 'original_tweet',
             result.original_facts, None),
            (result.synthetic_text, 'fake', 'synthetic_tweet',
             result.modified_facts, json.dumps(result.generation_metadata)),
        )
        for text, label, record_type, facts, metadata in pair_rows:
            synthetic_tweet_dataset.append({
                'text': text,
                'label': label,
                'type': record_type,
                'facts': '|'.join(facts),
                'character_count': len(text),
                'word_count': len(text.split()),
                'hashtag_count': len(re.findall(r'#\w+', text)),
                'mention_count': len(re.findall(r'@\w+', text)),
                'generation_metadata': metadata
            })

    # Convert to DataFrame and save
    synthetic_tweets_df = pd.DataFrame(synthetic_tweet_dataset)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"synthetic_covid_tweets_{timestamp}.csv"

    save_processed_data(synthetic_tweets_df, filename, "synthetic")

    print(f"Saved {len(synthetic_tweets_df)} tweet records to data/synthetic/{filename}")
    print(f"\nDataset breakdown:")
    print(synthetic_tweets_df['label'].value_counts())
    print(f"\nCharacter count statistics:")
    print(synthetic_tweets_df.groupby('label')['character_count'].describe())

    # Save detailed results
    detailed_tweet_results = [
        {
            'original_text': result.original_text,
            'original_facts': result.original_facts,
            'modified_facts': result.modified_facts,
            'synthetic_text': result.synthetic_text,
            'metadata': result.generation_metadata
        }
        for result in synthetic_tweet_results
    ]

    json_filename = f"detailed_tweet_generation_results_{timestamp}.json"
    json_path = f"data/synthetic/{json_filename}"

    # The path is relative to the current working directory, which may not
    # contain data/synthetic yet; create it rather than failing in open().
    os.makedirs(os.path.dirname(json_path), exist_ok=True)

    # Explicit UTF-8 plus ensure_ascii=False keeps emojis readable in the
    # file, independent of the platform's default encoding.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(detailed_tweet_results, f, indent=2, ensure_ascii=False)

    print(f"Saved detailed tweet results to {json_path}")

else:
    print("No synthetic tweet data to save.")

## Tweet-Specific Quality Metrics

In [None]:
# Calculate tweet-specific quality metrics
#
# Each synthetic tweet is scored on four pass/fail checks (length limit,
# hashtag preservation, emoji preservation, length similarity); the quality
# score is the fraction of checks passed.
if synthetic_tweet_results:
    print("TWEET QUALITY ASSESSMENT")
    print("=" * 30)

    # Emoji character class. Besides the ranges used previously it also covers
    # U+2600-27BF and U+1F900-1FAFF, so that tweets whose only emojis are e.g.
    # ❤, ➡ or 🤖 are recognised as containing emojis.
    emoji_pattern = re.compile(
        '['
        '\U0001F1C0-\U0001F1FF'  # flag (regional indicator) letters
        '\U0001F300-\U0001F6FF'  # pictographs, emoticons, transport symbols
        '\U0001F900-\U0001FAFF'  # supplemental symbols and pictographs
        '\u2600-\u27BF'          # miscellaneous symbols and dingbats
        ']'
    )

    quality_metrics = []

    for i, result in enumerate(synthetic_tweet_results):
        original = result.original_text
        synthetic = result.synthetic_text

        # Length compliance (under Twitter's 280 character limit)
        length_compliant = len(synthetic) <= 280

        # Style preservation (hashtags, emojis): a feature only has to be
        # present in the synthetic tweet if the original had it.
        has_hashtags = '#' not in original or '#' in synthetic
        has_emojis = (
            emoji_pattern.search(original) is None
            or emoji_pattern.search(synthetic) is not None
        )

        # Length similarity (not too different from original)
        length_ratio = len(synthetic) / len(original) if len(original) > 0 else 1
        length_similar = 0.5 <= length_ratio <= 2.0

        quality_score = sum([
            length_compliant,
            has_hashtags,
            has_emojis,
            length_similar
        ]) / 4

        quality_metrics.append({
            'tweet_id': i + 1,
            'length_compliant': length_compliant,
            'has_hashtags': has_hashtags,
            'has_emojis': has_emojis,
            'length_similar': length_similar,
            'quality_score': quality_score,
            'original_length': len(original),
            'synthetic_length': len(synthetic),
            'length_ratio': length_ratio
        })

    quality_df = pd.DataFrame(quality_metrics)

    print(f"Average Quality Score: {quality_df['quality_score'].mean():.2f}")
    print(f"Length Compliance: {quality_df['length_compliant'].mean()*100:.1f}%")
    print(f"Style Preservation (hashtags): {quality_df['has_hashtags'].mean()*100:.1f}%")
    print(f"Style Preservation (emojis): {quality_df['has_emojis'].mean()*100:.1f}%")
    print(f"Length Similarity: {quality_df['length_similar'].mean()*100:.1f}%")

    # Visualize quality metrics
    plt.figure(figsize=(12, 4))

    # Panel 1: distribution of the per-tweet quality score
    plt.subplot(1, 3, 1)
    quality_scores = quality_df['quality_score']
    plt.hist(quality_scores, bins=10, alpha=0.7, edgecolor='black')
    plt.axvline(quality_scores.mean(), color='red', linestyle='--', label=f'Mean: {quality_scores.mean():.2f}')
    plt.xlabel('Quality Score')
    plt.ylabel('Frequency')
    plt.title('Quality Score Distribution')
    plt.legend()

    # Panel 2: pass rate of each individual check
    plt.subplot(1, 3, 2)
    metrics_summary = [
        quality_df['length_compliant'].mean(),
        quality_df['has_hashtags'].mean(),
        quality_df['has_emojis'].mean(),
        quality_df['length_similar'].mean()
    ]
    metric_names = ['Length\nCompliant', 'Has\nHashtags', 'Has\nEmojis', 'Length\nSimilar']

    bars = plt.bar(metric_names, metrics_summary, alpha=0.7)
    plt.ylabel('Success Rate')
    plt.title('Quality Metrics Breakdown')
    plt.ylim(0, 1.1)

    # Add value labels on bars
    for bar, value in zip(bars, metrics_summary):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                 f'{value:.2f}', ha='center', va='bottom')

    # Panel 3: original vs. synthetic length with the 280-character limit
    plt.subplot(1, 3, 3)
    plt.scatter(quality_df['original_length'], quality_df['synthetic_length'])
    plt.plot([0, 280], [0, 280], 'r--', alpha=0.5)
    plt.axhline(y=280, color='r', linestyle=':', alpha=0.5, label='Twitter limit')
    plt.axvline(x=280, color='r', linestyle=':', alpha=0.5)
    plt.xlabel('Original Length')
    plt.ylabel('Synthetic Length')
    plt.title('Length Preservation')
    plt.legend()

    plt.tight_layout()
    plt.show()

else:
    print("No tweet data available for quality assessment.")

## Next Steps for Tweet Processing

1. **Scale up**: Process larger Twitter datasets
2. **User behavior**: Consider user characteristics and engagement patterns
3. **Temporal aspects**: Account for trending topics and time-sensitive information
4. **Thread handling**: Process tweet threads and conversations
5. **Multi-modal content**: Handle tweets with images, videos, links
6. **Network effects**: Consider retweet and mention patterns

The synthetic tweet data can now be combined with news article data for comprehensive fake news detection model training.