# Social Media Data Exploration and Analysis
This notebook builds on the preprocessing code in src/data_preparation.py.

In [None]:
# 1. Environment Setup
!pip install -q pandas matplotlib seaborn datasets psutil requests
from google.colab import drive
drive.mount('/content/drive')

# Copy src directory to Colab
import os
import shutil
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
SRC_PATH = os.path.join(PROJECT_PATH, "src")
if os.path.exists(SRC_PATH):
    shutil.copytree(SRC_PATH, "/content/src")
    import sys
    sys.path.append("/content")
    print("Successfully copied src directory to Colab")
else:
    print(f"Warning: Could not find src directory at {SRC_PATH}")

# Memory monitoring
import psutil
print(f"Available memory: {psutil.virtual_memory().available/1024**3:.2f} GB")

In [None]:
# 2. MSMARCO Data Loading
try:
    from src.data_preparation import DataPreprocessor
except ImportError:
    from content.src.data_preparation import DataPreprocessor
import pandas as pd
from datasets import load_dataset

# Initialize preprocessor
preprocessor = DataPreprocessor()

# Upper bound on the number of MSMARCO training rows sampled for exploration.
MSMARCO_SAMPLE_SIZE = 10000


def _first_passage_text(doc):
    """Return the passage text of one MSMARCO row, or None if it has none.

    The row layout is detected from the fields that are actually present
    instead of from the requested version label, so every name/version
    combination that loads can also be parsed:

    * ``doc['passages']['passage_text']`` (a list) -> its first element
    * ``doc['passage']`` (a non-empty value)       -> that value
    """
    passages = doc.get('passages')
    if isinstance(passages, dict):
        texts = passages.get('passage_text')
        if texts:
            return texts[0]
    passage = doc.get('passage')
    if passage:
        return passage
    return None


# Start from a known state so that a stale `msmarco_df` left over from an
# earlier run of this cell cannot be mistaken for a successful load.
msmarco_df = None

# Load MSMARCO dataset
try:
    print("Attempting to load MSMARCO dataset...")

    # Try different dataset names/versions
    dataset_names = ["ms_marco", "microsoft/ms_marco", "msmarco"]
    versions = ["v1.1", "v2.1", "v2.0", None]

    for name in dataset_names:
        for version in versions:
            config = f" (version: {version})" if version else ""
            print(f"Trying {name}{config}...")
            try:
                if version:
                    msmarco = load_dataset(name, version)
                else:
                    msmarco = load_dataset(name)

                train = msmarco['train']
                if len(train) == 0:
                    continue
                print(f"Successfully loaded {name}{config}")

                # select() raises on out-of-range indices, so never ask for
                # more rows than the split contains.
                sample = train.select(range(min(MSMARCO_SAMPLE_SIZE, len(train))))
                docs = []
                for doc in sample:
                    text = _first_passage_text(doc)
                    if text is not None:
                        docs.append({'text': text, 'length': len(text)})

                if not docs:
                    # A dataset that loads but yields no passages would give
                    # a frame without a 'length' column; keep searching.
                    print(f"No usable passages found in {name}{config}")
                    continue

                msmarco_df = pd.DataFrame(docs)
                break
            except Exception as e:
                print(f"Failed with {name}{config}: {str(e)[:100]}")
        if msmarco_df is not None:
            break

    if msmarco_df is None:
        raise ValueError("Could not load any version of MSMARCO dataset")

    print(f"MSMARCO statistics:\n{msmarco_df.describe()}")

except Exception as e:
    print(f"Failed to load MSMARCO: {e}")
    # Fallback to local data if available
    if msmarco_df is None:
        try:
            local_path = os.path.join(PROJECT_PATH, "data/msmarco_sample.csv")
            if os.path.exists(local_path):
                print("Attempting to load local data...")
                msmarco_df = pd.read_csv(local_path)
                # Later cells read the 'length' column; derive it when the
                # CSV only carries the raw text.
                if 'length' not in msmarco_df.columns and 'text' in msmarco_df.columns:
                    msmarco_df['length'] = msmarco_df['text'].astype(str).str.len()
                print("Loaded local MSMARCO sample data")
        except Exception as local_error:
            print(f"Failed to load local data: {local_error}")
    if msmarco_df is None:
        # Create empty dataframe if all else fails
        msmarco_df = pd.DataFrame(columns=['text', 'length'])
        print("Created empty dataframe as fallback")

In [None]:
# 3. Twitter Data Loading
import zipfile
from tqdm import tqdm
import requests

# Downloads the Twitter archive (once), extracts it, reads at most
# `max_tweets` lines from the extracted .txt files, cleans them with
# `preprocessor.clean_text` and builds `twitter_df` with 'text' and 'length'
# columns. Any failure falls back to an empty dataframe with those columns.
try:
    print("Attempting to load Twitter dataset...")

    # Download Twitter data
    twitter_url = "https://archive.org/download/twitter_cikm_2010/twitter_cikm_2010.zip"
    twitter_zip = "/content/twitter.zip"
    twitter_dir = "/content/twitter_data"
    max_tweets = 10000  # Limit to 10k tweets

    if not os.path.exists(twitter_zip):
        print("Downloading Twitter data...")
        # Write to a temporary name and rename only once the download is
        # complete; otherwise an interrupted download leaves a truncated
        # twitter.zip that every later run would reuse and fail on.
        partial_zip = twitter_zip + ".part"
        try:
            # Without a timeout a stalled connection blocks the cell forever.
            with requests.get(twitter_url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                with open(partial_zip, 'wb') as f:
                    for chunk in tqdm(response.iter_content(chunk_size=8192)):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            os.replace(partial_zip, twitter_zip)
        finally:
            # No-op after a successful rename; cleans up after a failure.
            if os.path.exists(partial_zip):
                os.remove(partial_zip)

    # Extract and process tweets
    print("Extracting Twitter data...")
    try:
        with zipfile.ZipFile(twitter_zip, 'r') as zip_ref:
            zip_ref.extractall(twitter_dir)
    except zipfile.BadZipFile:
        # Remove the unusable archive so the next run downloads it again.
        os.remove(twitter_zip)
        raise

    # Process each file in the extracted directory
    tweets_dir = os.path.join(twitter_dir, "twitter_cikm_2010")
    tweets = []
    # sorted() makes the sample reproducible (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(tweets_dir)):
        if len(tweets) >= max_tweets:
            break
        if not filename.endswith(".txt"):
            continue
        # errors='replace' keeps one undecodable byte from aborting the load.
        with open(os.path.join(tweets_dir, filename), 'r', encoding='utf-8', errors='replace') as f:
            # Stream line by line and stop at the limit instead of reading
            # every file completely just to discard most of it.
            for line in f:
                tweets.append(line.rstrip("\r\n"))
                if len(tweets) >= max_tweets:
                    break

    # Clean and create dataframe
    print("Cleaning Twitter data...")
    cleaned_tweets = [preprocessor.clean_text(tweet) for tweet in tweets[:max_tweets]]
    twitter_df = pd.DataFrame({
        'text': cleaned_tweets,
        'length': [len(tweet) for tweet in cleaned_tweets]
    })

    print(f"Twitter statistics:\n{twitter_df.describe()}")

except Exception as e:
    print(f"Failed to load Twitter data: {e}")
    # Fallback to empty dataframe
    twitter_df = pd.DataFrame(columns=['text', 'length'])
    print("Created empty Twitter dataframe as fallback")

In [None]:
# 4. Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Draw the two text-length histograms side by side in a single 12x5 figure.
def _draw_length_panel(slot, lengths, colour, heading):
    """Plot a 50-bin histogram of `lengths` in column `slot` of a 1x2 grid."""
    plt.subplot(1, 2, slot)
    sns.histplot(lengths, bins=50, color=colour)
    plt.title(heading)
    plt.xlabel('Character Count')
    plt.ylabel('Frequency')


plt.figure(figsize=(12, 5))

# Left panel: MSMARCO, right panel: Twitter
_draw_length_panel(1, msmarco_df['length'], 'blue', 'MSMARCO Text Length')
_draw_length_panel(2, twitter_df['length'], 'green', 'Twitter Text Length')

plt.tight_layout()
plt.show()

In [None]:
# 5. Memory Optimization Check
def check_memory_usage(df, name, threshold_mb=500):
    """Print the deep memory footprint of a dataframe and flag large ones.

    Args:
        df: pandas DataFrame to measure.
        name: Label used in the printed messages.
        threshold_mb: Footprint in MB above which a warning is printed.
            Defaults to 500, the previously hard-coded limit.
    """
    # deep=True also counts the Python objects held in object columns
    # (e.g. strings), not just the pointers to them.
    mem_usage = df.memory_usage(deep=True).sum() / 1024**2
    print(f"{name} memory usage: {mem_usage:.2f} MB")
    if mem_usage > threshold_mb:
        print(f"Warning: High memory usage for {name}, consider optimizing data types")
        
# Report the footprint of both dataframes, MSMARCO first.
for frame, label in ((msmarco_df, "MSMARCO"), (twitter_df, "Twitter")):
    check_memory_usage(frame, label)