# Social Media Data Exploration and Analysis (implemented based on src/data_preparation.py)

In [None]:
# 1. Environment Setup
# Notebook cell: installs dependencies, mounts Google Drive, switches the
# working directory to the project folder, and prints the free RAM.
# NOTE: the `!pip` line is an IPython/Colab shell escape, not plain Python,
# so this cell only runs inside a notebook kernel.
!pip install -q pandas matplotlib seaborn datasets
from google.colab import drive
# Mount point passed to Colab's Drive helper; PROJECT_PATH below lives under it.
drive.mount('/content/drive')

# Set project path
import os
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
# Raises if the folder does not exist. A later cell imports
# `src.data_preparation`, presumably resolved relative to this directory
# (verify against the project layout).
os.chdir(PROJECT_PATH)

# Memory monitoring
import psutil
# `available` is reported in bytes; dividing by 1024**3 converts it to GiB
# (the message labels it "GB").
print(f"Available memory: {psutil.virtual_memory().available/1024**3:.2f} GB")

In [None]:
# 2. Data Loading (based on src/data_preparation.py)
# Notebook cell: builds `msmarco_df`, a DataFrame with one row per sampled
# MSMARCO query holding the first candidate passage ('text') and its length
# in characters ('length').
from src.data_preparation import DataPreprocessor
import pandas as pd
from datasets import load_dataset

# Initialize preprocessor
preprocessor = DataPreprocessor()

# Number of training records to sample for exploration.
SAMPLE_SIZE = 10_000

# Load MSMARCO dataset
try:
    msmarco = load_dataset("microsoft/ms_marco", "v1.1")
    train_split = msmarco['train']

    # Slicing a Dataset (`train_split[:N]`) returns a dict of *columns*, and
    # iterating that dict yields column names rather than records. Use
    # `select` to obtain a row-iterable subset instead.
    sample = train_split.select(range(min(SAMPLE_SIZE, len(train_split))))

    # Keep the first passage of every record; skip records whose passage
    # list is empty so indexing [0] cannot raise IndexError.
    first_passages = [
        doc['passages']['passage_text'][0]
        for doc in sample
        if doc['passages']['passage_text']
    ]
    msmarco_df = pd.DataFrame({
        'text': first_passages,
        'length': [len(passage) for passage in first_passages],
    })

    print(f"MSMARCO statistics:\n{msmarco_df.describe()}")
except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook.
    # NOTE: `msmarco_df` stays undefined in that case, so the cells that
    # use it will fail until this one succeeds.
    print(f"Failed to load MSMARCO: {e}")

In [None]:
# 3. Data Visualization
# Notebook cell: histogram of passage lengths from `msmarco_df`.
import matplotlib.pyplot as plt
import seaborn as sns

# Text length distribution, drawn on an explicit Axes rather than through
# the implicit pyplot "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(msmarco_df['length'], bins=50, ax=ax)
ax.set_title('Text Length Distribution')
ax.set_xlabel('Character Count')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# 4. Memory Optimization Check
def check_memory_usage(df, threshold_mb=500):
    """Print the memory footprint of *df* and warn when it is large.

    Args:
        df: pandas DataFrame to measure. Usage is computed with
            ``memory_usage(deep=True)`` so object columns (e.g. strings)
            are counted by their actual contents.
        threshold_mb: Size in MiB above which a warning is printed.
            Defaults to 500, the previously hard-coded limit.

    Returns:
        None. The report is written to standard output.
    """
    # Bytes -> MiB (the message labels the unit "MB").
    mem_usage = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Current memory usage: {mem_usage:.2f} MB")
    if mem_usage > threshold_mb:
        print("Warning: High memory usage, consider optimizing data types")
        
# Report the footprint of the sampled MSMARCO frame. `msmarco_df` is only
# defined if the data-loading cell completed without an exception.
check_memory_usage(msmarco_df)