# ChronoMeme Forecaster: Data Exploration

**Description:** Predicts the short-term 'virality' or trend score of internet memes based on social media mention frequency and sentiment analysis over time.

**Features:**
*   Ingests time-series data of meme mentions (using mock data here).
*   Uses simulated sentiment scores as a stand-in for real sentiment analysis.
*   Uses a time series model (Prophet) to forecast future mention frequency/trend score.
*   Visualizes historical meme popularity and predicted trend.
*   Calculates a simple 'peak virality' prediction window.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
import random
from datetime import timedelta

# Notebook-wide configuration: silence warnings, pick the seaborn theme,
# and set the default figure size for every plot that does not override it.
warnings.filterwarnings(action='ignore')
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Data Loading and Inspection (Using Mock Data)

In [None]:
def generate_mock_data(days=180, num_memes=3, freq='H', seed=None):
    """Build a synthetic long-format table of meme mentions and sentiment.

    Each meme gets a slow sinusoidal trend (phase-shifted per meme so the
    peaks fall at different times), daily and weekly seasonality, and
    Gaussian noise. Sentiment is loosely tied to relative popularity, plus
    noise and a slow drift, and is clipped to [-1, 1].

    Args:
        days: Length of the simulated period in days; must be positive.
        num_memes: Number of memes to simulate; must be positive.
        freq: pandas frequency alias for the sampling interval. The series
            always spans ``days`` days, whatever the frequency.
        seed: Optional integer seed. When given, the output is fully
            reproducible; when ``None`` the global NumPy random state is
            used, so ``np.random.seed`` keeps working as before.

    Returns:
        DataFrame with columns ``timestamp``, ``meme_id``, ``mention_count``
        (non-negative ints) and ``sentiment_score`` (floats in [-1, 1]),
        with one row per meme per timestamp.

    Raises:
        ValueError: If ``days`` or ``num_memes`` is not positive, or if
            ``freq`` produces no timestamps inside the requested period.
    """
    if days <= 0:
        raise ValueError(f"days must be positive, got {days!r}")
    if num_memes <= 0:
        raise ValueError(f"num_memes must be positive, got {num_memes!r}")

    # pandas 2.2 deprecated the upper-case hourly alias in favour of 'h'.
    if freq == 'H':
        freq = 'h'

    base_date = pd.to_datetime('2023-01-01')
    end_date = base_date + pd.Timedelta(days=days)
    # Derive the number of samples from the frequency instead of assuming
    # 24 samples per day; the end point itself is excluded.
    date_rng = pd.date_range(start=base_date, end=end_date, freq=freq)
    date_rng = date_rng[date_rng < end_date]
    n_points = len(date_rng)
    if n_points == 0:
        raise ValueError(f"freq={freq!r} yields no timestamps within {days} day(s)")

    # Either the global NumPy state or a private seeded generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # These do not depend on the meme, so compute them once.
    time_factor = np.linspace(0, 5 * np.pi, n_points)
    hours = date_rng.hour.to_numpy()
    weekdays = date_rng.dayofweek.to_numpy()
    seasonality = (
        10 * np.sin(2 * np.pi * hours / 24)       # daily pattern
        + 5 * np.sin(2 * np.pi * weekdays / 7)    # weekly pattern
    )

    all_data = []
    for i in range(num_memes):
        # Phase shift per meme gives each one a different peak time.
        trend = (np.sin(time_factor - i * np.pi / 2) + 1.1) * (50 + i * 20)
        noise = rng.normal(0, 15 + i * 5, n_points)
        mention_count = np.maximum(0, trend + seasonality + noise).astype(int)

        # Relative popularity in [0, 1]; guard against an all-zero series.
        peak = mention_count.max()
        popularity = mention_count / peak if peak > 0 else np.zeros(n_points)

        sentiment_base = popularity * 0.6 - 0.3
        sentiment_noise = rng.normal(0, 0.15, n_points)
        sentiment_shift = np.sin(time_factor / 3 + i * np.pi) * 0.1  # slow drift
        sentiment_score = np.clip(sentiment_base + sentiment_noise + sentiment_shift, -1, 1)

        all_data.append(pd.DataFrame({
            'timestamp': date_rng,
            'meme_id': f'meme_{i+1}',
            'mention_count': mention_count,
            'sentiment_score': sentiment_score,
        }))

    return pd.concat(all_data, ignore_index=True)

# Build the mock dataset: 90 days of readings for three memes
df = generate_mock_data(num_memes=3, days=90)

# Quick look at the result: first rows, schema, summary statistics, null counts
print("Data Head:", df.head(), sep="\n")
print("\nData Info:")
df.info()  # writes its own report to stdout
print("\nData Description:", df.describe(), sep="\n")
print("\nMissing Values:", df.isnull().sum(), sep="\n")

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Collapse the raw readings to one row per meme per calendar day:
# mentions are totalled, sentiment is averaged.
daily_key = pd.Grouper(key='timestamp', freq='D')
daily_groups = df.groupby(['meme_id', daily_key])[['mention_count', 'sentiment_score']]
df_daily = daily_groups.agg(
    mention_count=pd.NamedAgg(column='mention_count', aggfunc='sum'),
    sentiment_score=pd.NamedAgg(column='sentiment_score', aggfunc='mean'),
).reset_index()

print("\nDaily Aggregated Data Head:", df_daily.head(), sep="\n")

In [None]:
# Daily mention totals, one line per meme
mentions_fig, mentions_ax = plt.subplots(figsize=(15, 7))
sns.lineplot(data=df_daily, x='timestamp', y='mention_count', hue='meme_id', ax=mentions_ax)
mentions_ax.set_title('Daily Mention Count Over Time by Meme')
mentions_ax.set_xlabel('Date')
mentions_ax.set_ylabel('Total Daily Mentions')
plt.show()

In [None]:
# Average daily sentiment, one line per meme, on the full [-1, 1] scale
sentiment_fig, sentiment_ax = plt.subplots(figsize=(15, 7))
sns.lineplot(data=df_daily, x='timestamp', y='sentiment_score', hue='meme_id', ax=sentiment_ax)
sentiment_ax.set_title('Average Daily Sentiment Score Over Time by Meme')
sentiment_ax.set_xlabel('Date')
sentiment_ax.set_ylabel('Average Sentiment Score')
sentiment_ax.set_ylim(-1, 1)
plt.show()

In [None]:
# Histogram (with KDE overlay) of daily mention totals, split by meme
mention_hist_fig, mention_hist_ax = plt.subplots(figsize=(12, 5))
sns.histplot(data=df_daily, x='mention_count', hue='meme_id', kde=True, bins=30, ax=mention_hist_ax)
mention_hist_ax.set_title('Distribution of Daily Mention Counts')
mention_hist_ax.set_xlabel('Daily Mention Count')
plt.show()

In [None]:
# Histogram (with KDE overlay) of average daily sentiment, split by meme
sentiment_hist_fig, sentiment_hist_ax = plt.subplots(figsize=(12, 5))
sns.histplot(data=df_daily, x='sentiment_score', hue='meme_id', kde=True, bins=30, ax=sentiment_hist_ax)
sentiment_hist_ax.set_title('Distribution of Average Daily Sentiment Scores')
sentiment_hist_ax.set_xlabel('Average Daily Sentiment Score')
plt.show()

In [None]:
# Correlation between mentions and sentiment (using daily data)
for meme in df_daily['meme_id'].unique():
    # Sort by date so that shifting by one row really means "next day".
    meme_data = df_daily.loc[df_daily['meme_id'] == meme].sort_values('timestamp')
    mentions = meme_data['mention_count']
    sentiment = meme_data['sentiment_score']

    correlation = mentions.corr(sentiment)
    print(f"Correlation between mentions and sentiment for {meme}: {correlation:.2f}")

    # Lead correlation: does sentiment today correlate with mentions tomorrow?
    # The shifted series is kept as a standalone variable rather than written
    # back onto the filtered slice, which would be a chained assignment.
    next_day_mentions = mentions.shift(-1)
    lagged_corr = sentiment.corr(next_day_mentions)
    print(f"Correlation between today's sentiment and tomorrow's mentions for {meme}: {lagged_corr:.2f}")

    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=meme_data, x='mention_count', y='sentiment_score')
    plt.title(f'Mention Count vs Sentiment Score for {meme}')
    plt.show()

## 4. Statistical Analysis

In [None]:
# Time Series Decomposition (Example for meme_1)
meme1_rows = df_daily.loc[df_daily['meme_id'] == 'meme_1']
meme1_daily = meme1_rows.set_index('timestamp')['mention_count']

# A weekly period (7) needs at least two full cycles, i.e. 14 observations.
if len(meme1_daily) < 14:
    print("Not enough data points for seasonal decomposition with period=7.")
else:
    decomposition = seasonal_decompose(meme1_daily, model='additive', period=7)

    # One stacked panel per component, sharing the date axis
    panels = [
        ('Observed', decomposition.observed),
        ('Trend', decomposition.trend),
        ('Seasonal', decomposition.seasonal),
        ('Residual', decomposition.resid),
    ]
    fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)
    for panel_ax, (panel_label, component) in zip(axes, panels):
        component.plot(ax=panel_ax)
        panel_ax.set_ylabel(panel_label)

    plt.suptitle('Time Series Decomposition for meme_1 (Daily Mentions)')
    plt.xlabel('Date')
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])  # leave headroom for the suptitle
    plt.show()

In [None]:
# Stationarity Test (Augmented Dickey-Fuller Test)
def adf_test(timeseries, name):
    """Run the Augmented Dickey-Fuller test on a series and print a report.

    Args:
        timeseries: pandas Series of observations; NaNs are dropped first.
        name: Label shown in the report header.

    Returns:
        None. The statistic, p-value, critical values and a verdict at the
        5% level are written to stdout. If nothing remains after dropping
        NaNs, a notice is printed and the test is skipped.
    """
    print(f'\nAugmented Dickey-Fuller Test for {name}:')

    observations = timeseries.dropna()
    if observations.empty:
        print("Series is empty after dropping NaNs, cannot perform ADF test.")
        return

    adf_output = adfuller(observations)
    adf_statistic = adf_output[0]
    p_value = adf_output[1]
    critical_values = adf_output[4]

    print(f'ADF Statistic: {adf_statistic:f}')
    print(f'p-value: {p_value:f}')
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'\t{level}: {threshold:.3f}')

    verdict = (
        "Result: Reject the null hypothesis (H0). Data is likely stationary."
        if p_value <= 0.05
        else "Result: Fail to reject the null hypothesis (H0). Data is likely non-stationary."
    )
    print(verdict)

# Run the ADF check on each meme's daily mention series, in order of first appearance
for meme, meme_rows in df_daily.groupby('meme_id', sort=False):
    meme_series = meme_rows.set_index('timestamp')['mention_count']
    adf_test(meme_series, f'{meme} Daily Mentions')

## 5. Feature Engineering Experiments

In [None]:
# Start from the daily table, ordered by meme and date so that shifts and
# rolling windows follow time within each meme.
df_feat = df_daily.sort_values(by=['meme_id', 'timestamp'])

mentions_by_meme = df_feat.groupby('meme_id')['mention_count']
sentiment_by_meme = df_feat.groupby('meme_id')['sentiment_score']

# Previous-day values
df_feat['mention_lag_1'] = mentions_by_meme.shift(1)
df_feat['sentiment_lag_1'] = sentiment_by_meme.shift(1)

# Trailing 7-day statistics; partial windows are allowed at the start
weekly_window = {'window': 7, 'min_periods': 1}
df_feat['mention_roll_mean_7'] = mentions_by_meme.transform(lambda s: s.rolling(**weekly_window).mean())
df_feat['mention_roll_std_7'] = mentions_by_meme.transform(lambda s: s.rolling(**weekly_window).std())
df_feat['sentiment_roll_mean_7'] = sentiment_by_meme.transform(lambda s: s.rolling(**weekly_window).mean())

# Calendar features taken from the timestamp
stamp = df_feat['timestamp'].dt
df_feat['dayofweek'] = stamp.dayofweek
df_feat['dayofyear'] = stamp.dayofyear
df_feat['weekofyear'] = stamp.isocalendar().week.astype(int)
df_feat['month'] = stamp.month

# Example interaction: yesterday's mentions weighted by yesterday's sentiment
df_feat['mention_x_sentiment_lag1'] = df_feat['mention_lag_1'] * df_feat['sentiment_lag_1']

# Show the engineered columns for a single meme
print("\nFeature Engineering Example (meme_1):")
print(df_feat.loc[df_feat['meme_id'] == 'meme_1'].head(10))

# Lagging and rolling leave gaps at the start of each meme's series
print("\nNaNs after Feature Engineering:", df_feat.isnull().sum(), sep="\n")

## 6. Initial Model Testing (Prophet)

In [None]:
# Pick the meme to model and reshape its daily series into the two-column
# layout Prophet expects ('ds' for the date, 'y' for the value).
meme_to_forecast = 'meme_1'
is_target_meme = df_daily['meme_id'] == meme_to_forecast
df_prophet = df_daily.loc[is_target_meme, ['timestamp', 'mention_count']].rename(
    columns={'timestamp': 'ds', 'mention_count': 'y'}
)

# Hold out the final 14 days as a simple validation set
train_cutoff = df_prophet['ds'].max() - timedelta(days=14)
df_train = df_prophet.loc[df_prophet['ds'] <= train_cutoff]
df_test = df_prophet.loc[df_prophet['ds'] > train_cutoff]

for split_name, split_frame in (("Training", df_train), ("Test", df_test)):
    print(f"{split_name} data shape: {split_frame.shape}")

In [None]:
# Build and fit the Prophet model.
# Only the weekly seasonal component is switched on; daily and yearly
# seasonality are disabled explicitly.
prophet_settings = {
    'daily_seasonality': False,
    'weekly_seasonality': True,
    'yearly_seasonality': False,
    'changepoint_prior_scale': 0.05,  # tune to the trend flexibility needed
}
model = Prophet(**prophet_settings)

# Extra regressors (for example the lagged sentiment column `sentiment_lag_1`
# from df_feat, registered via `model.add_regressor`) are deliberately left
# out for now: they would need values for every future date as well. This
# first pass is univariate.
model.fit(df_train)

In [None]:
# Extend the training dates by 30 days: the first 14 overlap the held-out
# test period and the remaining 16 are a genuine look ahead.
future_periods = 30
future = model.make_future_dataframe(periods=future_periods, freq='D')

# If regressors are added to the model later, their values must be joined
# onto `future` before predicting (a plain forward fill is a crude option
# and should be used with caution).

forecast = model.predict(future)

# Show both ends of the forecast with its uncertainty interval
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
print("\nForecast Data Head:", forecast[forecast_columns].head(), sep="\n")
print("\nForecast Data Tail:", forecast[forecast_columns].tail(), sep="\n")

In [None]:
# Draw the forecast and overlay the held-out observations in red
fig1 = model.plot(forecast)
forecast_ax = fig1.gca()
forecast_ax.set_title(f'Prophet Forecast for {meme_to_forecast} Mentions')
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Mention Count')
forecast_ax.scatter(df_test['ds'], df_test['y'], color='red', s=10, label='Actual Test Data')
forecast_ax.legend()
plt.show()

In [None]:
# Plot forecast components
# Draws the fitted model's breakdown of the forecast; presumably the trend plus
# the weekly seasonality enabled when the model was built — confirm against the
# Prophet documentation. The returned figure is kept in `fig2`.
fig2 = model.plot_components(forecast)
plt.show()

In [None]:
# Evaluate forecast on the test set
forecast_test = forecast[forecast['ds'].isin(df_test['ds'])]

# Pair actuals with predictions by date. The two frames carry unrelated row
# indices (df_test inherits the daily table's index, the forecast is numbered
# from zero), so index-aligned Series arithmetic would only line up by
# coincidence and silently produce NaNs otherwise.
eval_pairs = df_test[['ds', 'y']].merge(forecast_test[['ds', 'yhat']], on='ds', how='inner')

print(f"\nEvaluation Metrics on Test Set ({meme_to_forecast}):")
if eval_pairs.empty:
    # Nothing to score: keep the metric names defined for later cells.
    mae = rmse = mape = mean_actual = np.nan
    print("No overlapping dates between the test set and the forecast; metrics are undefined.")
else:
    y_true = eval_pairs['y'].to_numpy(dtype=float)
    y_pred = eval_pairs['yhat'].to_numpy(dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mean_actual = y_true.mean()

    # Percentage error is undefined wherever the actual value is zero, so it
    # is computed over the non-zero days only.
    nonzero_days = y_true != 0
    if nonzero_days.any():
        mape = np.mean(np.abs((y_true[nonzero_days] - y_pred[nonzero_days]) / y_true[nonzero_days])) * 100
    else:
        mape = np.inf

    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%" if mape != np.inf else "MAPE: Undefined (zero actual values)")
    if nonzero_days.any() and not nonzero_days.all():
        print(f"  (MAPE excludes {int((~nonzero_days).sum())} day(s) with zero actual mentions)")
    print(f"Mean Actual Value: {mean_actual:.2f}")

In [None]:
# Simple 'peak virality' estimate: find the highest predicted value among the
# dates that lie beyond the last observed day, and bracket it by one day.
last_observed_day = df_prophet['ds'].max()
future_forecast = forecast[forecast['ds'] > last_observed_day]

if future_forecast.empty:
    print("\nNo future forecast data available to predict peak.")
else:
    peak_row_label = future_forecast['yhat'].idxmax()
    peak_forecast_value = future_forecast.at[peak_row_label, 'yhat']
    peak_forecast_date = future_forecast.at[peak_row_label, 'ds']

    # Window of +/- 1 day around the predicted peak
    one_day = timedelta(days=1)
    peak_window_start = peak_forecast_date - one_day
    peak_window_end = peak_forecast_date + one_day

    print(f"\nPeak Virality Prediction ({meme_to_forecast}):")
    print(f"  Predicted Peak Value (yhat): {peak_forecast_value:.2f}")
    print(f"  Predicted Peak Date: {peak_forecast_date.strftime('%Y-%m-%d')}")
    print(f"  Simple Peak Window: {peak_window_start.strftime('%Y-%m-%d')} to {peak_window_end.strftime('%Y-%m-%d')}")

    # Redraw the forecast with the peak date and window marked in green
    fig = model.plot(forecast)
    peak_ax = fig.gca()
    peak_ax.scatter(df_test['ds'], df_test['y'], color='red', s=10, label='Actual Test Data')
    peak_ax.axvline(peak_forecast_date, color='green', linestyle='--', label=f'Predicted Peak Date ({peak_forecast_date.strftime("%Y-%m-%d")})')
    peak_ax.axvspan(peak_window_start, peak_window_end, color='green', alpha=0.1, label='Peak Window')
    peak_ax.set_title(f'Prophet Forecast with Predicted Peak for {meme_to_forecast}')
    peak_ax.set_xlabel('Date')
    peak_ax.set_ylabel('Mention Count')
    peak_ax.legend()
    plt.show()

## End of Exploration