In [None]:
import pandas as pd
import numpy as np
import datetime
import time
from prophet import Prophet
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings

# Notebook-wide display setup: silence every warning category and set the
# seaborn grid style plus a default figure size for all later plots.
warnings.simplefilter('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Make sure the VADER lexicon is available locally before the analyzer is
# built. A missing resource is signalled by nltk.data.find() with LookupError,
# so that is the only exception handled here; anything else should surface.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    print('Downloading VADER lexicon...')
    nltk.download('vader_lexicon')

In [None]:
def generate_mock_meme_data(days=730, peak_day=365, max_mentions=10000, noise_level=0.1, seed=None):
    """Generate mock daily time-series data of meme mentions and text snippets.

    Mention counts follow a sigmoid growth curve up to ``peak_day`` and an
    exponential decay afterwards. Gaussian noise is added with a standard
    deviation proportional to the noiseless value, and the result is clipped
    at zero and truncated to integers. Each day also receives one short text
    snippet built around a positive, negative or neutral keyword.

    Args:
        days: Number of consecutive daily rows; the last row is dated today.
        peak_day: Zero-based day index at which growth switches to decay.
        max_mentions: Upper asymptote of the sigmoid growth curve.
        noise_level: Noise standard deviation as a fraction of the noiseless
            mention count.
        seed: Optional seed for a private random generator, giving repeatable
            mention counts and texts. With ``None`` (the default) the global
            ``numpy.random`` state is used, as in earlier versions.

    Returns:
        pandas.DataFrame with the columns ``date``, ``mentions`` and ``text``.
    """
    # A private RandomState keeps seeded runs independent of global state.
    # Without a seed, fall back to the module-level numpy.random functions so
    # callers that rely on np.random.seed() keep getting the same behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    dates = pd.date_range(end=datetime.date.today(), periods=days, freq='D')

    # Simulate mention counts (logistic growth + decay)
    x = np.arange(days)
    growth_rate = 0.05
    decay_rate = 0.015
    growth_midpoint = peak_day * 0.7  # the sigmoid is centred before the peak

    def sigmoid_at(day):
        return max_mentions / (1 + np.exp(-growth_rate * (day - growth_midpoint)))

    growth = sigmoid_at(x)
    # Start the decay from the sigmoid's value at peak_day so that the two
    # phases join without a jump.
    decay = sigmoid_at(peak_day) * np.exp(-decay_rate * (x - peak_day))
    mentions = np.where(x < peak_day, growth, decay)

    # Add noise, then clip at zero and truncate to whole mentions
    noise = rng.normal(0, mentions * noise_level, days)
    mentions = np.maximum(0, mentions + noise).astype(int)

    # Simulate simple text snippets
    positive_keywords = ['lol', 'haha', 'awesome', 'love', 'funny', 'good', 'great']
    negative_keywords = ['hate', 'stupid', 'annoying', 'bad', 'overused', 'cringe']
    neutral_keywords = ['meme', 'post', 'share', 'see', 'trend', 'mention']

    texts = []
    for i in range(days):
        # Probability of a positive keyword depends on the trend phase
        if i < peak_day * 0.5:  # Early growth
            sentiment_prob = 0.6  # Mostly positive
        elif i < peak_day * 1.2:  # Peak and early decay
            sentiment_prob = 0.4  # Mixed
        else:  # Late decay
            sentiment_prob = 0.2  # More negative/neutral

        # Two independent draws: the first decides positive vs. not; the
        # second splits the remainder 70/30 between negative and neutral,
        # regardless of the phase.
        if rng.rand() < sentiment_prob:
            keywords = positive_keywords
        elif rng.rand() < 0.7:
            keywords = negative_keywords
        else:
            keywords = neutral_keywords
        texts.append(f"Saw the {rng.choice(keywords)} meme today.")

    return pd.DataFrame({'date': dates, 'mentions': mentions, 'text': texts})

def load_meme_data(source='generate', **kwargs):
    """Return meme data from the requested source (stand-in for a project loader).

    Only ``source='generate'`` is implemented; it forwards ``kwargs`` to
    ``generate_mock_meme_data``. Any other value raises ``ValueError``.
    Further sources (for example a CSV file located via a ``filepath``
    keyword) could be added as additional branches.
    """
    print(f"Loading data using source: {source}")
    if source != 'generate':
        raise ValueError("Unsupported data source")
    return generate_mock_meme_data(**kwargs)

# Load data
# The mock generator draws from NumPy's global random state, so seed it first:
# mention counts and text snippets are then identical on every
# Restart & Run All (the dates still end at today's date).
np.random.seed(42)
raw_data = load_meme_data(days=730, peak_day=400, max_mentions=15000, noise_level=0.15)
print(raw_data.head())
print(f"\nData shape: {raw_data.shape}")

In [None]:
# Score every text snippet with VADER and keep only the compound polarity.
analyzer = SentimentIntensityAnalyzer()
raw_data['sentiment_compound'] = raw_data['text'].map(
    lambda snippet: analyzer.polarity_scores(snippet)['compound']
)

# The mock data holds exactly one row per day, so indexing the scores by date
# already gives the daily sentiment series; no aggregation step is needed.
daily_sentiment = raw_data.set_index('date').loc[:, 'sentiment_compound']

print("Sentiment analysis complete.")
print(raw_data.loc[:, ['date', 'sentiment_compound']].head())

In [None]:
# Prophet expects a frame with a 'ds' datestamp column and a numeric 'y'
# column; here 'y' is the daily mention count to be forecast.
df_prophet = raw_data.loc[:, ['date', 'mentions']].rename(
    columns={'date': 'ds', 'mentions': 'y'}
)

# Optional: sentiment could be attached as an extra regressor column
# df_prophet['sentiment'] = raw_data['sentiment_compound'].values

print("Data prepared for Prophet:")
print(df_prophet.head())

In [None]:
# Build the forecaster. Yearly and weekly seasonality are switched on; daily
# seasonality stays off because the series has a single observation per day.
# Further options that could be enabled here:
#   seasonality_mode='multiplicative'  -> if seasonal effects scale with the trend
#   growth='logistic'                  -> if there is a known carrying capacity
model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)

# To use sentiment as a regressor (requires the column in df_prophet):
# model.add_regressor('sentiment')

# Logistic growth would additionally need capacity and floor columns:
# df_prophet['cap'] = df_prophet['y'].max() * 1.5 # Example capacity
# df_prophet['floor'] = 0 # Minimum mentions

# Fit on the historical data and report the wall-clock training time.
print("Training Prophet model...")
start_time = time.time()
model.fit(df_prophet)
end_time = time.time()
print(f"Model training completed in {end_time - start_time:.2f} seconds.")

In [None]:
# Extend the training dates by the forecast horizon (90 daily steps).
future_periods = 90
future = model.make_future_dataframe(periods=future_periods, freq='D')

# If a sentiment regressor is used, its future values must be supplied too.
# The simplest assumption is that the historical average continues:
# future_sentiment = df_prophet['sentiment'].mean()
# future['sentiment'] = future_sentiment

# With logistic growth, the future frame also needs cap and floor columns:
# future['cap'] = df_prophet['cap'].iloc[0]
# future['floor'] = df_prophet['floor'].iloc[0]

print(f"Future dataframe created for {future_periods} days:")
print(future.tail())

In [None]:
# Predict over the historical dates plus the future horizon.
print("Generating forecast...")
forecast = model.predict(df=future)
print("Forecast complete.")

# Show the point forecast and its uncertainty bounds for the final rows.
print(forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())

In [None]:
# Draw the forecast and label the axes it was drawn on.
fig1 = model.plot(forecast)
fig1.gca().set(
    title='Meme Mention Frequency Forecast',
    xlabel='Date',
    ylabel='Mentions',
)
plt.show()

In [None]:
# Break the forecast into its parts: trend plus weekly and yearly seasonality.
fig2 = model.plot_components(fcst=forecast)
plt.show()

In [None]:
# Evaluate the model with Prophet's rolling-origin cross-validation.
#   initial -> length of the first training window
#   period  -> spacing between successive cutoff dates
#   horizon -> how far ahead each fold forecasts
print("Performing cross-validation...")
start_time_cv = time.time()

# Scale the windows to the dataset: at most 365 days (or half the rows) for
# the initial window, at most 180 days (or a fifth of the rows) between
# cutoffs, and the same horizon as the production forecast.
initial_days = min(365, int(len(df_prophet) * 0.5))
period_days = min(180, int(len(df_prophet) * 0.2))
initial_cv = f'{initial_days} days'
period_cv = f'{period_days} days'
horizon_cv = f'{future_periods} days'

df_cv = cross_validation(
    model,
    initial=initial_cv,
    period=period_cv,
    horizon=horizon_cv,
    parallel="processes",
)
end_time_cv = time.time()
print(f"Cross-validation completed in {end_time_cv - start_time_cv:.2f} seconds.")
print(df_cv.head())

In [None]:
# Summarise the cross-validation folds into error metrics per horizon.
df_p = performance_metrics(df=df_cv)
print("Performance Metrics (MAPE, MAE, RMSE, etc.):")
print(df_p)

In [None]:
# Plot how MAPE changes with the forecast horizon across the CV folds.
fig_cv = plot_cross_validation_metric(df_cv, metric='mape')
fig_cv.gca().set_title('Cross-Validation MAPE')
plt.show()

In [None]:
# 'Peak Virality' prediction window.
# Only forecast rows dated after the last observed day are considered.
future_forecast = forecast[forecast['ds'] > df_prophet['ds'].max()].copy()

if future_forecast.empty:
    print("\nNo future forecast data available to determine peak virality.")
else:
    # Highest predicted value in the future period and the day it occurs on.
    peak_yhat = future_forecast['yhat'].max()
    peak_idx = future_forecast['yhat'].idxmax()
    peak_date = future_forecast.loc[peak_idx, 'ds']

    print(f"\nPredicted peak mentions (yhat): {peak_yhat:.2f}")
    print(f"Predicted peak date: {peak_date.strftime('%Y-%m-%d')}")

    # The window runs from the first to the last future day whose forecast is
    # at least 85% of the peak. A negative peak puts the threshold above the
    # peak itself, so no row qualifies and the fallback message is printed.
    peak_threshold = peak_yhat * 0.85
    peak_window_df = future_forecast[future_forecast['yhat'] >= peak_threshold]

    if peak_window_df.empty:
        print("Could not determine a peak virality window based on the threshold.")
    else:
        peak_start_date = peak_window_df['ds'].min()
        peak_end_date = peak_window_df['ds'].max()
        print(f"Predicted 'Peak Virality' Window (yhat >= {peak_threshold:.2f}):")
        print(f"Start: {peak_start_date.strftime('%Y-%m-%d')}")
        print(f"End:   {peak_end_date.strftime('%Y-%m-%d')}")

        # Redraw the forecast with the window shaded and the peak marked.
        fig_peak, ax_peak = plt.subplots()
        model.plot(forecast, ax=ax_peak)
        ax_peak.axvspan(peak_start_date, peak_end_date, color='red', alpha=0.2, label='Peak Virality Window')
        ax_peak.plot(peak_date, peak_yhat, 'ro', markersize=8, label=f'Predicted Peak ({peak_date.strftime("%Y-%m-%d")})')
        ax_peak.set(
            title='Meme Mention Forecast with Peak Virality Window',
            xlabel='Date',
            ylabel='Mentions',
        )
        ax_peak.legend()
        plt.show()