# Market Impact Prediction
## Steps 5.i, 5.ii: Linear Regression, BigQuery ML, Impact Probability Calculation

In [None]:
import sys
sys.path.append('..')

from src.modeling.market_impact_predictor import MarketImpactPredictor
from src.modeling.bigquery_ml import BigQueryMLModel
from src.modeling.statistical_analysis import StatisticalAnalyzer
import pandas as pd
import numpy as np

## Load Analyzed Data

In [None]:
# Load the analyzed meme data from CSV into a DataFrame.
analyzed_csv = '../data/reddit_analyzed.csv'
reddit_df_analyzed = pd.read_csv(analyzed_csv)

row_count = len(reddit_df_analyzed)
print(f"Loaded {row_count} analyzed memes")

# Trailing expression: the notebook renders the first rows as the cell output.
reddit_df_analyzed.head()

## Step 5.i: Train Market Impact Prediction Model

In [None]:
# Train the market-impact predictor on the analyzed data.
# The returned metrics object is walked with .items() and each value is
# formatted with ':.4f', so it is expected to map metric names to numbers.
predictor = MarketImpactPredictor()
metrics = predictor.train(reddit_df_analyzed)

print("\nModel Performance Metrics:")
for metric_name, score in metrics.items():
    report_line = f"  {metric_name}: {score:.4f}"
    print(report_line)

## Make Predictions and Calculate Impact Probabilities

In [None]:
# Run the trained predictor over the data, then compute impact probabilities.
# Each call's return value replaces reddit_df_analyzed.
# NOTE(review): the columns read below (predicted_readiness,
# impact_prob_24h/48h/72h) are presumably added by these two calls --
# confirm against MarketImpactPredictor.
reddit_df_analyzed = predictor.predict(reddit_df_analyzed)
reddit_df_analyzed = predictor.calculate_impact_probability(reddit_df_analyzed)

total_predictions = len(reddit_df_analyzed)
print(f"Predictions generated for {total_predictions} memes")

mean_readiness = reddit_df_analyzed['predicted_readiness'].mean()
print(f"Average predicted readiness: {mean_readiness:.3f}")

mean_prob_72h = reddit_df_analyzed['impact_prob_72h'].mean()
print(f"Average 72h impact probability: {mean_prob_72h:.2f}%")

# Trailing expression: the notebook renders the first ten rows of these columns.
preview_columns = [
    'body',
    'predicted_readiness',
    'impact_prob_24h',
    'impact_prob_48h',
    'impact_prob_72h',
]
reddit_df_analyzed[preview_columns].head(10)

## Step 5.i: Create BigQuery ML Model

In [None]:
# Create the BigQuery ML model, then fetch its evaluation.
# NOTE(review): both calls presumably submit jobs to BigQuery and need
# credentials configured -- confirm before running this cell.
bqml = BigQueryMLModel()
bqml.create_slang_acceleration_model()
eval_results = bqml.evaluate_model()

# Header and results go out on separate lines via a newline separator.
print("\nBigQuery ML Evaluation:", eval_results, sep="\n")

## Statistical Testing and Benchmarking

In [None]:
analyzer = StatisticalAnalyzer()

# Columns handed to the correlation analysis.
columns = [
    'seriousness_threshold',
    'irony_collapse_index',
    'lookalike_similarity',
    'predicted_readiness',
    'toxicity_score',
]
corr_matrix = analyzer.correlation_analysis(reddit_df_analyzed, columns)

# NOTE(review): "ici" presumably stands for irony_collapse_index -- confirm
# against StatisticalAnalyzer.ici_ttest.
ttest_result = analyzer.ici_ttest(reddit_df_analyzed)

# NOTE(review): corr_matrix and ttest_result are stored but not displayed in
# this cell; any reporting would have to happen inside the analyzer calls.
print("\nStatistical tests complete")

## Identify High-Impact Memes

In [None]:
# Keep the rows whose predicted readiness is strictly above 0.75 (copied so
# later changes do not touch reddit_df_analyzed), ordered by 72h impact
# probability, highest first.
readiness_mask = reddit_df_analyzed['predicted_readiness'] > 0.75
high_impact = reddit_df_analyzed[readiness_mask].copy()
high_impact = high_impact.sort_values('impact_prob_72h', ascending=False)

high_impact_count = len(high_impact)
print(f"High-impact memes identified: {high_impact_count}")
print("\nTop 10 High-Impact Memes:")

# Trailing expression: the notebook renders the ten highest-ranked rows.
top_columns = ['body', 'predicted_readiness', 'impact_prob_72h', 'irony_collapse_index']
high_impact[top_columns].head(10)

## Step 5.ii: Prepare Export for Google Sheets

In [None]:
from src.utils.export import ExportManager

export_manager = ExportManager()

# Only the first 100 rows, in the DataFrame's current order, are exported.
first_100_rows = reddit_df_analyzed.head(100)
sheets_export = export_manager.prepare_betting_odds_export(first_100_rows)

# export_to_csv returns the value reported below as the export location.
filepath = export_manager.export_to_csv(sheets_export, 'meme_predictions_export.csv')
print(f"\nExport prepared: {filepath}")

# Trailing expression: the notebook renders the first ten export rows.
sheets_export.head(10)

## Prediction Summary

In [None]:
# Final recap of the run: number of predictions, test metrics of the trained
# model, mean impact probability per horizon, and the high-impact meme count.
# Relies on reddit_df_analyzed, metrics and high_impact from earlier cells.
print("Market Prediction Summary:")
print(f"Total predictions: {len(reddit_df_analyzed)}")

print("\nModel Performance:")
# \u00b2 is the superscript two. It is written as an escape so the label
# stays "R squared" even if this file goes through a wrong-encoding round trip.
print(f"  Test R\u00b2 Score: {metrics['test_r2']:.4f}")
print(f"  Test MAE: {metrics['test_mae']:.4f}")

print("\nImpact Probabilities:")
# One line per horizon, read from the matching impact_prob_<horizon> column.
for horizon in ("24h", "48h", "72h"):
    avg_prob = reddit_df_analyzed[f"impact_prob_{horizon}"].mean()
    print(f"  Avg {horizon}: {avg_prob:.2f}%")

print(f"\nHigh-Impact Memes (>75% readiness): {len(high_impact)}")