# Semantic Analysis with Vertex AI
## Steps 1.i, 1.ii, 2.i, 2.ii: Semantic Scoring, Vector Clustering, Visual Metaphor Detection, Visual Metadata Export

In [None]:
import sys
sys.path.append('..')

from src.semantic_analysis.semantic_scoring import SemanticScorer
from src.semantic_analysis.vertex_ai_gemini import GeminiAnalyzer
from src.semantic_analysis.vector_clustering import VectorClusterer
import pandas as pd
import numpy as np

## Step 1.i: Calculate Meme Seriousness Threshold and Irony Collapse Index

In [None]:
# Load the raw Reddit export and score at most the first 1000 post bodies.
raw_csv_path = '../data/reddit_meme_raw.csv'
reddit_df = pd.read_csv(raw_csv_path)

scorer = SemanticScorer()
all_bodies = reddit_df['body'].tolist()
semantic_results = scorer.process_batch(all_bodies[:1000])

# Keep only as many source rows as were scored, then attach the score
# columns to them (DataFrame.join aligns on the index).
scored_count = len(semantic_results)
scored_rows = reddit_df.head(scored_count).copy()
reddit_df_analyzed = scored_rows.join(semantic_results)

print(f"Semantic scores calculated for {scored_count} memes")
mean_threshold = semantic_results['seriousness_threshold'].mean()
print(f"Average Seriousness Threshold: {mean_threshold:.3f}")
mean_ici = semantic_results['irony_collapse_index'].mean()
print(f"Average ICI: {mean_ici:.3f}")

# Last expression of the cell: preview of the first ten score rows.
semantic_results.head(10)

## Step 1.i: Save Semantic Scores to BigQuery

In [None]:
from google.cloud import bigquery
from config.settings import PROJECT_ID, DATASET_ID

client = bigquery.Client(project=PROJECT_ID)

# Only the first 500 analyzed memes are uploaded in this batch.
upload_rows = reddit_df_analyzed.head(500)

# Build the upload frame column-wise rather than row-by-row with iterrows().
# read_csv parses empty fields as NaN (a float), and slicing that with
# [:500] raises TypeError, so any non-string body is written as an empty
# string instead of aborting the whole upload.
semantic_df = pd.DataFrame({
    'meme_id': [f"MEME_{idx}" for idx in upload_rows.index],
    'text': [
        body[:500] if isinstance(body, str) else ''
        for body in upload_rows['body']
    ],
    # to_numpy() drops the source index so the result keeps a plain
    # RangeIndex, matching a frame built from a list of dicts.
    'meme_seriousness_threshold': upload_rows['seriousness_threshold'].to_numpy(),
    'irony_collapse_index': upload_rows['irony_collapse_index'].to_numpy(),
    'financial_keywords_count': upload_rows['financial_keywords_count'].to_numpy(),
    'humor_keywords_count': upload_rows['humor_keywords_count'].to_numpy(),
})
table_id = f"{PROJECT_ID}.{DATASET_ID}.semantic_scores"

# WRITE_APPEND adds these rows to whatever the table already holds.
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")
job = client.load_table_from_dataframe(semantic_df, table_id, job_config=job_config)
job.result()  # block until the load job finishes (raises if it failed)

print(f"Semantic scores saved to {table_id}")

## Step 1.ii: Vector Clustering Against Historical Market Movers

In [None]:
# Run both VectorClusterer passes over the analyzed frame: lookalike
# matching first, then clustering into six groups.
clusterer = VectorClusterer()
reddit_df_analyzed = clusterer.find_lookalikes(reddit_df_analyzed)
reddit_df_analyzed = clusterer.perform_clustering(
    reddit_df_analyzed,
    n_clusters=6,
)

print("Vector clustering complete")
similarity = reddit_df_analyzed['lookalike_similarity']
print(f"Average lookalike similarity: {similarity.mean():.3f}")
above_cutoff = similarity > 0.7
print(f"High similarity memes (>0.7): {above_cutoff.sum()}")

# Last expression of the cell: preview of the first ten rows.
preview_columns = ['body', 'best_lookalike', 'lookalike_similarity', 'cluster']
reddit_df_analyzed[preview_columns].head(10)

## Step 2.i: Visual Metaphor Analysis with Gemini

In [None]:
gemini = GeminiAnalyzer()

# Only the first 100 analyzed memes are sent to Gemini.
gemini_subset = reddit_df_analyzed.head(100)

visual_results = []
for text in gemini_subset['body']:
    analysis = gemini.analyze_visual_metaphor(text)
    visual_results.append({
        'primary_metaphor': analysis['primary_visual_metaphor'],
        'shopping_correlation': analysis['google_shopping_correlation'],
        'metaphor_count': analysis['metaphor_count'],
    })

# Explicit columns keep the expected column names present even when no
# rows were analyzed.
visual_df = pd.DataFrame(
    visual_results,
    columns=['primary_metaphor', 'shopping_correlation', 'metaphor_count'],
)

# The results were collected in row order, so attach them positionally.
# A label-based join against visual_df's fresh RangeIndex would misalign if
# reddit_df_analyzed's index is not 0..n-1, and it raises ValueError on
# overlapping columns when this cell is executed a second time; assign()
# simply overwrites the columns instead.
# NOTE: this narrows reddit_df_analyzed to the rows that were analyzed.
reddit_df_analyzed = gemini_subset.assign(
    **{column: visual_df[column].to_numpy() for column in visual_df.columns}
)

print("Visual metaphor analysis complete")
print(f"Memes with visual metaphors: {(visual_df['metaphor_count'] > 0).sum()}")
print(f"Average shopping correlation: {visual_df['shopping_correlation'].mean():.3f}")

# Last expression of the cell: preview of the first ten results.
visual_df.head(10)

## Step 2.ii: Export Visual Metadata to Google Cloud Storage

In [None]:
from src.utils.storage import StorageManager

storage_manager = StorageManager()


def _value_or_default(row, column, default):
    """Return row[column], or default if the column is absent or the value is missing.

    row.get(column, default) alone only covers an absent column; a column
    that exists but holds NaN/None would be passed through, and int(NaN)
    raises ValueError.
    """
    value = row.get(column, default)
    return default if pd.isna(value) else value


# One metadata record per analyzed meme; meme_id is derived from the frame's
# index label.
visual_metadata = [
    {
        'meme_id': f"MEME_{idx}",
        'primary_visual_metaphor': _value_or_default(row, 'primary_metaphor', 'none'),
        'shopping_correlation_score': float(
            _value_or_default(row, 'shopping_correlation', 0.0)
        ),
        'metaphor_count': int(_value_or_default(row, 'metaphor_count', 0)),
        'seriousness_threshold': float(row['seriousness_threshold']),
        'irony_collapse_index': float(row['irony_collapse_index']),
    }
    for idx, row in reddit_df_analyzed.iterrows()
]

storage_manager.export_visual_metadata(visual_metadata)
print("Visual metadata exported to GCS")

## Analysis Summary

In [None]:
# Print a roll-up of the metrics held in reddit_df_analyzed: semantic scores,
# clustering results, and visual-metaphor results.
summary_df = reddit_df_analyzed

print("Semantic Analysis Summary:")
total_memes = len(summary_df)
print(f"Total memes analyzed: {total_memes}")

print("\nSemantic Scores:")
avg_threshold = summary_df['seriousness_threshold'].mean()
print(f"  Avg Seriousness Threshold: {avg_threshold:.3f}")
avg_ici = summary_df['irony_collapse_index'].mean()
print(f"  Avg ICI: {avg_ici:.3f}")

print("\nClustering:")
cluster_count = summary_df['cluster'].nunique()
print(f"  Unique clusters: {cluster_count}")
avg_similarity = summary_df['lookalike_similarity'].mean()
print(f"  Avg lookalike similarity: {avg_similarity:.3f}")

print("\nVisual Metaphors:")
has_metaphor = summary_df['metaphor_count'] > 0
print(f"  Memes with metaphors: {has_metaphor.sum()}")
avg_shopping = summary_df['shopping_correlation'].mean()
print(f"  Avg shopping correlation: {avg_shopping:.3f}")