In [11]:
import pandas as pd
import altair as alt

In [12]:
# read in ACLED data

regions = {
    "africa_acled": "data/Africa_aggregated_data_up_to-2025-10-18.csv",
    "asia_pacific_acled": "data/Asia-Pacific_aggregated_data_up_to-2025-10-18.csv",
    "europe_acled": "data/Europe-Central-Asia_aggregated_data_up_to-2025-10-18.csv",
    "latin_america_acled": "data/Latin-America-the-Caribbean_aggregated_data_up_to-2025-10-18.csv",
    "middle_east_acled": "data/Middle-East_aggregated_data_up_to-2025-10-18.csv",
    "us_canada_acled": "data/US-and-Canada_aggregated_data_up_to-2025-10-18.csv"
}

# combine datasets
dfs = {}
for region_name, file_path in regions.items():
    dfs[region_name] = pd.read_csv(file_path)
    print(f"Loaded {region_name}: {len(dfs[region_name])} rows")

# Combine into single dataframe
acled = pd.concat(dfs.values(), ignore_index=True)
print(f"Combined dataset: {len(acled)} total rows")

Loaded africa_acled: 255889 rows
Loaded asia_pacific_acled: 198169 rows
Loaded europe_acled: 110481 rows
Loaded latin_america_acled: 161259 rows
Loaded middle_east_acled: 137579 rows
Loaded us_canada_acled: 20839 rows
Combined dataset: 884216 total rows


## Tier 3 - Hashtags

### I. Country-specific Hashtags

#### 1) Palestine

In [39]:
# PALESTINE HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("PALESTINE HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

palestine_acled = acled[
    (acled['COUNTRY'] == 'Palestine') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

palestine_acled['WEEK'] = pd.to_datetime(palestine_acled['WEEK'])

palestine_acled['month'] = palestine_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = palestine_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
palestine_hashtag_files = {
    '#FreePalestine': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FreePalestine.csv',
    '#Gaza': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Gaza.csv',
    '#GazaCeasefire': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_GazaCeasefire.csv',
    '#FreeGaza': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FreeGaza.csv'
}

print("\nNOTE: Hashtag removed for GazaCeasefire due to no data.")

trends_data = {}
for name, filepath in palestine_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Palestine Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Palestine-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('palestine_hashtags_acled_vs_trends.html')
print(f"✓ Saved: palestine_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Palestine Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"palestine_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Palestine Hashtags analysis complete!")

PALESTINE HASHTAGS ANALYSIS
✓ ACLED Data: 118 months
  Date range: 2015-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 27,743
  Total fatalities: 22,882

NOTE: Hashtag removed for GazaCeasefire due to no data.
  ✓ Loaded: #FreePalestine            - 70 months, max=100
  ✓ Loaded: #Gaza                     - 70 months, max=100
  ✓ Loaded: #GazaCeasefire            - 70 months, max=100
  ✓ Loaded: #FreeGaza                 - 70 months, max=100

✓ Merged dataset: 118 months with 4 search terms

CORRELATION ANALYSIS

   Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
         #Gaza        0.625221            0.831157           69
     #FreeGaza        0.413921            0.685147           69
#FreePalestine        0.411616            0.425717           69
#GazaCeasefire       -0.054704           -0.044126           69

TIME-LAG ANALYSIS

#Gaza:
  Lag -3 months (searches LEAD  ): correlation = +0.630
  Lag -2 months (searches LEAD  ): correlation = +0.630
  Lag -1 months

#### 2) Ukraine/Russia

In [42]:
# UKRAINE-RUSSIA ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("UKRAINE-RUSSIA HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

ukraineRussia_acled = acled[
    ((acled['COUNTRY'] == 'Ukraine') | (acled['COUNTRY'] == 'Russia')) & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

ukraineRussia_acled['WEEK'] = pd.to_datetime(ukraineRussia_acled['WEEK'])

ukraineRussia_acled['month'] = ukraineRussia_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = ukraineRussia_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
ukraineRussia_hashtag_files = {
    '#UkraineWar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_UkraineWar.csv',
    '#StandWithUkraine': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StandWithUkraine.csv',
    '#SlavaUkraine': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_SlavaUkraine.csv',
    '#RussiaUkraineWar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_RussiaUkraineWar.csv',
    '#StopPutin': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StopPutin.csv'
}

trends_data = {}
for name, filepath in ukraineRussia_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Ukraine & Russia: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Ukraine & Russia-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('ukraineRussia_hashtags_acled_vs_trends.html')
print(f"✓ Saved: ukraineRussia_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Ukraine-Russia Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"ukraineRussia_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Ukraine-Russia hashtags analysis complete!")

UKRAINE-RUSSIA HASHTAGS ANALYSIS
✓ ACLED Data: 94 months
  Date range: 2017-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 106,598
  Total fatalities: 74,494
  ✓ Loaded: #UkraineWar               - 70 months, max=100
  ✓ Loaded: #StandWithUkraine         - 70 months, max=100
  ✓ Loaded: #SlavaUkraine             - 70 months, max=100
  ✓ Loaded: #RussiaUkraineWar         - 70 months, max=100
  ✓ Loaded: #StopPutin                - 70 months, max=100

✓ Merged dataset: 94 months with 5 search terms

CORRELATION ANALYSIS

      Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#StandWithUkraine        0.032669            0.011247           69
#RussiaUkraineWar        0.019567            0.005528           69
      #UkraineWar        0.014349            0.024689           69
    #SlavaUkraine       -0.075668           -0.065052           69
       #StopPutin       -0.099928           -0.056665           69

TIME-LAG ANALYSIS

#StandWithUkraine:
  Lag -3 months (searches L

#### 3) Syria

In [43]:
# SYRIA HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("SYRIA HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

syria_acled = acled[
    (acled['COUNTRY'] == 'Syria') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

syria_acled['WEEK'] = pd.to_datetime(syria_acled['WEEK'])

syria_acled['month'] = syria_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = syria_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
syria_hashtag_files = {
    '#SyriaWar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_SyriaWar.csv',
    '#Syria': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Syria.csv'
}

trends_data = {}
for name, filepath in syria_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Syria Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Syria-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('syria_hashtags_acled_vs_trends.html')
print(f"✓ Saved: syria_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Syria Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"syria_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Syria hashtags analysis complete!")

SYRIA HASHTAGS ANALYSIS
✓ ACLED Data: 106 months
  Date range: 2016-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 50,064
  Total fatalities: 46,791
  ✓ Loaded: #SyriaWar                 - 70 months, max=100
  ✓ Loaded: #Syria                    - 70 months, max=100

✓ Merged dataset: 106 months with 2 search terms

CORRELATION ANALYSIS

Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
     #Syria        0.139592            0.333633           69
  #SyriaWar       -0.000538           -0.070622           69

TIME-LAG ANALYSIS

#Syria:
  Lag -3 months (searches LEAD  ): correlation = -0.091
  Lag -2 months (searches LEAD  ): correlation = -0.080
  Lag -1 months (searches LEAD  ): correlation = +0.136
  Lag +0 months (CONCURRENT     ): correlation = +0.140
  Lag +1 months (searches LAG   ): correlation = +0.136
  Lag +2 months (searches LAG   ): correlation = -0.080
  Lag +3 months (searches LAG   ): correlation = -0.091

  → Best correlation at lag +0: -999.000 (CONCURR

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


#### 4) Turkey

In [45]:
# TURKEY HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("TURKEY HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

turkey_acled = acled[
    (acled['COUNTRY'] == 'Turkey') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

turkey_acled['WEEK'] = pd.to_datetime(turkey_acled['WEEK'])

turkey_acled['month'] = turkey_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = turkey_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
turkey_hashtag_files = {
    '#TurkishWomen': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_TurkishWomen.csv'
}

trends_data = {}
for name, filepath in turkey_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Turkey Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Turkey-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('turkey_hashtags_acled_vs_trends.html')
print(f"✓ Saved: turkey_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Turkey Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"turkey_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Turkey hashtags analysis complete!")

TURKEY HASHTAGS ANALYSIS
✓ ACLED Data: 118 months
  Date range: 2015-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 16,115
  Total fatalities: 3,109
  ✓ Loaded: #TurkishWomen             - 70 months, max=100

✓ Merged dataset: 118 months with 1 search terms

CORRELATION ANALYSIS

  Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#TurkishWomen       -0.054259            -0.12587           69

TIME-LAG ANALYSIS

#TurkishWomen:
  Lag -3 months (searches LEAD  ): correlation = -0.061
  Lag -2 months (searches LEAD  ): correlation = -0.055
  Lag -1 months (searches LEAD  ): correlation = -0.058
  Lag +0 months (CONCURRENT     ): correlation = -0.054
  Lag +1 months (searches LAG   ): correlation = -0.058
  Lag +2 months (searches LAG   ): correlation = -0.055
  Lag +3 months (searches LAG   ): correlation = -0.061

  → Best correlation at lag +0: -999.000 (CONCURRENT (searches match events))

KEY PERIODS

Top 5 Event Spikes:

July 2016:
  ACLED Events: 736
  ACLED Fatali

#### 5) Yemen

In [47]:
# YEMEN HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("YEMEN HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

yemen_acled = acled[
    (acled['COUNTRY'] == 'Yemen') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

yemen_acled['WEEK'] = pd.to_datetime(yemen_acled['WEEK'])

yemen_acled['month'] = yemen_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = yemen_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
yemen_hashtag_files = {
    '#TalkAboutYemen': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_TurkishWomen.csv',
    '#YemenPeace': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_YemenPeace.csv'
}

print("\nNOTE: Hashtag removed for TalkAboutYemen due to no data.")

trends_data = {}
for name, filepath in yemen_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Yemen Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Yemen-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('yemen_hashtags_acled_vs_trends.html')
print(f"✓ Saved: yemen_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Yemen Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"yemen_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Yemen hashtags analysis complete!")

YEMEN HASHTAGS ANALYSIS
✓ ACLED Data: 130 months
  Date range: 2014-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 36,708
  Total fatalities: 55,897

NOTE: Hashtag removed for TalkAboutYemen due to no data.
  ✓ Loaded: #TalkAboutYemen           - 70 months, max=100
  ✓ Loaded: #YemenPeace               - 70 months, max=100

✓ Merged dataset: 130 months with 2 search terms

CORRELATION ANALYSIS

    Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
    #YemenPeace        0.134008           -0.007178           69
#TalkAboutYemen       -0.056758           -0.116162           69

TIME-LAG ANALYSIS

#YemenPeace:
  Lag -3 months (searches LEAD  ): correlation = +0.148
  Lag -2 months (searches LEAD  ): correlation = +0.140
  Lag -1 months (searches LEAD  ): correlation = +0.134
  Lag +0 months (CONCURRENT     ): correlation = +0.134
  Lag +1 months (searches LAG   ): correlation = +0.134
  Lag +2 months (searches LAG   ): correlation = +0.140
  Lag +3 months (searches LAG  

#### 6) Myanmar

In [48]:
# MYANMAR HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("MYANMAR HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

myanmar_acled = acled[
    (acled['COUNTRY'] == 'Myanmar') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

myanmar_acled['WEEK'] = pd.to_datetime(myanmar_acled['WEEK'])

myanmar_acled['month'] = myanmar_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = myanmar_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
myanmar_hashtag_files = {
    '#WhatsHappeningInMyanmar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_WhatsHappeningInMyanmar.csv',
    '#MilkTeaAlliance': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_MilkTeaAlliance.csv',
    '#POSCO_StopSupportingSAC': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_POSCOStopSupportingSAC.csv'
}

print("\nNOTE: Removed hashtag for Posco_StopSupportingSAC due to no data.")

trends_data = {}
for name, filepath in myanmar_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Myanmar Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Myanmar-specific hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('myanmar_hashtags_acled_vs_trends.html')
print(f"✓ Saved: myanmar_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Myanmar Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"myanmar_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Myanmar hashtags analysis complete!")

MYANMAR HASHTAGS ANALYSIS
✓ ACLED Data: 187 months
  Date range: 2009-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 31,662
  Total fatalities: 32,707

NOTE: Removed hashtag for Posco_StopSupportingSAC due to no data.
  ✓ Loaded: #WhatsHappeningInMyanmar  - 70 months, max=100
  ✓ Loaded: #MilkTeaAlliance          - 70 months, max=100
  ✓ Loaded: #POSCO_StopSupportingSAC  - 70 months, max=100

✓ Merged dataset: 187 months with 3 search terms

CORRELATION ANALYSIS

             Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#POSCO_StopSupportingSAC        0.284284            0.044731           69
#WhatsHappeningInMyanmar        0.137849            0.162852           69
        #MilkTeaAlliance       -0.246314           -0.273124           69

TIME-LAG ANALYSIS

#POSCO_StopSupportingSAC:
  Lag -3 months (searches LEAD  ): correlation = +0.291
  Lag -2 months (searches LEAD  ): correlation = +0.290
  Lag -1 months (searches LEAD  ): correlation = +0.286
  Lag +0 months

#### 7) Afghanistan

In [49]:
# AFGHANISTAN HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("AFGHANISTAN HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

afghanistan_acled = acled[
    (acled['COUNTRY'] == 'Afghanistan') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

afghanistan_acled['WEEK'] = pd.to_datetime(afghanistan_acled['WEEK'])

afghanistan_acled['month'] = afghanistan_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = afghanistan_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
afghanistan_hashtag_files = {
    '#Afghanistan': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Afghanistan.csv',
    '#Taliban': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Taliban.csv'
}

trends_data = {}
for name, filepath in afghanistan_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Afghanistan Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Afghanistan-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('afghanistan_hashtags_acled_vs_trends.html')
print(f"✓ Saved: afghanistan_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Afghanistan Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"afghanistan_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Afghanistan hashtags analysis complete!")

AFGHANISTAN HASHTAGS ANALYSIS
✓ ACLED Data: 106 months
  Date range: 2016-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 23,895
  Total fatalities: 70,176
  ✓ Loaded: #Afghanistan              - 70 months, max=100
  ✓ Loaded: #Taliban                  - 70 months, max=100.0

✓ Merged dataset: 106 months with 2 search terms

CORRELATION ANALYSIS

 Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
    #Taliban       -0.043329           -0.003701           69
#Afghanistan       -0.053181            0.013884           69

TIME-LAG ANALYSIS

#Taliban:
  Lag -3 months (searches LEAD  ): correlation = -0.042
  Lag -2 months (searches LEAD  ): correlation = -0.043
  Lag -1 months (searches LEAD  ): correlation = -0.043
  Lag +0 months (CONCURRENT     ): correlation = -0.043
  Lag +1 months (searches LAG   ): correlation = -0.043
  Lag +2 months (searches LAG   ): correlation = -0.043
  Lag +3 months (searches LAG   ): correlation = -0.042

  → Best correlation at lag +0: -999

#### 8) Iraq

In [51]:

# IRAQ HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("IRAQ HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

iraq_acled = acled[
    (acled['COUNTRY'] == 'Iraq') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

iraq_acled['WEEK'] = pd.to_datetime(iraq_acled['WEEK'])

iraq_acled['month'] = iraq_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = iraq_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
iraq_hashtag_files = {
    '#Iraq': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Iraq.csv',
    '#IraqCeasefire': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_IraqCeasefire.csv'
}


trends_data = {}
for name, filepath in iraq_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Iraq Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Iraq-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('iraq_hashtags_acled_vs_trends.html')
print(f"✓ Saved: iraq_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Iraq Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"iraq_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Iraq hashtags analysis complete!")

IRAQ HASHTAGS ANALYSIS
✓ ACLED Data: 118 months
  Date range: 2015-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 22,556
  Total fatalities: 36,368
  ✓ Loaded: #Iraq                     - 70 months, max=100
  ✓ Loaded: #IraqCeasefire            - 70 months, max=100

✓ Merged dataset: 118 months with 2 search terms

CORRELATION ANALYSIS

   Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
         #Iraq       -0.187089           -0.043383           69
#IraqCeasefire             NaN                 NaN           69

TIME-LAG ANALYSIS

#Iraq:
  Lag -3 months (searches LEAD  ): correlation = -0.141
  Lag -2 months (searches LEAD  ): correlation = -0.161
  Lag -1 months (searches LEAD  ): correlation = -0.174
  Lag +0 months (CONCURRENT     ): correlation = -0.187
  Lag +1 months (searches LAG   ): correlation = -0.174
  Lag +2 months (searches LAG   ): correlation = -0.161
  Lag +3 months (searches LAG   ): correlation = -0.141

  → Best correlation at lag +0: -999.000 (

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


#### 9) Somalia

In [52]:
# SOMALIA HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("SOMALIA HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

somalia_acled = acled[
    (acled['COUNTRY'] == 'Somalia') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

somalia_acled['WEEK'] = pd.to_datetime(somalia_acled['WEEK'])

somalia_acled['month'] = somalia_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = somalia_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
somalia_hashtag_files = {
    '#Somaliland': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Somaliland.csv',
    '#SomaliaHumanRights': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_SomaliaHumanRights.csv'
}

trends_data = {}
for name, filepath in somalia_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Somalia Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Somalia-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('somalia_hashtags_acled_vs_trends.html')
print(f"✓ Saved: somalia_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Somalia Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"somalia_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Somalia hashtags analysis complete!")

SOMALIA HASHTAGS ANALYSIS
✓ ACLED Data: 312 months
  Date range: 1997-03-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 17,889
  Total fatalities: 29,221
  ✓ Loaded: #Somaliland               - 70 months, max=100
  ✓ Loaded: #SomaliaHumanRights       - 70 months, max=100

✓ Merged dataset: 312 months with 2 search terms

CORRELATION ANALYSIS

        Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#SomaliaHumanRights        0.009107           -0.065277           69
        #Somaliland       -0.007841           -0.150355           69

TIME-LAG ANALYSIS

#SomaliaHumanRights:
  Lag -3 months (searches LEAD  ): correlation = +0.019
  Lag -2 months (searches LEAD  ): correlation = +0.016
  Lag -1 months (searches LEAD  ): correlation = +0.008
  Lag +0 months (CONCURRENT     ): correlation = +0.009
  Lag +1 months (searches LAG   ): correlation = +0.008
  Lag +2 months (searches LAG   ): correlation = +0.016
  Lag +3 months (searches LAG   ): correlation = +0.019

  → Best c

#### 10) India/Pakistan

In [54]:
# INDIA-PAKISTAN HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("INDIA-PAKISTAN HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

indiaPakistan_acled = acled[
    ((acled['COUNTRY'] == 'India') | (acled['COUNTRY'] == 'Pakistan')) & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

indiaPakistan_acled['WEEK'] = pd.to_datetime(indiaPakistan_acled['WEEK'])

indiaPakistan_acled['month'] = indiaPakistan_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = indiaPakistan_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
indiaPakistan_hashtag_files = {
    '#IndiaPakistan': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_IndiaPakistan.csv',
    '#LiberateIndia': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_LiberateIndia.csv',
    '#Pakistan': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Pakistan.csv',
    '#PakistanZindabad': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_PakistanZindabad.csv'
}

trends_data = {}
for name, filepath in indiaPakistan_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'India & Pakistan Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining India & Pakistan-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('indiaPakistan_hashtags_acled_vs_trends.html')
print(f"✓ Saved: indiaPakistan_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'India-Pakistan Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"indiaPakistan_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ India-Pakistan hashtags analysis complete!")

INDIA-PAKISTAN HASHTAGS ANALYSIS
✓ ACLED Data: 190 months
  Date range: 2009-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 102,060
  Total fatalities: 23,132
  ✓ Loaded: #IndiaPakistan            - 70 months, max=100
  ✓ Loaded: #LiberateIndia            - 70 months, max=100
  ✓ Loaded: #Pakistan                 - 70 months, max=100
  ✓ Loaded: #PakistanZindabad         - 70 months, max=100

✓ Merged dataset: 190 months with 4 search terms

CORRELATION ANALYSIS

      Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
   #LiberateIndia        0.128510           -0.046084           69
#PakistanZindabad        0.109530            0.290239           69
   #IndiaPakistan        0.087810            0.355312           69
        #Pakistan       -0.125019           -0.148872           69

TIME-LAG ANALYSIS

#LiberateIndia:
  Lag -3 months (searches LEAD  ): correlation = +0.129
  Lag -2 months (searches LEAD  ): correlation = +0.130
  Lag -1 months (searches LEAD  ): correla

#### 11) United States [NEED NOTE]

In [55]:
# UNITED STATES HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("UNITED STATES HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

us_acled = acled[
    (acled['COUNTRY'] == 'United States') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

us_acled['WEEK'] = pd.to_datetime(us_acled['WEEK'])

us_acled['month'] = us_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = us_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
us_hashtag_files = {
    '#StopGunViolence': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StopGunViolence.csv',
    '#BlackLivesMatter': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_BlackLivesMatter.csv'
}

print("\nNOTE: Hashtag removed for StopGunViolence due to no data.")

trends_data = {}
for name, filepath in us_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'United States Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining United States-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('us_hashtags_acled_vs_trends.html')
print(f"✓ Saved: us_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'United States Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"us_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ United States hashtags analysis complete!")

UNITED STATES HASHTAGS ANALYSIS
✓ ACLED Data: 70 months
  Date range: 2019-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 30,592
  Total fatalities: 211

NOTE: Hashtag removed for StopGunViolence due to no data.
  ✓ Loaded: #StopGunViolence          - 70 months, max=100
  ✓ Loaded: #BlackLivesMatter         - 70 months, max=100

✓ Merged dataset: 70 months with 2 search terms

CORRELATION ANALYSIS

      Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#BlackLivesMatter        0.234386            0.115816           69
 #StopGunViolence       -0.021451            0.373328           69

TIME-LAG ANALYSIS

#BlackLivesMatter:
  Lag -3 months (searches LEAD  ): correlation = +0.232
  Lag -2 months (searches LEAD  ): correlation = +0.234
  Lag -1 months (searches LEAD  ): correlation = +0.232
  Lag +0 months (CONCURRENT     ): correlation = +0.234
  Lag +1 months (searches LAG   ): correlation = +0.232
  Lag +2 months (searches LAG   ): correlation = +0.234
  Lag +3 months

#### 12) Mexico

In [56]:
# MEXICO HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("MEXICO HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

mexico_acled = acled[
    (acled['COUNTRY'] == 'Mexico') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

mexico_acled['WEEK'] = pd.to_datetime(mexico_acled['WEEK'])

mexico_acled['month'] = mexico_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = mexico_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
mexico_hashtag_files = {
    '#StandWithMexico': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StandWithMexico.csv',
    '#HopeForMexico': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_HopeForMexico.csv'
}

trends_data = {}
for name, filepath in mexico_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Mexico Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Mexico-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('mexico_hashtags_acled_vs_trends.html')
print(f"✓ Saved: mexico_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Mexico Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"mexico_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Mexico hashtags analysis complete!")

MEXICO HASHTAGS ANALYSIS
✓ ACLED Data: 94 months
  Date range: 2017-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 37,588
  Total fatalities: 22,233
  ✓ Loaded: #StandWithMexico          - 70 months, max=100
  ✓ Loaded: #HopeForMexico            - 70 months, max=100

✓ Merged dataset: 94 months with 2 search terms

CORRELATION ANALYSIS

     Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#StandWithMexico        0.146900            0.212126           69
  #HopeForMexico        0.046268           -0.091090           69

TIME-LAG ANALYSIS

#StandWithMexico:
  Lag -3 months (searches LEAD  ): correlation = +0.153
  Lag -2 months (searches LEAD  ): correlation = +0.154
  Lag -1 months (searches LEAD  ): correlation = +0.145
  Lag +0 months (CONCURRENT     ): correlation = +0.147
  Lag +1 months (searches LAG   ): correlation = +0.145
  Lag +2 months (searches LAG   ): correlation = +0.154
  Lag +3 months (searches LAG   ): correlation = +0.153

  → Best correlation at l

#### 13) Brazil

In [57]:
# BRAZIL HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("BRAZIL HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

brazil_acled = acled[
    (acled['COUNTRY'] == 'Brazil') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

brazil_acled['WEEK'] = pd.to_datetime(brazil_acled['WEEK'])

brazil_acled['month'] = brazil_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = brazil_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
brazil_hashtag_files = {
    '#SaveBrazil': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_SaveBrazil.csv',
    '#RioCrisis': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_RioCrisis.csv'
}

trends_data = {}
for name, filepath in brazil_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Brazil Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Brazil-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('brazil_hashtags_acled_vs_trends.html')
print(f"✓ Saved: brazil_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Brazil Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"brazil_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Brazil hashtags analysis complete!")

BRAZIL HASHTAGS ANALYSIS
✓ ACLED Data: 94 months
  Date range: 2017-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 35,287
  Total fatalities: 18,930
  ✓ Loaded: #SaveBrazil               - 70 months, max=100
  ✓ Loaded: #RioCrisis                - 70 months, max=100

✓ Merged dataset: 94 months with 2 search terms

CORRELATION ANALYSIS

Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
 #RioCrisis        0.009725           -0.024521           69
#SaveBrazil       -0.044267           -0.072561           69

TIME-LAG ANALYSIS

#RioCrisis:
  Lag -3 months (searches LEAD  ): correlation = +0.009
  Lag -2 months (searches LEAD  ): correlation = +0.009
  Lag -1 months (searches LEAD  ): correlation = +0.008
  Lag +0 months (CONCURRENT     ): correlation = +0.010
  Lag +1 months (searches LAG   ): correlation = +0.008
  Lag +2 months (searches LAG   ): correlation = +0.009
  Lag +3 months (searches LAG   ): correlation = +0.009

  → Best correlation at lag +0: -999.000 (CONC

### II. Thematic Hashtags

#### 1) Human Rights & Protests

In [59]:
# HUMAN RIGHTS & PROTESTS HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("HUMAN RIGHTS & PROTESTS HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

acled_copy = acled.copy()

acled_copy['WEEK'] = pd.to_datetime(acled_copy['WEEK'])

acled_copy['month'] = acled_copy['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = acled_copy.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
human_rights_hashtag_files = {
    '#NeverAgain': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_NeverAgain.csv',
    '#Democracy': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Democracy.csv',
    '#FreeSpeech': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FreeSpeech.csv',
    '#HumanRights': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_HumanRights.csv',
    '#FreePress': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FreePress.csv',
    '#YouthForDemocracy': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_YouthForDemocracy.csv',
    '#Protest': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Protest.csv',
    '#DemocracyForAll': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_DemocracyForAll.csv',
    '#NeverForget': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_NeverForget.csv',
    '#Equality': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Equality.csv',
    '#Justice': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Justice.csv',
    '#Freedom': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Freedom.csv',
    '#Change': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Change.csv',
    '#FridaysForFuture': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FridaysForFuture.csv',
    '#PeopleNotProfit': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_PeopleNotProfit.csv'
}

print("\nNOTE: Hashtags removed for FreePress and PeopleNotProfit due to no data.")

trends_data = {}
for name, filepath in human_rights_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(5)['Search Term'].tolist() # changed to top 5

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Human Rights & Protests Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Human Rights & Protests-related Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('humanRights_hashtags_acled_vs_trends.html')
print(f"✓ Saved: humanRights_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Human Rights & Protests Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"humanRights_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Human Rights & Protests hashtags analysis complete!")

HUMAN RIGHTS & PROTESTS HASHTAGS ANALYSIS
✓ ACLED Data: 347 months
  Date range: 1996-12-01 00:00:00 to 2025-10-01 00:00:00
  Total events: 2,772,023
  Total fatalities: 2,399,712

NOTE: Hashtags removed for FreePress and PeopleNotProfit due to no data.
  ✓ Loaded: #NeverAgain               - 70 months, max=100
  ✓ Loaded: #Democracy                - 70 months, max=100
  ✓ Loaded: #FreeSpeech               - 70 months, max=100
  ✓ Loaded: #HumanRights              - 70 months, max=100
  ✓ Loaded: #FreePress                - 70 months, max=100
  ✓ Loaded: #YouthForDemocracy        - 70 months, max=100
  ✓ Loaded: #Protest                  - 70 months, max=100
  ✓ Loaded: #DemocracyForAll          - 70 months, max=100
  ✓ Loaded: #NeverForget              - 70 months, max=100
  ✓ Loaded: #Equality                 - 70 months, max=100
  ✓ Loaded: #Justice                  - 70 months, max=100
  ✓ Loaded: #Freedom                  - 70 months, max=100
  ✓ Loaded: #Change                   

#### 2) Gender & Social Justice

In [62]:
# GENDER & SOCIAL JUSTICE HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("GENDER & SOCIAL JUSTICE HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

acled_copy = acled.copy()

acled_copy['WEEK'] = pd.to_datetime(acled_copy['WEEK'])

acled_copy['month'] = acled_copy['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = acled_copy.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
genderSocialJustice_hashtag_files = {
    '#MeToo': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_MeToo.csv',
    '#StopFundingHate': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StopFundingHate.csv',
    '#WomensMarch': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_WomensMarch.csv',
    '#TimesUp': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_TimesUp.csv',
    '#GenerationEquality': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_GenerationEquality.csv',
    '#MyBodyMyChoice': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_MyBodyMyChoice.csv',
    '#ProChoice': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_ProChoice.csv',
    '#LoveIsLove': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_LoveIsLove.csv',
    '#TransRights': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_TransRights.csv',
    '#EqualityForAll': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_EqualityForAll.csv',
    '#TransRightsAreHumanRights': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_TransRightsAreHumanRights.csv',
    '#EndSexualViolence': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_EndSexualViolence.csv',
    '#WomensRights': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_WomensRights.csv',
    '#LGBTQ+': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_LGBTQ+.csv',
    '#GayPride': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_GayPride.csv',
    '#ImmigrantRights': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_ImmigrantRights.csv'
}

print("\nNOTE: Hashtags removed & spaces added for StopFundingHate, GenerationEquality, and TransRightsAreHumanRights due to no data.")

trends_data = {}
for name, filepath in genderSocialJustice_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(5)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Gender & Social Justice Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Gener & Social Justice-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('genderSocialJustice_hashtags_acled_vs_trends.html')
print(f"✓ Saved: genderSocialJustice_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Gender & Social Justice Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"genderSocialJustice_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Gender & Social Justice hashtags analysis complete!")

GENDER & SOCIAL JUSTICE HASHTAGS ANALYSIS
✓ ACLED Data: 347 months
  Date range: 1996-12-01 00:00:00 to 2025-10-01 00:00:00
  Total events: 2,772,023
  Total fatalities: 2,399,712

NOTE: Hashtags removed & spaces added for StopFundingHate, GenerationEquality, and TransRightsAreHumanRights due to no data.
  ✓ Loaded: #MeToo                    - 70 months, max=100
  ✓ Loaded: #StopFundingHate          - 70 months, max=100
  ✓ Loaded: #WomensMarch              - 70 months, max=100
  ✓ Loaded: #TimesUp                  - 70 months, max=100
  ✓ Loaded: #GenerationEquality       - 70 months, max=100
  ✓ Loaded: #MyBodyMyChoice           - 70 months, max=100
  ✓ Loaded: #ProChoice                - 70 months, max=100
  ✓ Loaded: #LoveIsLove               - 70 months, max=100
  ✓ Loaded: #TransRights              - 70 months, max=100
  ✓ Loaded: #EqualityForAll           - 70 months, max=100
  ✓ Loaded: #TransRightsAreHumanRights - 70 months, max=100
  ✓ Loaded: #EndSexualViolence        - 70 m

#### 3) Conflict & Peace

In [64]:
# COFLICT & PEACE HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("CONFLICT & PEACE HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

acled_copy = acled.copy()

acled_copy['WEEK'] = pd.to_datetime(acled_copy['WEEK'])

acled_copy['month'] = acled_copy['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = acled_copy.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
conflictPeace_hashtag_files = {
    '#NoWarCrimes': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_NoWarCrimes.csv',
    '#Peace': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Peace.csv',
    '#NoWar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_NoWar.csv',
    '#StopWar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StopWar.csv',
    '#StopTheWar': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StopTheWar.csv',
    '#HumanatarianCrisis': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_HumanatarianCrisis.csv',
    '#RefugeeRelief': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_RefugeeRelief.csv',
    '#PeaceForAll': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_PeaceForAll.csv',
    '#Solidarity': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Solidarity.csv',
    '#Ceasefire': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Ceasefire.csv',
    '#CeasefireNOW': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_CeasefireNOW.csv',
    '#StopGenocide': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StopGenocide.csv',
    '#Aid': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_Aid.csv',
    '#StandWithPeace': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_StandWithPeace.csv',
    '#SaveHumanity': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_SaveHumanity.csv',
    '#EndViolence': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_EndViolence.csv'
}

print("\nNOTE: Hashtags removed & spaces added for RefugeeRelief, PeaceForAll, and StandWithPeace due to no data.")

trends_data = {}
for name, filepath in conflictPeace_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(5)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Conflict & Peace Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Conflict & Peace-specific Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('conflictPeace_hashtags_acled_vs_trends.html')
print(f"✓ Saved: conflictPeace_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Conflict & Peace Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"conflictPeace_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Conflict & Peace hashtags analysis complete!")

CONFLICT & PEACE HASHTAGS ANALYSIS
✓ ACLED Data: 347 months
  Date range: 1996-12-01 00:00:00 to 2025-10-01 00:00:00
  Total events: 2,772,023
  Total fatalities: 2,399,712

NOTE: Hashtags removed & spaces added for RefugeeRelief, PeaceForAll, and StandWithPeace due to no data.
  ✓ Loaded: #NoWarCrimes              - 70 months, max=100
  ✓ Loaded: #Peace                    - 70 months, max=100
  ✓ Loaded: #NoWar                    - 70 months, max=100
  ✓ Loaded: #StopWar                  - 70 months, max=100
  ✓ Loaded: #StopTheWar               - 70 months, max=100
  ✓ Loaded: #HumanatarianCrisis       - 70 months, max=100
  ✓ Loaded: #RefugeeRelief            - 70 months, max=100
  ✓ Loaded: #PeaceForAll              - 70 months, max=100
  ✓ Loaded: #Solidarity               - 70 months, max=100
  ✓ Loaded: #Ceasefire                - 70 months, max=100
  ✓ Loaded: #CeasefireNOW             - 70 months, max=100
  ✓ Loaded: #StopGenocide             - 70 months, max=100
  ✓ Loaded: #

### III. Regional Hashtags

#### 1) Middle East

In [65]:
# MIDDLE EAST HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("MIDDLE EAST HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

middleEast_acled = acled[
    (acled['REGION'] == 'Middle East') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

middleEast_acled['WEEK'] = pd.to_datetime(middleEast_acled['WEEK'])

middleEast_acled['month'] = middleEast_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = middleEast_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
middleEast_hashtag_files = {
    '#MiddleEastCrisis': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_MiddleEastCrisis.csv',
    '#MENA': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_MENA.csv',
    '#FreeMiddleEast': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FreeMiddleEast.csv'
}

print("\nNOTE: Hashtag removed for MENA due to no data.")

trends_data = {}
for name, filepath in middleEast_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Middle East Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Middle East-related Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('middleEast_hashtags_acled_vs_trends.html')
print(f"✓ Saved: middleEast_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Middle East Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"middleEast_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Middle East hashtags analysis complete!")

MIDDLE EAST HASHTAGS ANALYSIS
✓ ACLED Data: 130 months
  Date range: 2014-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 189,134
  Total fatalities: 171,435

NOTE: Hashtag removed for MENA due to no data.
  ✓ Loaded: #MiddleEastCrisis         - 70 months, max=100
  ✓ Loaded: #MENA                     - 70 months, max=100
  ✓ Loaded: #FreeMiddleEast           - 70 months, max=100

✓ Merged dataset: 130 months with 3 search terms

CORRELATION ANALYSIS

      Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
            #MENA        0.135069            0.062346           69
#MiddleEastCrisis        0.004776           -0.069052           69
  #FreeMiddleEast        0.004776           -0.069052           69

TIME-LAG ANALYSIS

#MENA:
  Lag -3 months (searches LEAD  ): correlation = +0.143
  Lag -2 months (searches LEAD  ): correlation = +0.142
  Lag -1 months (searches LEAD  ): correlation = +0.153
  Lag +0 months (CONCURRENT     ): correlation = +0.135
  Lag +1 months (se

#### 2) Europe

In [66]:
# EUROPE HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("EUROPE HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

europe_acled = acled[
    (acled['REGION'] == 'Europe') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

europe_acled['WEEK'] = pd.to_datetime(europe_acled['WEEK'])

europe_acled['month'] = europe_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = europe_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
europe_hashtag_files = {
    '#EuropeanSolidarity': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_EuropeanSolidarity.csv',
    '#EU': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_EU.csv'
}

trends_data = {}
for name, filepath in europe_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'Europe Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining Europe-related Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('europe_hashtags_acled_vs_trends.html')
print(f"✓ Saved: europe_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'Europe Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"europe_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ Europe hashtags analysis complete!")

EUROPE HASHTAGS ANALYSIS
✓ ACLED Data: 94 months
  Date range: 2017-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 176,858
  Total fatalities: 74,699
  ✓ Loaded: #EuropeanSolidarity       - 70 months, max=100
  ✓ Loaded: #EU                       - 70 months, max=100

✓ Merged dataset: 94 months with 2 search terms

CORRELATION ANALYSIS

        Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#EuropeanSolidarity       -0.132766            0.009089           69
                #EU       -0.290454           -0.381270           69

TIME-LAG ANALYSIS

#EuropeanSolidarity:
  Lag -3 months (searches LEAD  ): correlation = -0.141
  Lag -2 months (searches LEAD  ): correlation = -0.135
  Lag -1 months (searches LEAD  ): correlation = -0.135
  Lag +0 months (CONCURRENT     ): correlation = -0.133
  Lag +1 months (searches LAG   ): correlation = -0.135
  Lag +2 months (searches LAG   ): correlation = -0.135
  Lag +3 months (searches LAG   ): correlation = -0.141

  → Best cor

#### 3) South Asia

In [67]:
# SOUTH ASIA HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("SOUTH ASIA HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

southAsia_acled = acled[
    (acled['REGION'] == 'South Asia') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

southAsia_acled['WEEK'] = pd.to_datetime(southAsia_acled['WEEK'])

southAsia_acled['month'] = southAsia_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = southAsia_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
southAsia_hashtag_files = {
    '#HelpSouthAsia': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_HelpSouthAsia.csv',
    '#UnifySouthAsia': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_UnifySouthAsia.csv'
}

trends_data = {}
for name, filepath in southAsia_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'South Asia Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining South Asia-related Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('southAsia_hashtags_acled_vs_trends.html')
print(f"✓ Saved: southAsia_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'South Asia Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"southAsia_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ South Asia hashtags analysis complete!")

SOUTH ASIA HASHTAGS ANALYSIS
✓ ACLED Data: 190 months
  Date range: 2009-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 124,599
  Total fatalities: 25,752
  ✓ Loaded: #HelpSouthAsia            - 70 months, max=100
  ✓ Loaded: #UnifySouthAsia           - 70 months, max=100

✓ Merged dataset: 190 months with 2 search terms

CORRELATION ANALYSIS

    Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#UnifySouthAsia        0.242820            0.325104           69
 #HelpSouthAsia       -0.160525           -0.093602           69

TIME-LAG ANALYSIS

#UnifySouthAsia:
  Lag -3 months (searches LEAD  ): correlation = +0.250
  Lag -2 months (searches LEAD  ): correlation = +0.249
  Lag -1 months (searches LEAD  ): correlation = +0.243
  Lag +0 months (CONCURRENT     ): correlation = +0.243
  Lag +1 months (searches LAG   ): correlation = +0.243
  Lag +2 months (searches LAG   ): correlation = +0.249
  Lag +3 months (searches LAG   ): correlation = +0.250

  → Best correlation a

#### 4) Latin America

In [32]:
# Skipped beacuse both hastags had no data so had to remove the '#'

#### 5) North America

In [68]:
# NORTH AMERICA HASHTAGS ANALYSIS: ACLED EVENTS vs GOOGLE TRENDS

print("="*80)
print("NORTH AMERICA HASHTAGS ANALYSIS")
print("="*80)

# 1. FILTER ACLED DATA
# ---------------------

northAmerica_acled = acled[
    (acled['REGION'] == 'North America') & 
    (acled['WEEK'] >= '2020-01-01')
].copy()

northAmerica_acled['WEEK'] = pd.to_datetime(northAmerica_acled['WEEK'])

northAmerica_acled['month'] = northAmerica_acled['WEEK'].dt.to_period('M').dt.to_timestamp()
monthly = northAmerica_acled.groupby('month').agg({
    'EVENTS': 'sum',
    'FATALITIES': 'sum'
}).reset_index()

print(f"✓ ACLED Data: {len(monthly)} months")
print(f"  Date range: {monthly['month'].min()} to {monthly['month'].max()}")
print(f"  Total events: {monthly['EVENTS'].sum():,}")
print(f"  Total fatalities: {monthly['FATALITIES'].sum():,}")


# 2. LOAD GOOGLE TRENDS FILES
# ----------------------------
northAmerica_hashtag_files = {
    '#SaveNorthAmerica': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_SaveNorthAmerica.csv',
    '#FreedomConvoy2022': 'data/google_trends_raw/TIER3_HASHTAGS/google_trends_FreedomConvoy2022.csv'
}

trends_data = {}
for name, filepath in northAmerica_hashtag_files.items():
    try:
        df = pd.read_csv(filepath, skiprows=1)
        df.columns = ['month', 'value']
        df['month'] = pd.to_datetime(df['month'])
        df['value'] = df['value'].replace('<1', '0.5')
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        trends_data[name] = df
        print(f"  ✓ Loaded: {name:25s} - {len(df)} months, max={df['value'].max()}")
    except Exception as e:
        print(f"    ✗ Error loading {name}: {e}")
        

# 3. MERGE DATASETS
# -----------------
merged = monthly.copy()
for name, df in trends_data.items():
    merged = merged.merge(
        df.rename(columns={'value': name}),
        on='month',
        how='left'
    )
print(f"\n✓ Merged dataset: {len(merged)} months with {len(trends_data)} search terms")


# 4. CORRELATION ANALYSIS
# -----------------------
print("\n" + "="*80)
print("CORRELATION ANALYSIS")
print("="*80)

correlations = []
for term in trends_data.keys():
    if term in merged.columns:
        valid_data = merged[['EVENTS', 'FATALITIES', term]].dropna()
        if len(valid_data) > 10:
            corr_events = valid_data['EVENTS'].corr(valid_data[term])
            corr_fatalities = valid_data['FATALITIES'].corr(valid_data[term])
            correlations.append({
                'Search Term': term,
                'Corr w/ Events': corr_events,
                'Corr w/ Fatalities': corr_fatalities,
                'Data Points': len(valid_data)
            })

corr_df = pd.DataFrame(correlations).sort_values('Corr w/ Events', ascending=False)
print("\n" + corr_df.to_string(index=False))

# 5. TIME-LAG ANALYSIS
# --------------------
print("\n" + "="*80)
print("TIME-LAG ANALYSIS")
print("="*80)

top_terms = corr_df.head(3)['Search Term'].tolist()

for term in top_terms:
    print(f"\n{term}:")
    valid_data = merged[['EVENTS', term]].dropna()
    best_corr = -999
    best_lag = 0
    
    for lag in range(-3, 4):
        if lag == 0:
            corr = valid_data['EVENTS'].corr(valid_data[term])
        elif lag > 0:
            if len(valid_data) > lag:
                corr = valid_data['EVENTS'].iloc[lag:].corr(valid_data[term].iloc[:-lag])
            else:
                corr = 0
        else:
            if len(valid_data) > abs(lag):
                corr = valid_data['EVENTS'].iloc[:lag].corr(valid_data[term].iloc[-lag:])
            else:
                corr = 0
        
        if abs(corr) > abs(best_corr):
            best_corr = corr
            best_lag = lag
        
        direction = "searches LAG" if lag > 0 else ("searches LEAD" if lag < 0 else "CONCURRENT")
        print(f"  Lag {lag:+2d} months ({direction:15s}): correlation = {corr:+.3f}")
    
    interpretation = "REACTIVE (searches follow events)" if best_lag > 0 else \
                    "PREDICTIVE (searches precede events)" if best_lag < 0 else \
                    "CONCURRENT (searches match events)"
    print(f"\n  → Best correlation at lag {best_lag:+d}: {best_corr:+.3f} ({interpretation})")

# 6. KEY PERIODS IDENTIFICATION
# -----------------------------
print("\n" + "="*80)
print("KEY PERIODS")
print("="*80)

print("\nTop 5 Event Spikes:")
top_spikes = merged.nlargest(5, 'EVENTS')[['month', 'EVENTS', 'FATALITIES'] + list(trends_data.keys())]
for idx, row in top_spikes.iterrows():
    print(f"\n{row['month'].strftime('%B %Y')}:")
    print(f"  ACLED Events: {row['EVENTS']:,}")
    print(f"  ACLED Fatalities: {row['FATALITIES']:,}")
    print(f"  Search Interest:")
    for term in trends_data.keys():
        if pd.notna(row[term]):
            print(f"    - {term:25s}: {row[term]:.0f}/100")

# 7. VISUALIZATION
# ---------------
print("\n" + "="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Normalize data
merged_normalized = merged.copy()
merged_normalized['EVENTS_norm'] = (merged['EVENTS'] / merged['EVENTS'].max()) * 100
merged_normalized['FATALITIES_norm'] = (merged['FATALITIES'] / merged['FATALITIES'].max()) * 100

# Reshape for Altair
plot_data = []
for _, row in merged_normalized.iterrows():
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Events',
        'value': row['EVENTS_norm'],
        'type': 'Conflict Data',
        'raw_value': row['EVENTS']
    })
    plot_data.append({
        'month': row['month'],
        'metric': 'ACLED Fatalities',
        'value': row['FATALITIES_norm'],
        'type': 'Conflict Data',
        'raw_value': row['FATALITIES']
    })
    for term in top_terms:
        if term in row and pd.notna(row[term]):
            plot_data.append({
                'month': row['month'],
                'metric': f'Search: {term}',
                'value': row[term],
                'type': 'Google Trends',
                'raw_value': row[term]
            })

plot_df = pd.DataFrame(plot_data)

# Main chart
chart = alt.Chart(plot_df).mark_line(strokeWidth=2.5, point=True).encode(
    x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45)),
    y=alt.Y('value:Q', title='Normalized Value (0-100)', scale=alt.Scale(domain=[0, 105])),
    color=alt.Color('metric:N', title='Metric', scale=alt.Scale(scheme='tableau10')),
    strokeDash=alt.StrokeDash('type:N', title='Data Type',
                               scale=alt.Scale(domain=['Conflict Data', 'Google Trends'],
                                             range=[[1,0], [5,3]])),
    tooltip=[
        alt.Tooltip('month:T', title='Month', format='%B %Y'),
        alt.Tooltip('metric:N', title='Metric'),
        alt.Tooltip('value:Q', title='Normalized', format='.1f'),
        alt.Tooltip('raw_value:Q', title='Raw Value', format=',.0f')
    ]
).properties(
    width=1400,
    height=450,
    title={
        'text': 'North America Hashtags: ACLED Events vs Google Search Interest (2020-2025)',
        'subtitle': 'Examining North America-related Hashtags',
        'fontSize': 18,
        'subtitleFontSize': 13
    }
).interactive()

chart.save('northAmerica_hashtags_acled_vs_trends.html')
print(f"✓ Saved: northAmerica_hashtags_acled_vs_trends.html")

# Display
chart

# 8. INDIVIDUAL COMPARISON CHARTS
# --------------------------------
for term in top_terms:
    term_data = merged[['month', 'EVENTS', 'FATALITIES', term]].dropna().copy()
    
    base = alt.Chart(term_data).encode(
        x=alt.X('month:T', title='Month', axis=alt.Axis(format='%b %Y', labelAngle=-45))
    )
    
    events_line = base.mark_line(color='steelblue', strokeWidth=3).encode(
        y=alt.Y('EVENTS:Q', title='ACLED Events', axis=alt.Axis(titleColor='steelblue')),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    trends_line = base.mark_line(color='red', strokeWidth=3).encode(
        y=alt.Y(f'{term}:Q', title=f'Google Trends: {term}',
                axis=alt.Axis(titleColor='red'), scale=alt.Scale(domain=[0, 100])),
        tooltip=[
            alt.Tooltip('month:T', title='Month', format='%B %Y'),
            alt.Tooltip('EVENTS:Q', title='Events', format=','),
            alt.Tooltip(f'{term}:Q', title='Search Interest', format='.0f')
        ]
    )
    
    term_chart = alt.layer(events_line, trends_line).resolve_scale(
        y='independent'
    ).properties(
        width=1200,
        height=400,
        title=f'northAmerica Hashtags: ACLED Events vs "{term}" Search Interest'
    ).interactive()
    
    filename = f"northAmerica_hashtags_{term.lower().replace(' ', '_')}_comparison.html"
    term_chart.save(filename)
    print(f"✓ Saved: {filename}")

print("\n✓ North America hashtags analysis complete!")

NORTH AMERICA HASHTAGS ANALYSIS
✓ ACLED Data: 94 months
  Date range: 2017-12-01 00:00:00 to 2025-09-01 00:00:00
  Total events: 71,250
  Total fatalities: 22,446
  ✓ Loaded: #SaveNorthAmerica         - 70 months, max=100
  ✓ Loaded: #FreedomConvoy2022        - 70 months, max=100

✓ Merged dataset: 94 months with 2 search terms

CORRELATION ANALYSIS

       Search Term  Corr w/ Events  Corr w/ Fatalities  Data Points
#FreedomConvoy2022        0.019829           -0.037146           69
 #SaveNorthAmerica       -0.100813           -0.137019           69

TIME-LAG ANALYSIS

#FreedomConvoy2022:
  Lag -3 months (searches LEAD  ): correlation = +0.016
  Lag -2 months (searches LEAD  ): correlation = +0.019
  Lag -1 months (searches LEAD  ): correlation = +0.017
  Lag +0 months (CONCURRENT     ): correlation = +0.020
  Lag +1 months (searches LAG   ): correlation = +0.017
  Lag +2 months (searches LAG   ): correlation = +0.019
  Lag +3 months (searches LAG   ): correlation = +0.016

  → Best c