In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import torch 

import sys
sys.path.append('../src/')

from models import IdealPointNN
from corpus import Corpus

from sklearn.preprocessing import normalize

# Load the two embedding tables from CSV.
xclip_df = pd.read_csv("../data/xclip_embeddings_temporal_from_square.csv")
text_df = pd.read_csv("../data/text_embeddings_bert_translated.csv")

# Embedding feature columns are recognised purely by their name prefix.
text_embed_cols = list(filter(lambda col: col.startswith("bert_dim_"), text_df.columns))
video_embed_cols = list(filter(lambda col: col.startswith("xclip_dim_"), xclip_df.columns))

# Robust ID normalization + automatic selection of the best-matching rule
def id_identity(s):
    """Return the IDs of Series ``s`` as strings, otherwise unchanged.

    Missing values stay missing instead of being turned into the literal
    string "nan" by ``astype(str)``; this lets a later ``dropna`` discard
    them and prevents two missing IDs from matching each other in a merge.
    """
    return s.astype(str).where(s.notna())
def id_lower(s):
    """Return the IDs of Series ``s`` as stripped, lower-cased strings.

    Missing values stay missing instead of being turned into the literal
    string "nan" by ``astype(str)``; this lets a later ``dropna`` discard
    them and prevents two missing IDs from matching each other in a merge.
    """
    return s.astype(str).str.strip().str.lower().where(s.notna())
def id_basename(s):
    """Normalize IDs with ``id_lower`` and keep only the last path component.

    Backslashes are first converted to forward slashes, then everything up
    to and including the last slash is removed.
    """
    lowered = id_lower(s)
    forward_slashed = lowered.str.replace(r"\\", "/", regex=True)
    return forward_slashed.str.replace(r"^.*/", "", regex=True)
def id_noext(s):
    """Normalize IDs with ``id_basename`` and strip one known trailing extension.

    Only the extensions listed in the pattern are removed, and only when
    they appear at the very end of the (already lower-cased) name.
    """
    trailing_extension = r"\.(mp4|mov|mkv|avi|webm|m4v|txt|csv)$"
    base_names = id_basename(s)
    return base_names.str.replace(trailing_extension, "", regex=True)

# (label, normalizer) pairs; the search loop below tries every pairing of
# these for the text-side and video-side ID columns, in this order.
candidates = list(
    {
        "identity": id_identity,
        "lower": id_lower,
        "basename": id_basename,
        "noext": id_noext,
    }.items()
)

def try_merge(bert_df, xclip_df, f_text, f_video):
    """Inner-join the text and video embedding tables on a normalized ID.

    Parameters
    ----------
    bert_df : pandas.DataFrame
        Text table; must contain a "filename" column and the columns listed
        in the notebook-level ``text_embed_cols``.
    xclip_df : pandas.DataFrame
        Video table; must contain a "video_id" column and the columns listed
        in the notebook-level ``video_embed_cols``.
    f_text, f_video : callable
        Map a Series of raw IDs to a Series of normalized IDs; applied to
        ``filename`` and ``video_id`` respectively.

    Returns
    -------
    (int, pandas.DataFrame)
        Number of matched rows and the merged frame (one row per matched
        normalized ID, stored in the "vid" column).
    """
    # Work on copies of the frames that were actually passed in (not the
    # notebook globals) so the caller's data is never mutated.
    text_side = bert_df.copy()
    video_side = xclip_df.copy()

    text_side["vid"] = f_text(text_side["filename"])
    video_side["vid"] = f_video(video_side["video_id"])

    # Rows without a usable ID cannot be matched.
    text_side = text_side.dropna(subset=["vid"])
    video_side = video_side.dropna(subset=["vid"])

    # Keep the first row per normalized ID on each side so the join is 1:1.
    merged = pd.merge(
        text_side[["vid", "filename"] + text_embed_cols].drop_duplicates("vid"),
        video_side[["vid", "video_id"] + video_embed_cols].drop_duplicates("vid"),
        on="vid",
        how="inner",
    )
    return merged.shape[0], merged

# Try every (text rule, video rule) pairing and keep the one that yields the
# most matched rows; on ties the earliest pairing wins.
best, best_name = None, None
for name_a, f_a in candidates:
    for name_b, f_b in candidates:
        inter, merged_candidate = try_merge(text_df, xclip_df, f_a, f_b)
        if best is not None and inter <= best[0]:
            continue
        best = (inter, merged_candidate)
        best_name = (name_a, name_b)

merged = best[1]
print(f"Best normalization: text={best_name[0]}, video={best_name[1]}, matches={best[0]}")
print("merged shape:", merged.shape)

# Fewer than two matched rows is treated as a failed ID reconciliation.
if len(merged) < 2:
    raise ValueError("Trop peu de matchs après normalisation. Vérifie 'filename' vs 'video_id'.")

# Known page ids and the party label assigned to each.
PAGE_ID_TO_PARTY = {
    '24413227922': 'Harris',
    '153080620724': 'Trump',
}

# Extract page_id: the second underscore-separated token of the filename,
# with any '.mp4' removed. regex=False makes the dot a literal dot; with
# regex=True the pattern '.mp4' would also strip e.g. 'Xmp4'.
merged['page_id'] = (
    merged['filename'].str.split('_').str[1].str.replace('.mp4', '', regex=False)
)

# Map page ids to a party; anything unknown is labelled 'OTHER'.
merged['party'] = merged['page_id'].map(PAGE_ID_TO_PARTY).fillna('OTHER')

# Filter to keep only Trump and Harris videos
merged = merged[merged['party'].isin(list(PAGE_ID_TO_PARTY.values()))].reset_index(drop=True)

# float32 feature matrices, one row per row of `merged`, rescaled with
# sklearn's normalize() using its default settings.
X_text = normalize(merged[text_embed_cols].to_numpy(dtype=np.float32))
X_video = normalize(merged[video_embed_cols].to_numpy(dtype=np.float32))

# Torch copies of the two matrices, handed to the model below.
embed_texts, embed_videos = (
    torch.tensor(matrix, dtype=torch.float32) for matrix in (X_text, X_video)
)

# Both modalities use the same layout: "column" set to "video_id" and a single
# "embedding" view of type "embedding" backed by the matching tensor.
modalities = {
    modality: {
        "column": "video_id",
        "views": {
            "embedding": {
                "type": "embedding",
                "matrix": matrix,
            }
        },
    }
    for modality, matrix in (("text", embed_texts), ("video", embed_videos))
}

dataset = Corpus(merged, modalities=modalities)
print("N documents:", len(dataset))

In [None]:
# 1-D ideal-point model (n_ideal_points=1)

# Seed the global RNGs so repeated runs start from the same random state.
# NOTE(review): full determinism also depends on IdealPointNN internals — confirm.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available instead of failing on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): only the video encoder/decoder are configured here; the text
# entries are commented out, so text presumably uses IdealPointNN defaults — confirm.
encoder_args = {
    #"text_embedding":  {"hidden_dims": [128], "activation": "relu", "bias": True, "dropout": 0.0},
    "video_embedding": {"hidden_dims": [], "activation": "relu", "bias": True, "dropout": 0.0},
}
decoder_args = {
    #"text_embedding":  {"hidden_dims": [128], "activation": "relu", "bias": True, "dropout": 0.0},
    "video_embedding": {"hidden_dims": [], "activation": "relu", "bias": True, "dropout": 0.0},
}

m = IdealPointNN(
    train_data=dataset,
    n_ideal_points=1,
    ae_type="wae",
    encoder_args=encoder_args,
    decoder_args=decoder_args,
    fusion="moe_average",
    w_prior=1,
    num_epochs=100,
    batch_size=128,
    patience=3,
    print_every_n_epochs=1,
    print_every_n_batches=100,
    device=device,
)

In [None]:
# Ideal points per file
Z = m.get_ideal_points(dataset, to_numpy=True)  # expected shape (N, 1) — TODO confirm
ideal_points = Z.flatten()

# One row per file: identifiers, party label and the estimated ideal point.
out = merged[["filename", "video_id", "party"]].assign(ideal_point=ideal_points)
print(out.head())

# Average contribution per modality
weights = m.get_modality_weights(dataset)  # expected shape (N, 2) — TODO confirm
modality_names = [*m.encoder.encoders.keys()]
means = weights.mean(axis=0)
summary = pd.DataFrame({"mean": means}, index=modality_names)
print("\nAverage contribution per modality:\n", summary)

# Check how many files fall under each party
party_counts = out['party'].value_counts()
print("Répartition par parti:", party_counts, sep="\n")

In [None]:
# Colors per party
colors = {'Trump': 'red', 'Harris': 'blue'}

# Plot: one overlaid density histogram per party
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

# Compute the bin edges once over all ideal points so every party is drawn on
# the same bins; per-party auto-binning makes the bars incomparable.
bin_edges = np.histogram_bin_edges(out['ideal_point'], bins=20)

# sort=False keeps the parties in order of first appearance.
for party, subset in out.groupby('party', sort=False):
    ax.hist(
        subset['ideal_point'],
        bins=bin_edges,
        alpha=0.7,
        color=colors.get(party, 'gray'),  # fall back instead of KeyError on an unexpected label
        label=f'{party} (n={len(subset)})',
        density=True,  # normalize each party to a density so group sizes do not dominate
    )

ax.set_title('Distribution of ideal points by party')
ax.set_xlabel('Ideal Point')
ax.set_ylabel('Density')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()