# Domain Classification and GNN Feature Preparation

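This notebook preprocesses article embeddings, URL/domain metadata, and chat share counts into domain-level node features (split into train and test sets) and a weighted domain edge list for a GNN model.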

In [None]:
import ast
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split


## Step 1: Load Data

In [None]:
# Load the four input datasets from the working directory.

# Per-article rows; later cells read the 'url' and 'article_embedding' columns.
article_embeddings = pd.read_csv('article_embeddings.csv')

# Per-URL metadata; later cells read 'url', 'domain', 'virality', 'year', 'pc1'.
url_domains = pd.read_csv('url_domains.csv')

# Chat table; only the 'id' column is used downstream.
chats = pd.read_csv('chats.csv')

# Sparse share-count matrix, indexed downstream as (chat row, URL column).
chat_url_shares = load_npz('chat_url_shares.npz')


## Step 2: Merge Data

In [None]:
# Align URLs and domains: the inner join keeps only articles whose URL has
# an entry in url_domains.
article_embeddings = article_embeddings.merge(
    url_domains[['url', 'domain', 'virality', 'year', 'pc1']],
    on='url',
    how='inner',
)

# The embeddings are stored in the CSV as stringified lists. Parse them with
# ast.literal_eval, which accepts only Python literals; the previous eval()
# would execute any expression found in the file.
# NOTE(review): literal_eval rejects non-literal tokens such as `nan` or
# `array(...)` -- confirm the column contains plain numeric lists.
article_embeddings['article_embedding'] = (
    article_embeddings['article_embedding'].apply(ast.literal_eval)
)

# Average the article embeddings element-wise within each domain.
article_embeddings_grouped = (
    article_embeddings.groupby('domain')['article_embedding']
    .apply(lambda vectors: np.mean(vectors.tolist(), axis=0))
    .reset_index()
)

# Attach the remaining domain-level features to the averaged embeddings.
# NOTE(review): drop_duplicates() keeps one row per distinct
# (domain, virality, year, pc1) combination, so a domain whose URLs differ in
# any of those columns will appear more than once -- confirm these columns
# are constant within a domain.
domain_features = (
    url_domains[['domain', 'virality', 'year', 'pc1']]
    .drop_duplicates()
    .merge(article_embeddings_grouped, on='domain', how='inner')
)


## Step 3: Compute Domain-Level Chat Statistics

In [None]:
# Total shares per URL: sum the share matrix over its row (chat) axis.
# np.asarray(...).ravel() flattens the result whether .sum() returns an
# np.matrix (scipy sparse *matrix*) or an ndarray (scipy sparse *array*);
# the previous `.A1` attribute exists only on np.matrix.
chat_counts = np.asarray(chat_url_shares.sum(axis=0)).ravel()

# Positional assignment: this relies on row i of url_domains describing
# column i of chat_url_shares. pandas raises ValueError if the lengths differ.
url_domains['total_chat_shares'] = chat_counts

# Aggregate per domain
domain_chat_stats = url_domains.groupby('domain', as_index=False)['total_chat_shares'].sum()

# Merge with domain features (inner join: domains missing from either side are dropped)
domain_features = domain_features.merge(domain_chat_stats, on='domain', how='inner')


## Step 4: Train-Test Split

In [None]:
# Hold out 20% of the chat ids as the test split; the fixed seed keeps the
# split reproducible across runs.
# NOTE(review): assign_split compares these ids against row positions of
# chat_url_shares -- confirm chats['id'] holds 0-based matrix row indices.
train_chats, test_chats = train_test_split(
    chats['id'],
    test_size=0.2,
    random_state=42,
)

def assign_split(chat_url_shares, train_chats, test_chats):
    """Collect the URL column indices shared by train chats and by test chats.

    Args:
        chat_url_shares: Sparse matrix exposing ``.nonzero()``; rows are
            treated as chats and columns as URLs.
        train_chats: Iterable (e.g. a pandas Series) of chat row indices
            belonging to the training split.
        test_chats: Iterable of chat row indices belonging to the test split.

    Returns:
        A ``(train_urls, test_urls)`` tuple of sets of URL column indices.
        A URL shared in both a train chat and a test chat appears in both
        sets; chats in neither split are ignored.
    """
    # Sets give O(1) membership tests. The previous lists made every lookup
    # O(n), i.e. O(nnz * n_chats) over the whole matrix.
    train_ids = set(train_chats)
    test_ids = set(test_chats)

    train_urls = set()
    test_urls = set()

    for chat_idx, url_idx in zip(*chat_url_shares.nonzero()):
        if chat_idx in train_ids:
            train_urls.add(url_idx)
        elif chat_idx in test_ids:
            test_urls.add(url_idx)

    return train_urls, test_urls

train_urls, test_urls = assign_split(chat_url_shares, train_chats, test_chats)

# assign_split returns URL *column indices*, whereas domain_features is keyed
# by domain name. Filtering 'domain' with the raw indices (as before) matches
# nothing, so translate each index to its domain first. This relies on row i
# of url_domains describing column i of chat_url_shares.
url_to_domain = url_domains['domain'].to_numpy()
train_domains = {url_to_domain[url_idx] for url_idx in train_urls}
test_domains = {url_to_domain[url_idx] for url_idx in test_urls}

# A domain whose URLs were shared in both train and test chats lands in both
# feature tables.
train_features = domain_features[domain_features['domain'].isin(train_domains)]
test_features = domain_features[domain_features['domain'].isin(test_domains)]


## Step 5: Prepare GNN Data

In [None]:
# Build a weighted domain co-sharing graph: two domains are connected when
# URLs from both were shared in the same chat.
#
# The previous loop looked up url_domains.iloc[row], but `row` from
# chat_url_shares.nonzero() is a chat index, not a URL index, so it paired
# unrelated domains (or raised IndexError when there are more chats than URLs).
# NOTE(review): the weight is defined here as the number of chats in which
# both domains were shared -- confirm this matches the intended edge semantics.
rows, cols = chat_url_shares.nonzero()  # rows: chat indices, cols: URL indices

# Relies on row i of url_domains describing column i of chat_url_shares.
url_to_domain = url_domains['domain'].to_numpy()
if len(url_to_domain) != chat_url_shares.shape[1]:
    raise ValueError(
        "url_domains must have one row per column of chat_url_shares "
        f"({len(url_to_domain)} rows vs {chat_url_shares.shape[1]} columns)"
    )

# One row per (chat, domain) pair that actually occurred; URLs with a missing
# domain are skipped.
chat_domains = (
    pd.DataFrame({'chat': rows, 'domain': url_to_domain[cols]})
    .dropna(subset=['domain'])
    .drop_duplicates()
)

# Self-join on chat to enumerate ordered pairs of distinct domains shared in
# the same chat. Both directions (a, b) and (b, a) are kept. The join is
# quadratic in the number of domains per chat.
co_shared = chat_domains.merge(chat_domains, on='chat', suffixes=('_a', '_b'))
co_shared = co_shared[co_shared['domain_a'] != co_shared['domain_b']]

edge_list = (
    co_shared.groupby(['domain_a', 'domain_b'])
    .size()
    .reset_index(name='weight')
    .rename(columns={'domain_a': 'source', 'domain_b': 'target'})
)

train_features.to_csv('train_features.csv', index=False)
test_features.to_csv('test_features.csv', index=False)
edge_list.to_csv('edges.csv', index=False)

print("Preprocessing complete. Train/Test features and edges saved.")
