In [None]:
import os
import numpy as np
import pandas as pd
import json
import itertools
import pickle
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer, util

## Prepare data

### Select data relevant to federal races

In [None]:
source_path = ''  # Local directory that holds the input CSV files; set it before running the cells below

In [None]:
# Read the ad-level variables table from the local data directory.
var_file = os.path.join(source_path, 'fb_2022_adid_var.csv.gz')
var = pd.read_csv(var_file)

In [None]:
# Show the column labels of the ad-level variables table.
var.columns

In [None]:
# Filter for Set 3: keep only the rows whose set3 flag equals 1.
var = var.loc[var['set3'].eq(1)]

In [None]:
# Inspect which sub_bucket values occur among ads whose race_of_focus is neither
# 'No race of focus' nor 'Downballot'. Rows with a missing race_of_focus also pass
# this filter, because isin() evaluates to False for NaN.
var[~var.race_of_focus.isin(['No race of focus', 'Downballot'])].sub_bucket.unique()

In [None]:
# Only house/senate races: keep an ad when its office is a U.S. Senate/House seat,
# or when its race of focus is anything other than the two excluded labels
# (a missing race_of_focus counts as "other", since isin() is False for NaN).
is_federal_office = var['wmp_office'].isin(['us senate', 'us house'])
is_excluded_focus = var['race_of_focus'].isin(['No race of focus', 'Downballot'])
condition = is_federal_office | ~is_excluded_focus

In [None]:
# Keep only the rows selected by the boolean mask `condition`.
var = var[condition]

In [None]:
# Merge with text and cid mapping tables: first read the ad-level text table.
text_file = os.path.join(source_path, 'fb_2022_adid_text.csv.gz')
text = pd.read_csv(text_file)

In [None]:
# Left-join the text table onto the ad-level variables by ad_id, so every row of `var` is kept.
# NOTE(review): assumes ad_id is unique in `text`; duplicate ids would multiply rows — TODO confirm.
# Re-running this cell without reloading `var` merges the text columns a second time.
var = var.merge(text, how='left', on='ad_id')

In [None]:
# Drop clearly non-federal race sponsors, i.e. rows whose wmp_spontype is 'down ballot'.
# Rows with a missing wmp_spontype are kept.
var = var.loc[var['wmp_spontype'].ne('down ballot')]

### Aggregate data at sponsor-media level

#### Federal candidates

In [None]:
# Ads with federal_cd == 1; this subset feeds the federal-candidate aggregation below.
var_cand = var[var.federal_cd==1]

In [None]:
# Columns used for the sponsor-by-media-type aggregation of candidate ads.
cols = ['pd_id', 'wmp_media_type', 'wmp_creative_id']

In [None]:
# Aggregate by pd_id AND media type.
# Step 1: reduce to the distinct (pd_id, media type, creative id) combinations.
cand_unique_creatives = var_cand[cols].groupby(cols).first().reset_index()
# Step 2: join the creative ids of each sponsor/media group into one space-separated string.
cand_sponsor_agg = (
    cand_unique_creatives
    .groupby(['pd_id', 'wmp_media_type'])
    .agg(lambda creative_ids: ' '.join(creative_ids))
    .reset_index()
)

In [None]:
# Preview the first two sponsor/media rows of the candidate aggregation.
cand_sponsor_agg.head(2)

In [None]:
# Count the unique creatives in each sponsor/media row.
# The space-joined creative ids are stored in 'wmp_creative_id'; the aggregated frame
# has no 'cid' column, so the former `.cid` attribute access raised AttributeError.
cand_sponsor_agg['num_unique'] = cand_sponsor_agg['wmp_creative_id'].apply(
    lambda creative_ids: len(creative_ids.split())
)

In [None]:
# Keep only rows with more than one unique creative, since a pairwise
# similarity needs at least two.
cand_sponsor_agg = cand_sponsor_agg.loc[cand_sponsor_agg['num_unique'].gt(1)]

In [None]:
# Inspect the sponsor types present among the candidate ads.
var_cand.wmp_spontype.unique()

#### Non-campaign sponsors: advertiser-media-race_of_focus-level aggregation

In [None]:
# Inspect the sponsor types present among ads with federal_cd == 0.
var[var.federal_cd == 0].wmp_spontype.unique()

In [None]:
# Inspect the office values present among ads with federal_cd == 0; the list of
# irrelevant offices filtered out below is chosen from these values.
var[var.federal_cd == 0].wmp_office.unique()

In [None]:
# Filter irrelevant offices among non-campaign sponsors.
irrelevant_offices = [
    'us house - other cycle',
    'us senate - other cycle',
    'down ballot',
    'president - other cycle',
    'ballot measure',
    'election outside US or in US comm/territory',
    'governor',
]
is_noncand = var['federal_cd'].eq(0)
has_irrelevant_office = var['wmp_office'].isin(irrelevant_offices)
condition3 = is_noncand & ~has_irrelevant_office

var_noncand = var.loc[condition3]

In [None]:
# Remove those with no race of focus identified: drop rows whose race_of_focus is
# missing or carries the explicit 'No race of focus' label.
focus = var_noncand['race_of_focus']
var_noncand = var_noncand.loc[focus.notna() & focus.ne('No race of focus')]

In [None]:
# Aggregate at sponsor-media-race level: columns needed for that aggregation.
cols2 = [
    'pd_id',
    'wmp_creative_id',
    'race_of_focus',
    'wmp_media_type',
]

In [None]:
# Agg by pd_id AND media type AND race of focus.
# Step 1: one row per (pd_id, creative id, race of focus); first() keeps the first
# non-null wmp_media_type found for that combination.
# NOTE(review): unlike the candidate aggregation, media type is not a grouping key in
# this step, so a creative seen under several media types is counted once — confirm intended.
noncand_unique_creatives = (
    var_noncand[cols2]
    .groupby(['pd_id', 'wmp_creative_id', 'race_of_focus'])
    .first()
    .reset_index()
)
# Step 2: join the creative ids of each sponsor/race/media group into one space-separated string.
noncand_sponsor_agg = (
    noncand_unique_creatives
    .groupby(['pd_id', 'race_of_focus', 'wmp_media_type'])
    .agg(lambda creative_ids: ' '.join(creative_ids))
    .reset_index()
)

In [None]:
# Preview the first two sponsor/race/media rows of the non-candidate aggregation.
noncand_sponsor_agg.head(2)

In [None]:
# Count the unique creatives in each sponsor/race/media row.
# The space-joined creative ids are stored in 'wmp_creative_id'; the aggregated frame
# has no 'cid' column, so the former `.cid` attribute access raised AttributeError.
noncand_sponsor_agg['num_unique'] = noncand_sponsor_agg['wmp_creative_id'].apply(
    lambda creative_ids: len(creative_ids.split())
)

# filter for sponsors with at least a pair of unique creatives
noncand_sponsor_agg = noncand_sponsor_agg[noncand_sponsor_agg.num_unique > 1]

In [None]:
# Number of sponsor/race/media rows left after the filter (rows, columns).
noncand_sponsor_agg.shape

### Import trained corpus embedding and compute average similarity

In [None]:
# Load the precomputed corpus embeddings, mapping any stored device onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load embedding files from a trusted source.
corpus_embeddings = torch.load('../input_data/corpus_embedding_fb2022_uni.pt', map_location=torch.device('cpu'))

In [None]:
# Inspect the dimensions of the loaded embeddings.
corpus_embeddings.shape
# The corpus embedding index is aligned with the ascending cid index starting from 0,
# i.e. position i is expected to hold the embedding of creative 'cid_i'.

In [None]:
def compute_avg_similarity(cid_lst, embeddings=None):
    """Mean and standard deviation of pairwise cosine similarity among creatives.

    Parameters
    ----------
    cid_lst : str
        Whitespace-separated creative ids of the form ``cid_<n>``, where ``<n>``
        is the row index of that creative's embedding.
    embeddings : torch.Tensor, optional
        2-D tensor with one embedding per row. When omitted, the notebook-level
        ``corpus_embeddings`` is used.

    Returns
    -------
    tuple or None
        ``(mean, std)`` over all unordered pairs of creatives, where ``std`` is
        the population standard deviation (numpy's default), or ``None`` when
        fewer than two ids are supplied.

    Raises
    ------
    ValueError
        If the part of an id after the ``cid_`` prefix is not an integer.
    IndexError
        If an id points outside ``embeddings``.
    """
    if embeddings is None:
        embeddings = corpus_embeddings

    prefix = 'cid_'
    # Slice off the literal prefix. str.lstrip('cid_') removes a *set of characters*
    # rather than a prefix, which only worked because digits are not in that set.
    rows = [
        int(cid[len(prefix):] if cid.startswith(prefix) else cid)
        for cid in cid_lst.split()
    ]
    n_creatives = len(rows)
    if n_creatives < 2:
        # No pair can be formed.
        return None

    # Normalise the selected rows once and take a single matrix product; this yields
    # the same cosine similarities as one cos_sim call per pair without the Python loop.
    # The intermediate matrix is n_creatives x n_creatives.
    unit = torch.nn.functional.normalize(torch.as_tensor(embeddings)[rows], p=2, dim=1)
    similarity = unit @ unit.T

    # Strict upper triangle = each unordered pair exactly once, in the same
    # row-major order that itertools.combinations would produce.
    upper = torch.triu_indices(n_creatives, n_creatives, offset=1)
    scores = similarity[upper[0], upper[1]].numpy()
    return scores.mean(), scores.std()

### Federal candidates: by advertiser and media type

In [None]:
# Mean and std of pairwise similarity for every candidate sponsor/media row.
# The joined creative ids live in 'wmp_creative_id' (the frame has no 'cid' column,
# so the former `.cid` access raised AttributeError).
# Rows for which compute_avg_similarity returns None get NaN in both columns.
cand_pair_stats = pd.DataFrame(
    [
        compute_avg_similarity(creative_ids) or (np.nan, np.nan)
        for creative_ids in cand_sponsor_agg['wmp_creative_id']
    ],
    columns=['avg', 'std'],
    index=cand_sponsor_agg.index,
    dtype='float',
)
# assign() returns a new frame, so this neither writes into a filtered slice nor
# relies on .loc enlarging the frame with new column labels, and it can be re-run.
cand_sponsor_agg = cand_sponsor_agg.assign(
    avg=cand_pair_stats['avg'],
    std=cand_pair_stats['std'],
)

In [None]:
# Preview the first four rows, now including the avg and std similarity columns.
cand_sponsor_agg.head(4)

In [None]:
# Save staging table for regression analysis: Candidates only
cand_output_file = '../output_data/fb_set3_cand_pdid_media_average_pairwise_similarity.csv'
cand_sponsor_agg.to_csv(cand_output_file, index=False)

### Non-campaign sponsors: by sponsor, media type, and race of focus


In [None]:
# Mean and std of pairwise similarity for every non-candidate sponsor/race/media row.
# The joined creative ids live in 'wmp_creative_id' (the frame has no 'cid' column,
# so the former `.cid` access raised AttributeError).
# Rows for which compute_avg_similarity returns None get NaN in both columns.
noncand_pair_stats = pd.DataFrame(
    [
        compute_avg_similarity(creative_ids) or (np.nan, np.nan)
        for creative_ids in noncand_sponsor_agg['wmp_creative_id']
    ],
    columns=['avg', 'std'],
    index=noncand_sponsor_agg.index,
    dtype='float',
)
# assign() returns a new frame, so this neither writes into a filtered slice nor
# relies on .loc enlarging the frame with new column labels, and it can be re-run.
noncand_sponsor_agg = noncand_sponsor_agg.assign(
    avg=noncand_pair_stats['avg'],
    std=noncand_pair_stats['std'],
)

In [None]:
# Save staging table for regression analysis: Non-candidates grouped by race of focus
noncand_output_file = '../output_data/fb_set3_noncandidate_pdid_racefocus_media_average_pairwise_similarity.csv'
noncand_sponsor_agg.to_csv(noncand_output_file, index=False)