In [None]:
import os
import numpy as np
import pandas as pd
import json
import itertools
import pickle
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer, util

## Prepare data

In [None]:
# Directory holding the g2022_adid_*.csv.gz input tables. With the empty string,
# os.path.join resolves the file names relative to the current working directory.
source_path = '' # Local data source path

In [None]:
# Import the var table from the local data source directory.
var = pd.read_csv(
    os.path.join(source_path, 'g2022_adid_var.csv.gz')
)

### Subset for general election period data (Sept 2022 - Nov 2022)
**Note: This step can be skipped because the current Google dataset we have made public includes advertising data from the general election period only**

In [None]:
# Keep ads whose date range overlaps Sept 1 - Nov 30, 2022.
# Original author's note: the second condition is not necessary, the resulting
# dataframe shapes are the same.
# NOTE(review): the comparison is against date strings — assumes the columns hold
# ISO-formatted dates; confirm.
ends_in_or_after_sept = var.date_range_end.ge('2022-09-01')
starts_in_or_before_nov = var.date_range_start.le('2022-11-30')
condition = ends_in_or_after_sept & starts_in_or_before_nov

var = var[condition]

### Filter for set 3 and select only the confirmed Senate/House races

In [None]:
# Set 3: keep only the rows flagged with set3 == 1.
var = var[var['set3'] == 1]

In [None]:
# Inspect the sub_bucket values found among ads whose race_of_focus is neither
# 'No race of focus' nor 'Downballot'.
var.loc[~var.race_of_focus.isin(['No race of focus', 'Downballot']), 'sub_bucket'].unique()

In [None]:
# Only house/senate races: a row qualifies when ANY of the indicators below holds.
# NOTE(review): the last term is also True for rows whose race_of_focus is missing,
# because isin() returns False for NaN before the negation — confirm this is intended.
condition2 = (
    (var.wmp_sen == 1)
    | (var.wmp_hse == 1)
    | var.wmp_office.isin(['us senate', 'us house'])
    | ~var.race_of_focus.isin(['No race of focus', 'Downballot'])
)

In [None]:
# Apply the house/senate mask built in the previous cell.
var = var[condition2]

### Import text and ad-cid mapping tables

In [None]:
# Import the "text" table; only ad_id (merge key) and ad_type are needed.
text = pd.read_csv(
    os.path.join(source_path, 'g2022_adid_text.csv.gz'),
    usecols=['ad_id', 'ad_type'],
)

In [None]:
# Attach ad_type from the text table to every var row (left join on ad_id),
# then drop rows that are exact duplicates.
var = pd.merge(var, text, how='left', on='ad_id').drop_duplicates()

### Import creative ids with nan text fields and remove nan values

In [None]:
# Creative ids listed in the nan_cid column, held as a set for membership tests.
nan_ids = set(pd.read_csv('../input_data/nan_cids.csv')['nan_cid'].tolist())

In [None]:
# Remove the creatives whose ids are listed in nan_ids (creatives with NaN text).
# Series.isin is vectorised and matches the membership filters used elsewhere in
# this notebook, replacing the per-row Python lambda.
var = var[~var.wmp_creative_id.isin(nan_ids)]

### Aggregate data at sponsor-media level 

#### Federal candidates: House/Senate

In [None]:
# Rows sponsored by federal candidates (federal_cd == 1).
var_cand = var.loc[var['federal_cd'] == 1]

In [None]:
# Columns needed for the candidate aggregation: the two grouping keys plus the creative id.
cols = ['advertiser_id', 'wmp_creative_id', 'ad_type']

In [None]:
# Step 1: collapse to one row per (advertiser, ad type, creative id).
# Step 2: per (advertiser, ad type), join the creative ids into one space-separated string.
cand_sponsor_agg = (
    var_cand[cols]
    .groupby(['advertiser_id', 'ad_type', 'wmp_creative_id'])
    .first()
    .reset_index()
    .groupby(['advertiser_id', 'ad_type'])
    .agg(lambda ids: ' '.join(ids))
    .reset_index()
)

In [None]:
# The joined creative-id column is called 'cids' from here on.
cand_sponsor_agg = cand_sponsor_agg.rename(columns={'wmp_creative_id': 'cids'})

In [None]:
# Number of creative ids in each space-separated 'cids' string.
cand_sponsor_agg['num_unique'] = cand_sponsor_agg['cids'].map(lambda joined: len(joined.split()))

In [None]:
# Keep sponsors with at least one pair of unique creatives.
# .copy() makes the filtered frame independent of its parent, so adding the
# 'avg'/'std' columns later does not raise SettingWithCopyWarning.
cand_sponsor_agg = cand_sponsor_agg[cand_sponsor_agg.num_unique > 1].copy()

#### Non-campaign sponsors: Advertiser-media-race_of_focus-level aggregation

##### Eliminate irrelevant offices among non-candidate campaign sponsors

In [None]:
# Sponsors with federal_cd == 0, leaving out ads whose office is a House/Senate
# seat from another cycle.
other_cycle_offices = ['us house - other cycle', 'us senate - other cycle']
condition3 = var.federal_cd.eq(0) & ~var.wmp_office.isin(other_cycle_offices)
var_noncand = var.loc[condition3]

In [None]:
# Drop rows with a missing race_of_focus (the 'No race of focus' label is
# removed in the next cell).
var_noncand = var_noncand[var_noncand.race_of_focus.notna()]

In [None]:
# Drop rows explicitly labelled as having no race of focus.
var_noncand = var_noncand.loc[var_noncand['race_of_focus'].ne('No race of focus')]

In [None]:
# Columns needed for the non-candidate aggregation: the three grouping keys plus the creative id.
cols2 = ['advertiser_id', 'wmp_creative_id', 'ad_type', 'race_of_focus']

In [None]:
# Step 1: collapse to one row per (advertiser, ad type, race of focus, creative id).
# Step 2: per (advertiser, ad type, race of focus), join the creative ids into
# one space-separated string.
noncand_sponsor_agg = (
    var_noncand[cols2]
    .groupby(['advertiser_id', 'ad_type', 'race_of_focus', 'wmp_creative_id'])
    .first()
    .reset_index()
    .groupby(['advertiser_id', 'ad_type', 'race_of_focus'])
    .agg(lambda ids: ' '.join(ids))
    .reset_index()
)

In [None]:
# Rename the joined creative-id column and count the ids in each group.
noncand_sponsor_agg = noncand_sponsor_agg.rename(columns={'wmp_creative_id': 'cids'})
noncand_sponsor_agg['num_unique'] = noncand_sponsor_agg.cids.apply(lambda x: len(x.split()))

# filter for sponsors with at least a pair of unique creatives
# .copy() makes the filtered frame independent of its parent, so adding the
# 'avg'/'std' columns later does not raise SettingWithCopyWarning.
noncand_sponsor_agg = noncand_sponsor_agg[noncand_sponsor_agg.num_unique > 1].copy()

## Import corpus embedding model (indices reordered version)

In [None]:
# Precomputed corpus embeddings; the similarity functions index this object with
# the integer part of each creative id.
# map_location='cpu' keeps everything on the CPU: the similarity functions call
# .numpy() on the scores, which only works for CPU tensors.
# NOTE(review): torch.load unpickles the file — only load artifacts from a trusted source.
corpus_embeddings = torch.load(
    '../model/corpus_embedding_google2022_unique_lite_reordered.pt',
    map_location='cpu',
)

## Compute pairwise cosine similarity
**Save average and standard deviation**

In [None]:
def compute_avg_similarity(cid_lst):
    """Mean and standard deviation of the pairwise cosine similarity of creatives.

    Parameters
    ----------
    cid_lst : str
        Whitespace-separated creative ids of the form ``cid_<n>``. ``<n>`` is
        used as the position of the creative in the module-level
        ``corpus_embeddings``.

    Returns
    -------
    tuple or None
        ``(mean, std)`` of the cosine similarity over every unordered pair of
        ids in ``cid_lst``, or ``None`` when it holds fewer than two ids.
    """
    prefix = 'cid_'
    # Slice the literal prefix off rather than calling str.lstrip('cid_'):
    # lstrip treats its argument as a set of characters, not as a prefix.
    # Parsing each id once here also avoids re-parsing it for every pair.
    indices = [
        int(cid[len(prefix):] if cid.startswith(prefix) else cid)
        for cid in cid_lst.split()
    ]

    scores = [
        util.cos_sim(corpus_embeddings[idx1], corpus_embeddings[idx2])[0][0].numpy()
        for idx1, idx2 in itertools.combinations(indices, 2)
    ]
    if not scores:
        return None

    # Build the array once and reuse it for both statistics.
    scores = np.array(scores)
    return scores.mean(), scores.std()

### Federal candidates: by advertiser and media type

In [None]:
# Add 'avg' and 'std' columns. compute_avg_similarity returns a (mean, std) pair
# for each 'cids' string; pd.Series spreads it over the two labels as floats, so
# apply() yields a two-column frame. A None result would become NaN in both columns.
cand_sponsor_agg.loc[:, ['avg', 'std']] = cand_sponsor_agg.cids.apply(lambda x: pd.Series(compute_avg_similarity(x), index=['avg', 'std'], dtype="float"))

In [None]:
# general election period only: Sept - Nov 2022
# Write the candidate-level similarity table without the index column.
cand_sponsor_agg.to_csv(
    '../output_data/g2022_set3_gen_elect_cand_media_level_average_pairwise_similarity.csv',
    index=False,
)

### Non-campaign sponsors: by sponsor - media - race of focus

In [None]:
# Add 'avg' and 'std' columns. compute_avg_similarity returns a (mean, std) pair
# for each 'cids' string; pd.Series spreads it over the two labels as floats, so
# apply() yields a two-column frame. A None result would become NaN in both columns.
noncand_sponsor_agg.loc[:, ['avg', 'std']] = noncand_sponsor_agg.cids.apply(lambda x: pd.Series(compute_avg_similarity(x), index=['avg', 'std'], dtype="float"))

In [None]:
# Sanity check: (rows, columns) of the non-candidate table after scoring.
noncand_sponsor_agg.shape

In [None]:
# Largest number of creative ids in any single group; the number of scored pairs
# grows quadratically with it.
noncand_sponsor_agg.num_unique.max()

In [None]:
# general election period only
# Write the non-candidate similarity table without the index column.
noncand_sponsor_agg.to_csv(
    '../output_data/g2022_set3_gen_elect_noncandidate_advertiser_racefocus_media_level_average_pairwise_similarity.csv',
    index=False,
)

## Pairwise similarity distribution for individual candidate cases

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# general election period
# Read back the candidate-level table written earlier in this notebook. The path
# used before ('output_data/g2022_set3_gen_elect_advertiser_media_level_average_pairwise_similarity.csv')
# is not written by any visible cell, so a fresh Restart & Run All could not find it.
# NOTE(review): confirm the candidate-level file is the intended input.
adv_agg = pd.read_csv('../output_data/g2022_set3_gen_elect_cand_media_level_average_pairwise_similarity.csv')

In [None]:
# Preview the first rows of the loaded table.
adv_agg.head()

In [None]:
def compute_pairwise_similarity_adv_id(df, adv_id, media_type):
    """Pairwise cosine similarities among one advertiser's creatives of one ad type.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``advertiser_id``, ``ad_type`` and ``cids`` columns, where
        ``cids`` holds whitespace-separated creative ids of the form ``cid_<n>``.
    adv_id
        Value matched against ``df.advertiser_id``.
    media_type
        Value matched against ``df.ad_type``.

    Returns
    -------
    list of float or None
        One cosine similarity per unordered pair of ids in the first matching
        row, or ``None`` when that row holds fewer than two ids.

    Raises
    ------
    IndexError
        If no row of ``df`` matches ``adv_id`` and ``media_type``.
    """
    matches = df.loc[(df.advertiser_id == adv_id) & (df.ad_type == media_type), 'cids']
    if matches.empty:
        # Same exception type the bare .iloc[0] raised, but the message now
        # names the lookup that failed.
        raise IndexError(
            f"no row with advertiser_id={adv_id!r} and ad_type={media_type!r}"
        )

    prefix = 'cid_'
    # Slice the literal prefix off rather than calling str.lstrip('cid_'):
    # lstrip treats its argument as a set of characters, not as a prefix.
    # Parsing each id once here also avoids re-parsing it for every pair.
    indices = [
        int(cid[len(prefix):] if cid.startswith(prefix) else cid)
        for cid in matches.iloc[0].split()
    ]

    scores = [
        float(util.cos_sim(corpus_embeddings[idx1], corpus_embeddings[idx2])[0][0].numpy())
        for idx1, idx2 in itertools.combinations(indices, 2)
    ]
    # An empty list (fewer than two ids) is reported as None.
    return scores or None

In [None]:
# Warnock general election: distribution of pairwise similarity scores,
# video creatives in the top panel and text creatives in the bottom panel.

warnock_vid = compute_pairwise_similarity_adv_id(adv_agg, 'AR07182956219827486721', 'VIDEO')
warnock_txt = compute_pairwise_similarity_adv_id(adv_agg, 'AR07182956219827486721', 'TEXT')

sns.set_theme(style='ticks', palette="tab10")

fig, ax = plt.subplots(nrows=2, sharex=True)

# Draw both histograms first, then pin each panel to the [0, 1] score range.
for panel, panel_scores in zip(ax, (warnock_vid, warnock_txt)):
    sns.histplot(panel_scores, kde=True, bins=40, color='gray', ax=panel)

for panel in ax:
    panel.set_xlim(0, 1)

ax[0].set(
    title='Pairwise text similarity distribution (video): Raphael Warnock',
    xlabel='similarity score',
)
ax[1].set(
    title='Pairwise text similarity distribution (text): Raphael Warnock',
    xlabel='similarity score',
)

In [None]:
# AOC general election: distribution of pairwise similarity scores,
# video creatives in the top panel and text creatives in the bottom panel.

aoc_vid = compute_pairwise_similarity_adv_id(adv_agg, 'AR17095295161908330497', 'VIDEO')
aoc_txt = compute_pairwise_similarity_adv_id(adv_agg, 'AR17095295161908330497', 'TEXT')

sns.set_theme(style='ticks', palette="tab10")

fig, ax = plt.subplots(nrows=2, sharex=True)

# Draw both histograms first, then pin each panel to the [0, 1] score range.
for panel, panel_scores in zip(ax, (aoc_vid, aoc_txt)):
    sns.histplot(panel_scores, kde=True, bins=40, color='gray', ax=panel)

for panel in ax:
    panel.set_xlim(0, 1)

ax[0].set(
    title='Pairwise text similarity distribution (video): Alexandria Ocasio-Cortez',
    xlabel='similarity score',
)
ax[1].set(
    title='Pairwise text similarity distribution (text): Alexandria Ocasio-Cortez',
    xlabel='similarity score',
)