In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the regression table that the descriptive statistics below are computed from.
regression_table_path = '../input_data/gg_regression_table_for_descriptives.csv'
df = pd.read_csv(regression_table_path)

In [None]:
# Mean of the AverageSimilarity column across all rows of df.
df['AverageSimilarity'].mean()

In [None]:
# Keep only the rows where candidate == 1. The explicit copy makes df_cand an
# independent frame, so later column assignments on it cannot raise pandas'
# SettingWithCopyWarning or end up written to a temporary object.
df_cand = df.loc[df.candidate == 1].copy()

## DV descriptives

In [None]:
# Local source path: the directory the g2022 ad files are read from.
source_path = ''

In [None]:
# Read the compressed ad-level variables file from source_path.
var_file = os.path.join(source_path, 'g2022_adid_var.csv.gz')
var = pd.read_csv(var_file)

In [None]:
# Select Set 3 from the universe of possible federal election ads.
in_set3 = var.set3 == 1
var = var.loc[in_set3]

**General election period**

In [None]:
# Keep ads whose date range ends on/after 2022-09-01 and starts on/before 2022-11-30.
# The second condition is not necessary: resulting dataframe shapes are the same.
ends_in_or_after_window = var.date_range_end >= '2022-09-01'
starts_in_or_before_window = var.date_range_start <= '2022-11-30'
condition = ends_in_or_after_window & starts_in_or_before_window
var = var[condition]

In [None]:
# Read the compressed ad-level text file from source_path.
text_file = os.path.join(source_path, 'g2022_adid_text.csv.gz')
text = pd.read_csv(text_file)

In [None]:
# Left-join the text columns onto var by ad_id, then remove fully duplicated rows.
# advertiser_name is dropped from the text table first to avoid a column name collision.
text_without_name = text.drop(columns='advertiser_name')
var = var.merge(text_without_name, how='left', on='ad_id')
var = var.drop_duplicates()

In [None]:
# Show the column labels of var after the merge with the text table.
var.columns

**Number of unique creatives: General election period**

In [None]:
# Use the seaborn 'ticks' theme with the tab10 palette for the plots that follow.
sns.set_theme(style='ticks', palette="tab10")

# Histogram of AverageSimilarity (40 bins) with a KDE curve, both drawn in gray.
similarity = df['AverageSimilarity']
sns.histplot(similarity, kde=True, bins=40, color='gray', line_kws={'color': 'gray'})

In [None]:
# Attach one advertiser_name to every row of df.
# `var` can hold many rows per advertiser_id, so merging against it directly
# repeats each df row once per match. When an id carries more than one spelling
# of its name, the extra copies survive drop_duplicates() and are counted more
# than once by any later sum over num_unique. The lookup is therefore reduced to
# the first non-null name per advertiser_id before the merge.
advertiser_names = (
    var[['advertiser_id', 'advertiser_name']]
    .dropna(subset=['advertiser_name'])
    .drop_duplicates(subset='advertiser_id')
)
# validate='many_to_one' makes pandas raise if the lookup is not unique per id.
df_agg = df.merge(
    advertiser_names, how='left', on='advertiser_id', validate='many_to_one'
).drop_duplicates()

In [None]:
# Normalize advertiser names to lower case.
lowercased_names = df_agg['advertiser_name'].str.lower()
df_agg['advertiser_name'] = lowercased_names

In [None]:
# Collapse to one row per advertiser_id: first advertiser_name, summed num_unique.
sponsor_aggregations = {'advertiser_name': 'first', 'num_unique': 'sum'}
grouped_by_sponsor = df_agg.groupby(['advertiser_id'])
df_agg = grouped_by_sponsor.agg(sponsor_aggregations).reset_index()

In [None]:
# Mean of the per-advertiser num_unique totals.
df_agg['num_unique'].mean()

In [None]:
# Median of the per-advertiser num_unique totals.
df_agg['num_unique'].median()

In [None]:
# Show the aggregated row(s) whose advertiser_name is exactly 'nrsc'.
is_nrsc = df_agg['advertiser_name'] == 'nrsc'
df_agg[is_nrsc]

In [None]:
# The 40 advertisers with the largest num_unique totals, largest first.
ranked_by_num_unique = df_agg.sort_values('num_unique', ascending=False)
ranked_by_num_unique.head(40)

In [None]:
# Sponsor-level histogram of the aggregated num_unique (200 bins, KDE overlay).
# The x-axis is limited to the 0-100 range.
sponsor_totals = df_agg['num_unique']
sns.histplot(sponsor_totals, kde=True, bins=200, color='gray')
plt.xlim(0, 100)

### Sophistication index

In [None]:
def create_sophistication_index(input_df):
    """Add a min-max normalized ``num_unique`` and a sophistication index.

    The index is ``num_unique_normalized * (1 - AverageSimilarity)``, where
    ``num_unique_normalized`` rescales ``num_unique`` to the [0, 1] range using
    the minimum and maximum found in ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``num_unique`` and ``AverageSimilarity`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``input_df`` with the ``num_unique_normalized`` and
        ``sophistication_index`` columns added. ``input_df`` itself is left
        untouched, so passing a filtered slice of another frame is safe.
    """
    # Work on a copy: writing columns into a slice of another frame raises
    # SettingWithCopyWarning and silently mutates the caller's object.
    result = input_df.copy()

    lowest = result.num_unique.min()
    spread = result.num_unique.max() - lowest

    offset = result.num_unique - lowest
    if spread == 0:
        # All values are equal, so min-max scaling would divide 0 by 0 and give
        # NaN everywhere. Use 0.0 instead; multiplying keeps missing values missing.
        result['num_unique_normalized'] = offset * 0.0
    else:
        result['num_unique_normalized'] = offset / spread

    result['sophistication_index'] = result['num_unique_normalized'] * (1 - result.AverageSimilarity)
    return result

In [None]:
# Add num_unique_normalized and sophistication_index to the candidate rows.
df_cand = create_sophistication_index(input_df=df_cand)

In [None]:
# Attach one advertiser_name to every candidate row.
# `var` can hold many rows per advertiser_id, so merging against it directly
# repeats each candidate row once per match, and an id with more than one
# spelling of its name keeps extra copies even after drop_duplicates().
# The lookup is therefore reduced to the first non-null name per advertiser_id.
cols = ['advertiser_id', 'advertiser_name']
candidate_names = (
    var[cols]
    .dropna(subset=['advertiser_name'])
    .drop_duplicates(subset='advertiser_id')
)
# validate='many_to_one' makes pandas raise if the lookup is not unique per id.
df_cand = df_cand.merge(
    candidate_names, how='left', on='advertiser_id', validate='many_to_one'
).drop_duplicates()

In [None]:
# Show the column labels of df_cand after the advertiser_name merge.
df_cand.columns

#### General election period

**To display in paper**

In [None]:
from IPython.display import display, HTML

In [None]:
# Title-case the advertiser names for display.
titled_names = df_cand['advertiser_name'].str.title()
df_cand['advertiser_name'] = titled_names

In [None]:
# Title-case the ad_type labels for display.
titled_ad_types = df_cand['ad_type'].str.title()
df_cand['ad_type'] = titled_ad_types

**General election period**

In [None]:
# Columns shown in the paper table.
cols2display = [
    'advertiser_name',
    'sophistication_index',
    'ad_type',
    'num_unique',
    'AverageSimilarity',
]

# De-duplicate on the displayed columns, then keep the 22 rows with the
# highest sophistication_index.
display_df = (
    df_cand[cols2display]
    .drop_duplicates()
    .sort_values(by='sophistication_index', ascending=False)
    .head(22)
)

# Render the table as HTML without the index column.
table_html = display_df.to_html(index=False)
display(HTML(table_html))

In [None]:
# The 50 rows with the highest sophistication_index (duplicates are not removed here).
ranked_candidates = df_cand[cols2display].sort_values('sophistication_index', ascending=False)
ranked_candidates.head(50)