In [None]:
import os
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer
from IPython.display import display, HTML

## Create regression table

In [None]:
# Local directory holding the input data files; set this before running the notebook.
source_path = ''

In [None]:
# Load the ad-level variable table from the local data directory.
var_table_path = os.path.join(source_path, 'fb_2022_adid_var.csv.gz')
var = pd.read_csv(var_table_path)

In [None]:
# Filter for Set 3: keep only the rows whose set3 flag equals 1.
in_set3 = var['set3'] == 1
var = var.loc[in_set3]

In [None]:
# Sanity check: (rows, columns) remaining after the Set 3 filter.
var.shape

### Import average text similarity results

In [None]:
# Average pairwise similarity results for federal candidate campaign sponsors.
df_cand = pd.read_csv(
    '../output_data/fb_set3_cand_pdid_media_average_pairwise_similarity.csv'
)

# Average pairwise similarity results for non-campaign sponsors.
df_noncand = pd.read_csv(
    '../output_data/fb_set3_noncandidate_pdid_racefocus_media_average_pairwise_similarity.csv'
)

In [None]:
# Inspect the non-candidate similarity table: column names first, then the first two rows.
print(df_noncand.columns)
df_noncand.head(2)

In [None]:
# Select only relevant sponsors from the Set 3 var table:
# keep rows whose pd_id appears in either similarity table.
all_sponsor_ids = pd.concat([df_cand.pd_id, df_noncand.pd_id]).tolist()
is_relevant_sponsor = var['pd_id'].isin(all_sponsor_ids)
var = var.loc[is_relevant_sponsor]

### Process/explore variables

**Party variables**

In [None]:
# Create the "optimized" party variable:
# 1) Use the party variable from the entity files (party_all) when it is present.
# 2) When it is missing, fall back to the pd_id-level party classifier's
#    prediction (party_all_clf_pdid).
#
# `var` is the result of earlier boolean-mask filtering, so assigning a new
# column onto it directly raises SettingWithCopyWarning. `assign` returns a new
# frame with the extra column instead of writing into the filtered slice.
var = var.assign(
    party_pdid_opt=np.where(var.party_all.isna(), var.party_all_clf_pdid, var.party_all)
)

**Race competitiveness**: 

Rating abbreviations:

- SD: Safe Dem
- DF: Dem favored
- LD: Lean Dem
- C: Too-close-to-call
- LR: Lean Republican
- RF: Republican favored
- SR: Safe Republican

Numeric coding:

- SD and SR races are coded 1 in house2 (least competitive)
- DF and RF are coded 2
- LD and LR are coded 3
- C is coded 4 (most competitive)

In [None]:
# Load the race competitiveness index file.
# This file is not distributed with the notebook; for access to this data, contact the authors.
race = pd.read_csv('../../race_competitiveness.csv')

In [None]:
# Attach the race competitiveness rating to each row by matching its race of
# focus against the race code in the competitiveness file.
# validate='many_to_one' makes the merge fail loudly if a race code appears
# more than once in `race`; otherwise such duplicates would silently duplicate
# rows of `var` and distort the sponsor-level averages computed later.
var = var.merge(
    race[['race_code', 'race_competitiveness']],
    how='left',
    left_on='race_of_focus',
    right_on='race_code',
    validate='many_to_one',
)

**campaign resources**

Available only for the subset of sponsors who invested in TV ads.

In [None]:
# Load the campaign resource table (provides `estcost` per pd_id).
# For access to this data, contact the authors.
cost = pd.read_csv('../input_data/campaign_resource_fb2022.csv')

In [None]:
# Merge the cost column into both regression tables.
# A left join keeps sponsors that have no entry in the cost table (estcost = NaN).
cost_by_sponsor = cost[['pd_id', 'estcost']]
df_cand = pd.merge(df_cand, cost_by_sponsor, how='left', on='pd_id')
df_noncand = pd.merge(df_noncand, cost_by_sponsor, how='left', on='pd_id')

In [None]:
# Log transform.
# Non-positive costs are masked to NaN before taking the log: np.log(0) would
# produce -inf, which the later missing-value filter on log_estimated_cost
# (pd.isna) does not remove and which would then reach the regressions.
# Positive and already-missing values are unaffected.
df_cand['estcost'] = np.log(df_cand['estcost'].where(df_cand['estcost'] > 0))
df_noncand['estcost'] = np.log(df_noncand['estcost'].where(df_noncand['estcost'] > 0))

In [None]:
# The cost column now holds logged values; rename it accordingly in both tables.
df_cand = df_cand.rename(columns={"estcost": "log_estimated_cost"})
df_noncand = df_noncand.rename(columns={"estcost": "log_estimated_cost"})

### Select regression variables

In [None]:
# Columns of `var` carried into the regression data: sponsor id, media type,
# office, sponsor type, party, race of focus, and race competitiveness.
cols = ['pd_id', 'wmp_media_type', 'wmp_office', 'wmp_spontype', 'party_pdid_opt', 
        'race_of_focus', 'race_competitiveness', ]

In [None]:
# Take an independent copy of the selected columns. New indicator columns are
# added to `data` further down; without .copy() those assignments would target
# a slice of `var` and raise SettingWithCopyWarning.
data = var[cols].copy()

In [None]:
# Preview the first two rows of the selected regression columns.
data.head(2)

In [None]:
# List the media types present in the candidate table.
# FB/Instagram don't have plain text ads.
df_cand.wmp_media_type.unique()

In [None]:
# List the distinct office values; 'us senate' is the value used for the senate indicator below.
data.wmp_office.unique()

### Process regression variables

In [None]:
# Media type: 1 for image ads, 0 for everything else.
data['image'] = np.where(data.wmp_media_type == 'image', 1, 0)

# Sponsor-type indicators. Sponsor types not matched by any of the three
# indicators below form the implicit reference category.
data['party'] = np.where(data.wmp_spontype.isin(['party', 'party national']), 1, 0)
data['group'] = np.where(data.wmp_spontype == 'group', 1, 0)
data['other_sponsor_type'] = np.where(
    data.wmp_spontype.isin(['government official', 'coordinated', 'government agency']), 1, 0
)

# Party indicators; rows that are neither 'DEM' nor 'OTHER' are the reference category.
data['Democratic'] = np.where(data.party_pdid_opt == 'DEM', 1, 0)
data['third_party'] = np.where(data.party_pdid_opt == 'OTHER', 1, 0)

# Senate races: flagged either by the office value or by a race-of-focus code
# ending in 'S0'. na=False makes a missing race_of_focus count as "no match"
# explicitly, rather than feeding NaN into the boolean `|` and relying on
# pandas' implicit coercion.
is_senate_office = data.wmp_office == 'us senate'
is_senate_race = data.race_of_focus.str.endswith('S0', na=False)
data['senate'] = np.where(is_senate_office | is_senate_race, 1, 0)

In [None]:
# Aggregate at the advertiser level: one row per (pd_id, media type),
# holding the mean of every numeric column.
by_sponsor_and_media = data.groupby(['pd_id', 'wmp_media_type'])
agg = by_sponsor_and_media.mean(numeric_only=True).reset_index()

### Candidate regression table

In [None]:
# Attach the sponsor-by-media-type covariates to the candidate table.
df_cand = pd.merge(df_cand, agg, how='left', on=['pd_id', 'wmp_media_type'])

In [None]:
# Give the similarity column the name used by the sophistication index.
df_cand = df_cand.rename(columns={'avg': 'AverageSimilarity'})

In [None]:
# Mark every row of the candidate table as a candidate sponsor.
df_cand['candidate'] = 1

In [None]:
# Check the columns of the assembled candidate table.
df_cand.columns

### Non-candidate regression table

In [None]:
# Covariates for non-candidate, non-campaign sponsors (to merge with df_noncand):
# one row per (pd_id, media type, race of focus), holding the mean of every numeric column.
agg2 = (
    data
    .groupby(['pd_id', 'wmp_media_type', 'race_of_focus'])
    .mean(numeric_only=True)
    .reset_index()
)

In [None]:
# Check the columns of the race-level aggregate.
agg2.columns

In [None]:
# Attach the race-level covariates to the non-candidate table, then give the
# similarity column the name used by the sophistication index.
noncand_merge_keys = ['pd_id', 'wmp_media_type', 'race_of_focus']
df_noncand = (
    df_noncand
    .merge(agg2, how='left', on=noncand_merge_keys)
    .rename(columns={'avg': 'AverageSimilarity'})
)

In [None]:
# Mark every row of the non-candidate table as a non-candidate sponsor.
df_noncand['candidate'] = 0

### Combine candidate sponsors with non-candidate sponsors grouped by race of focus

In [None]:
# (rows, columns) of the candidate table before it is stacked with the non-candidate table.
df_cand.shape

In [None]:
# Stack candidate and non-candidate rows; race_of_focus is dropped from the
# non-candidate table first so it is not carried into the combined table.
noncand_without_race = df_noncand.drop(columns="race_of_focus")
df = pd.concat([df_cand, noncand_without_race])

In [None]:
# Check the columns of the combined table.
df.columns

In [None]:
# Final regression table: write it to disk without the index column.
df.to_csv(
    path_or_buf='../input_data/fb_regression_table_for_descriptives.csv',
    index=False,
)

### Import regression table directly

In [None]:
# Reload the saved regression table so the analysis below can start from this cell.
df = pd.read_csv('../input_data/fb_regression_table_for_descriptives.csv')

In [None]:
# Candidate sponsors only. .copy() yields an independent frame, so the columns
# that create_sophistication_index adds later do not raise
# SettingWithCopyWarning on a boolean-mask slice of `df`.
df_cand = df[df.candidate == 1].copy()

### Subset of sponsors who invested in TV ads

In [None]:
# Rows with a non-missing logged cost. .copy() yields an independent frame, so
# the columns that create_sophistication_index adds later do not raise
# SettingWithCopyWarning on a boolean-mask slice of `df`.
df_cost = df[df.log_estimated_cost.notna()].copy()

In [None]:
# Candidate sponsors within the cost subset. .copy() yields an independent
# frame, so the columns that create_sophistication_index adds later do not
# raise SettingWithCopyWarning on a boolean-mask slice of `df_cost`.
df_cand_cost = df_cost[df_cost.candidate == 1].copy()

## Create DV (new method)

In [None]:
def create_sophistication_index(input_df):
    '''
    Add the sophistication index (the dependent variable) to a regression table.

    Two columns are added:
      - num_unique_normalized: min-max scaling of `num_unique` to [0, 1], using
        the minimum and maximum found in `input_df` itself.
      - sophistication_index: num_unique_normalized * (1 - AverageSimilarity).

    Inputs:
          input_df: DataFrame with `num_unique` and `AverageSimilarity` columns

    Returns:
          A new DataFrame with the two columns added. `input_df` is left
          untouched, so passing a filtered slice of another frame neither
          mutates the caller's data nor raises SettingWithCopyWarning.

    Note: the scaling depends on the rows passed in, so index values computed
    on different subsets are not on a common scale. If every row has the same
    `num_unique`, the range is zero and both new columns are NaN.
    '''
    output_df = input_df.copy()

    lowest = output_df.num_unique.min()
    highest = output_df.num_unique.max()

    output_df['num_unique_normalized'] = (output_df.num_unique - lowest) / (highest - lowest)
    output_df['sophistication_index'] = (
        output_df['num_unique_normalized'] * (1 - output_df.AverageSimilarity)
    )
    return output_df

In [None]:
# Add the sophistication index to the full table (normalized over all sponsors in `df`).
df = create_sophistication_index(df)

In [None]:
# Recompute the index on each subset. The function normalizes num_unique with
# the min/max of the frame it receives, so each subset is scaled against its
# own range rather than the range of the full table.
df_cand = create_sophistication_index(df_cand)
df_cost = create_sophistication_index(df_cost)
df_cand_cost = create_sophistication_index(df_cand_cost)

## Regression

In [None]:
import statsmodels.api as sm

### New DV

In [None]:
# Display labels for the regression tables and coefficient plots, keyed by
# model term. Terms not listed here (e.g. 'Democratic') keep their own name.
rename_labels = {'senate': 'Senate', 'race_competitiveness': 'Race competitiveness', 
                  'third_party': 'Third party', 'image': 'Image',
                 'log_estimated_cost': 'TV ad spending (logged)', 
                 'party': 'Party', 'group': 'Group', 
                 'other_sponsor_type': 'Other sponsor type'}

In [None]:
# Models for house/senate candidates only.
# form1 is the baseline specification; form2 adds log_estimated_cost and is
# therefore fit on df_cand_cost, the candidates with non-missing cost data.
form1 = "sophistication_index ~ senate + race_competitiveness \
        + Democratic + third_party + image"

form2 = "sophistication_index ~ senate + race_competitiveness \
        + Democratic + third_party + log_estimated_cost \
        + image"

model1 = smf.ols(formula=form1, data=df_cand).fit()
model2 = smf.ols(formula=form2, data=df_cand_cost).fit()

In [None]:
# Regression table — DV: sophistication index, candidates only.
# cov_order sets the row order of the covariates; rename_labels supplies the
# display names. The table is rendered as HTML for inline display.
cov_order = ['senate', 'race_competitiveness',
             'Democratic', 'third_party', 'log_estimated_cost', 
             'image', 'Intercept'] 

stargazer = Stargazer([model1, model2])
stargazer.covariate_order(cov_order)
stargazer.rename_covariates(rename_labels)
display(HTML(stargazer.render_html()))

In [None]:
# Models for all sponsors (non-campaign sponsors are grouped by race).
# Compared with the candidate-only models, these add the sponsor-type
# indicators (party, group, other_sponsor_type).
# form3 is the baseline specification; form4 adds log_estimated_cost and is
# therefore fit on df_cost, the sponsors with non-missing cost data.
form3 = "sophistication_index ~ senate + party + group + other_sponsor_type + race_competitiveness \
        + Democratic + third_party + image"

form4 = "sophistication_index ~ senate + party + group + other_sponsor_type + race_competitiveness \
        + Democratic + third_party + log_estimated_cost \
        + image"

model3 = smf.ols(formula=form3, data=df).fit()
model4 = smf.ols(formula=form4, data=df_cost).fit()

In [None]:
# Regression table — DV: sophistication index, candidates plus non-candidates
# grouped by race of focus.
# cov_order sets the row order of the covariates; rename_labels supplies the
# display names. The table is rendered as HTML for inline display.
cov_order = ['senate', 'party', 'group', 'other_sponsor_type', 'race_competitiveness',
             'Democratic', 'third_party', 'log_estimated_cost', 
             'image', 'Intercept'] 

stargazer = Stargazer([model3, model4])
stargazer.covariate_order(cov_order)
stargazer.rename_covariates(rename_labels)
display(HTML(stargazer.render_html()))

## Main effects plots

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.lines as mlines

In [None]:
def plot_coefficents(m1, m2, offset=-0.15):
    '''
    Generate a coefficients plot given two models.

    Every non-intercept term is drawn as a marker at its coefficient with a
    horizontal bar whose half-width is half the width of the interval returned
    by ``conf_int()``. Terms are listed on the y-axis; when a term appears in
    both models, the second occurrence is shifted by `offset` so the two
    markers do not overlap.

    Inputs:
          m1, m2: two fitted OLS regression results (must provide ``params``
                  and ``conf_int()``); shown as 'Model 1' and 'Model 2'
          offset: display distance between results from two models on the axes

    Display names for the terms come from the notebook-level ``rename_labels``
    dict. The figure is shown with ``plt.show()``; nothing is returned.
    '''
    def _coef_table(model, model_label):
        # One row per term: interval bounds, point estimate, and the owning model.
        table = model.conf_int()
        table['coef'] = model.params
        table.columns = ['lower', 'upper', 'coef']
        table['model'] = model_label
        return table

    # Colour is tied to the model label so markers and legend always agree,
    # even when one model contributes no plottable terms.
    model_colors = {'Model 1': 'black', 'Model 2': 'gray'}

    conf12 = pd.concat([_coef_table(m1, 'Model 1'), _coef_table(m2, 'Model 2')])
    conf12 = conf12.rename(index=rename_labels)
    # .copy() so the column assignments below act on an independent frame
    # instead of a boolean-mask slice (avoids SettingWithCopyWarning).
    conf12 = conf12[conf12.index != 'Intercept'].copy()

    # Add offset to separate the models: the n-th occurrence of a term is
    # shifted by n * offset.
    conf12['y_offset'] = conf12.groupby(conf12.index).cumcount() * offset

    # Reverse the order so the first term ends up at the top of the y-axis.
    labels = list(conf12.index.unique()[::-1])
    y_steps = {label: position for position, label in enumerate(labels)}
    conf12['y_step'] = conf12.index.map(y_steps)

    # Plotting: create a single figure and pass the size to it directly.
    # (A separate plt.figure(figsize=...) call followed by plt.subplots()
    # leaves an empty figure behind and never sizes the one that is drawn on.)
    fig, ax = plt.subplots(figsize=(10, 8))

    for model_label, color in model_colors.items():
        curr_coef = conf12[conf12.model == model_label]
        if curr_coef.empty:
            continue
        ax.errorbar(y=curr_coef['y_offset'] + curr_coef['y_step'],
                    x=curr_coef['coef'],
                    xerr=(curr_coef['upper'] - curr_coef['lower']) / 2,
                    fmt='o',
                    color=color)

    # Create legend
    handles = [
        mlines.Line2D([], [], color=color, linestyle='-', marker='o', markersize=4, label=model_label)
        for model_label, color in model_colors.items()
    ]
    ax.legend(handles=handles, loc='best')

    # Format yticks labels and other labels
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)
    ax.set_title('')
    ax.set_ylabel('Predictor')
    ax.set_xlabel('Coefficient')
    ax.axvline(0, color='grey', linestyle='--')
    plt.show()

In [None]:
# Coefficient plot for the candidate-only models (model1 vs. model2).
plot_coefficents(model1, model2)

In [None]:
# Coefficient plot for the all-sponsor models (model3 vs. model4), with a
# larger gap between the two models' markers than the default.
plot_coefficents(model3, model4,offset=-0.25)