In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
source_path = ''  # Local directory holding the input .csv.gz files; '' resolves them relative to the working directory

In [None]:
# Load the ad-level variables file (fb_2022_adid_var.csv.gz) from source_path.
var = pd.read_csv(os.path.join(source_path, 'fb_2022_adid_var.csv.gz'))

In [None]:
# Set 3: keep only the rows whose set3 flag equals 1.
var = var[var['set3'] == 1]

In [None]:
# Load the ad-level text file (fb_2022_adid_text.csv.gz) from source_path.
text = pd.read_csv(os.path.join(source_path, 'fb_2022_adid_text.csv.gz'))

In [None]:
# Attach the text columns to each ad; the left join keeps every row of `var`.
var = pd.merge(var, text, on='ad_id', how='left')

In [None]:
# Inspect the columns available after merging in the text file.
var.columns

In [None]:
# Regression table: WMP Set 3 (sponsor-media level)
# (Written as a comment: the former ''''' delimiters parsed as a triple-quoted
# string with stray quote characters followed by an empty string.)
df = pd.read_csv("../input_data/fb_regression_table_for_descriptives.csv")

In [None]:
# Inspect the columns of the regression table.
df.columns

In [None]:
# All unique sponsors (a missing pd_id, if any, counts as one value)
df['pd_id'].nunique(dropna=False)

In [None]:
# Number of sponsors who invested in TV ads, i.e. sponsors with a non-missing
# log_estimated_cost. `subset` is given as a list because a bare string label
# is only accepted by pandas >= 1.5.
len(df.dropna(subset=['log_estimated_cost']).pd_id.unique())

In [None]:
# Candidate rows only (candidate == 1); print the resulting (rows, columns).
df_cand = df.loc[df['candidate'] == 1]
print(df_cand.shape)

In [None]:
# Mean of AverageSimilarity across all rows of the regression table.
df.AverageSimilarity.mean()

In [None]:
# DV: distribution of AverageSimilarity over all rows, with a KDE overlay.
sns.set_theme(style='ticks', palette="tab10")

sns.histplot(
    df['AverageSimilarity'],
    bins=40,
    kde=True,
    color='gray',
    line_kws={'color': 'gray'},
)

In [None]:
# Same distribution, restricted to the candidate rows.
sns.histplot(
    df_cand['AverageSimilarity'],
    bins=40,
    kde=True,
    color='gray',
)

In [None]:
# Attach page_name and disclaimer to the regression table. The lookup is first
# reduced to one row per pd_id: a pd_id that appears with several
# page_name/disclaimer combinations would otherwise fan out the left join,
# duplicating rows of df and inflating the sums and means computed from it.
# validate='many_to_one' makes pandas raise if that invariant is ever broken.
df = df.merge(
    var[['pd_id', 'page_name', 'disclaimer']].drop_duplicates(subset='pd_id'),
    how='left',
    on='pd_id',
    validate='many_to_one',
)

In [None]:
# Mean of AverageSimilarity again, now on the merged frame.
# NOTE(review): presumably a sanity check that the merge did not change the rows -- confirm.
df.AverageSimilarity.mean()

In [None]:
# Collapse to one row per pd_id: keep the first page_name and disclaimer seen
# and total num_unique over all of that pd_id's rows.
df_agg = (
    df.groupby(['pd_id'])
    .agg(
        page_name=('page_name', 'first'),
        disclaimer=('disclaimer', 'first'),
        num_unique=('num_unique', 'sum'),
    )
    .reset_index()
)

In [None]:
# Mean of the per-pd_id num_unique totals.
df_agg.num_unique.mean()

In [None]:
# Median of the per-pd_id num_unique totals.
df_agg.num_unique.median()

In [None]:
# The 40 rows of df_agg with the largest num_unique totals.
df_agg.sort_values(by='num_unique', ascending=False).head(40)

In [None]:
# Sponsor-level histogram of the aggregated num_unique totals.
# The x-axis is clipped to 0-100, so larger totals fall outside the visible range.
g = sns.histplot(df_agg['num_unique'], bins=200, kde=True, color='gray')
g.set(xlabel='', title='Number of unique creatives at the sponsor level\n\nMeta')
plt.xlim(0, 100)

## Table to display in the draft

In [None]:
from IPython.display import display, HTML

In [None]:
# Candidate rows only (candidate == 1), taken from the merged frame.
# The explicit copy makes df_cand an independent frame, so the column
# assignments made to it in later cells do not write into a slice of df
# (which raises SettingWithCopyWarning).
df_cand = df[df.candidate == 1].copy()
df_cand.shape

In [None]:
# Upper-case the disclaimer text to use as the sponsor label. assign() builds
# a new frame, so nothing is written into a slice of df and re-running the
# cell gives the same result.
df_cand = df_cand.assign(sponsor=df_cand.disclaimer.str.upper())

In [None]:
def create_sophistication_index(input_df):
    """Add a min-max-normalized creative count and a sophistication index.

    Two columns are added:

    * ``num_unique_normalized``: ``num_unique`` rescaled to [0, 1] using the
      minimum and maximum found in ``input_df``.
    * ``sophistication_index``:
      ``num_unique_normalized * (1 - AverageSimilarity)``.

    Args:
        input_df: DataFrame with ``num_unique`` and ``AverageSimilarity``
            columns.

    Returns:
        A new DataFrame with the two columns added. ``input_df`` itself is
        left unmodified.
    """
    # Work on a copy so a filtered slice passed in by the caller is never
    # written to (avoids SettingWithCopyWarning and hidden mutation).
    result = input_df.copy()

    min_ = result.num_unique.min()
    value_range = result.num_unique.max() - min_
    shifted = result.num_unique - min_

    if value_range == 0:
        # Every count is identical, so 0/0 would turn the whole column into
        # NaN. The shifted values are already 0 (missing counts stay NaN).
        result['num_unique_normalized'] = shifted.astype(float)
    else:
        result['num_unique_normalized'] = shifted / value_range

    result['sophistication_index'] = (
        result['num_unique_normalized'] * (1 - result.AverageSimilarity)
    )
    return result

In [None]:
# Add num_unique_normalized and sophistication_index to the candidate rows.
df_cand = create_sophistication_index(df_cand)

In [None]:
# Drop fully duplicated rows. Reassigning instead of inplace=True avoids
# mutating a frame that may still be a slice of df and keeps the cell
# re-runnable.
df_cand = df_cand.drop_duplicates()

In [None]:
# Presentation formatting for the table: title-case the text columns and round
# the two scores to 3 decimals. A single assign() returns a new frame rather
# than writing column by column into a frame that may be a slice of df.
df_cand = df_cand.assign(
    sponsor=df_cand.sponsor.str.title(),
    wmp_media_type=df_cand.wmp_media_type.str.title(),
    AverageSimilarity=df_cand.AverageSimilarity.round(3),
    sophistication_index=df_cand.sophistication_index.round(3),
)

In [None]:
# Build the table for the draft: the 20 distinct rows with the highest
# sophistication index, with human-readable column headers, rendered as HTML.
cols2display = [
    'sponsor',
    'sophistication_index',
    'wmp_media_type',
    'num_unique',
    'AverageSimilarity',
]

rename_dict = {
    'sponsor': 'Sponsor',
    'sophistication_index': 'Sophistication index',
    'wmp_media_type': 'Media type',
    'num_unique': 'Unique creatives',
    'AverageSimilarity': 'Average text similarity',
}

display_df = (
    df_cand[cols2display]
    .sort_values(by='sophistication_index', ascending=False)
    .drop_duplicates()
    .head(20)
    .rename(columns=rename_dict)
)

display(HTML(display_df.to_html(index=False)))