Adding post-processed variables for Google 2022 and create the "var table" (g2022_adid_var.csv). 

Merging output from these repos: [entity_linking_2022](https://github.com/Wesleyan-Media-Project/entity_linking_2022), [ABSA](https://github.com/Wesleyan-Media-Project/ABSA), [race_of_focus](https://github.com/Wesleyan-Media-Project/race_of_focus),  [party_classifier](https://github.com/Wesleyan-Media-Project/party_classifier), [ad_tone](https://github.com/Wesleyan-Media-Project/ad_tone), [ad_goal_classifier](https://github.com/Wesleyan-Media-Project/ad_goal_classifier), [party_classifier_pdid](https://github.com/Wesleyan-Media-Project/party_classifier_pdid), and [issue_classifier](https://github.com/Wesleyan-Media-Project/issue_classifier)

### Output table:
+ g2022_adid_var.csv

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
'''
Load the var1 table produced from 

data-post-production/01-merge-results/01_merge_preprocessed_multimedia_results.ipynb
'''
df = pd.read_csv('../../../data_post_production/g2022_adid_01062021_11082022_var1.csv.gz')

In [3]:
df.shape

(179263, 14)

# Adding data classification results to "var1" table
**Note: replace all data import paths below with your local filepaths**

## Add race of focus

In [4]:
rof = pd.read_csv('../../../race_of_focus/data/race_of_focus_google_2022.csv')

In [6]:
rof.head(2)

Unnamed: 0,ad_id,sub_bucket,race_of_focus
0,CR00000257354440376321,2.0,Downballot
1,CR00000354386341527553,3.3,No race of focus


In [7]:
df = df.merge(rof, on='ad_id', how='left')

## Add ad tone

### Constructed

In [8]:
tone_constructed = pd.read_csv('../../../ad_tone/data/ad_tone_constructed_g2022.csv.gz')

In [9]:
tone_constructed.shape

(58269, 2)

In [10]:
df = df.merge(tone_constructed, on='ad_id', how='left').drop_duplicates()

### Mentioned

In [11]:
tone_mentioned = pd.read_csv('../../../ad_tone/data/ad_tone_mentionbased_g2022.csv')

In [12]:
tone_mentioned.shape

(24366, 2)

In [13]:
df = df.merge(tone_mentioned, on='ad_id', how='left').drop_duplicates()

In [14]:
df.rename(columns={'ad_tone':'ad_tone_mentionbased'}, inplace=True)

## Add ad goal

In [15]:
goal = pd.read_csv('../../../ad_goal_classifier/data/ad_goal_rf_google_2022.csv.gz')

In [16]:
goal.shape

(179743, 33)

In [17]:
goal.head(2)

Unnamed: 0,ad_id,wmp_creative_id,ad_type,csum_agg,advertiser_id,aws_face_vid,aws_face_img,impressions,age_targeting,gender_targeting,...,goal_EVENT_predicted_prob,goal_POLL_prediction,goal_POLL_predicted_prob,goal_GATHERINFO_prediction,goal_GATHERINFO_predicted_prob,goal_LEARNMORE_prediction,goal_LEARNMORE_predicted_prob,goal_PRIMARY_PERSUADE_prediction,goal_PRIMARY_PERSUADE_predicted_prob,goal_highest_prob
0,CR00000257354440376321,cid_16941,VIDEO,3a0b45af177cfef8675852003ebb1e57838275d50d20b6...,AR08588079303567081473,WMPID5311;WMPID5292,,400000-450000,"18-24, 25-34, 35-44, 45-54, 55-64, ≥65","Male, Female, Unknown gender",...,0.003441,0,0.006437,0,0.016071,1,0.781739,1,0.94758,PRIMARY_PERSUADE
1,CR00000354386341527553,cid_1742,TEXT,,AR03715945093920718849,,,0-1000,,,...,0.004714,0,0.005244,0,0.013358,1,0.901786,1,0.938611,PRIMARY_PERSUADE


In [18]:
cols = ['ad_id', 'goal_DONATE_prediction',
       'goal_DONATE_predicted_prob', 'goal_CONTACT_prediction',
       'goal_CONTACT_predicted_prob', 'goal_PURCHASE_prediction',
       'goal_PURCHASE_predicted_prob', 'goal_GOTV_prediction',
       'goal_GOTV_predicted_prob', 'goal_EVENT_prediction',
       'goal_EVENT_predicted_prob', 'goal_POLL_prediction',
       'goal_POLL_predicted_prob', 'goal_GATHERINFO_prediction',
       'goal_GATHERINFO_predicted_prob', 'goal_LEARNMORE_prediction',
       'goal_LEARNMORE_predicted_prob', 'goal_PRIMARY_PERSUADE_prediction',
       'goal_PRIMARY_PERSUADE_predicted_prob', 'goal_highest_prob']

In [19]:
goal = goal[cols]

In [20]:
df = df.merge(goal, on='ad_id', how='left').drop_duplicates()

## Add party classifier trained at the advertiser level


In [21]:
party_advertiser = pd.read_csv('../../../party_classifier_pdid/party_all_clf_google_2022_advertiser_id.csv')

In [22]:
party_advertiser.shape

(5148, 2)

In [23]:
party_advertiser.head(2)

Unnamed: 0,advertiser_id,party_all_clf
0,AR00000475401340059649,REP
1,AR00008638175664668673,DEM


In [24]:
df = df.merge(party_advertiser, on='advertiser_id', how='left')

## Add party classifier at ad level

In [25]:
party_ad = pd.read_csv('../../../party_classifier (Ad Level)/data/google/party_predictions_google_2022.csv.gz')

In [26]:
party_ad.head(2)

Unnamed: 0,ad_id,prob_dem,prob_other,prob_rep,predicted_party_all,predicted_party_all_majvote
0,CR00115889762518171649,0.523488,0.036479,0.440033,DEM,DEM
1,CR00336564632430837761,0.523488,0.036479,0.440033,DEM,DEM


In [27]:
party_ad.rename(columns={'predicted_party_all': 'party_all_clf_adid', 
                         'predicted_party_all_majvote': 'party_all_clf_adid_agg'}, inplace=True)
party_ad.head(2)

Unnamed: 0,ad_id,prob_dem,prob_other,prob_rep,party_all_clf_adid,party_all_clf_adid_agg
0,CR00115889762518171649,0.523488,0.036479,0.440033,DEM,DEM
1,CR00336564632430837761,0.523488,0.036479,0.440033,DEM,DEM


In [28]:
df = df.merge(party_ad, on='ad_id', how='left').drop_duplicates()

## WMP party_all

In [29]:
wmp = pd.read_csv('../../../datasets/wmp_entity_files/Google/wmp_google_2022_entities_v112822.csv', encoding='latin1', usecols=['advertiser_id', 'party_all'])

In [30]:
wmp = wmp.replace('MISSING', np.nan)

In [31]:
wmp.party_all.value_counts(dropna=False)

party_all
NaN      3155
DEM       150
REP       123
OTHER       6
Name: count, dtype: int64

In [32]:
df = df.merge(wmp, on='advertiser_id', how='left')

## Add detected entities

In [33]:
ent = pd.read_csv('../../../entity_linking_2022/google/data/entity_linking_results_google_2022_notext_combined.csv.gz')

In [34]:
ent.shape

(71903, 3)

In [35]:
ent.head(2)

Unnamed: 0,ad_id,detected_entities,field
0,CR00000257354440376321,"WMPID5311, WMPID5311, WMPID5292, WMPID5311, WM...","advertiser_name, google_asr_text, aws_ocr_vide..."
1,CR00001421943412621313,WMPID1188,ad_text


In [36]:
ent['detected_entities'] = [i.replace('|', ',') for i in ent['detected_entities']]

In [37]:
ent.head(3)

Unnamed: 0,ad_id,detected_entities,field
0,CR00000257354440376321,"WMPID5311, WMPID5311, WMPID5292, WMPID5311, WM...","advertiser_name, google_asr_text, aws_ocr_vide..."
1,CR00001421943412621313,WMPID1188,ad_text
2,CR00001435481149538305,WMPID3770,aws_ocr_video_text


In [38]:
df = df.merge(ent, on='ad_id', how='left')

In [39]:
df.shape

(185966, 46)

## Add detected entities federal

In [40]:
cand = pd.read_csv('../../../datasets/candidates/wmpcand_120223_wmpid.csv')

In [41]:
cand.head(2)

Unnamed: 0,cand_id,wmpid,genelect_cd,CurrCand,cand_name,cand_office,cand_office_st,cand_office_dist,cand_party_affiliation,cand_incumbent_challenger_open_s,...,latino_crp,race_wmp,race_crp1,race_crp2,race_crpmena,hse_cmpt_gen,full_name,first_name,last_name,st_dist
0,H0AL01055,WMPID21,1,Y,"CARL, JERRY LEE, JR",H,AL,1.0,REP,INCUMBENT,...,N,White,W,,N,0,Jerry Carl,Jerry,Carl,AL1
1,H0AL02202,WMPID24,1,Y,"HARVEY-HALL, PHYLLIS",H,AL,2.0,DEM,CHALLENGER,...,n,Black,B,,n,0,Phyllis Harvey-Hall,Phyllis,Harvey-Hall,AL2


In [42]:
cand['cand_office'].value_counts(dropna=False)

cand_office
H    3612
S     683
Name: count, dtype: int64

In [43]:
candlist = cand['wmpid'].tolist()

In [44]:
df['detected_entities_federal'] = df['detected_entities'].apply(lambda x: ','.join(list(set([item.strip() for item in str(x).split(',') if item.strip() in candlist]))))


In [45]:
df['detected_entities_federal']= df['detected_entities_federal'].replace('', np.NaN)

## Add attack like

In [46]:
attacklike = pd.read_csv('../../../attack_like/google2022_attack-like.csv')

In [47]:
attacklike.head(2)

Unnamed: 0,ad_id,attacklike1_pred,attacklike1_prob,attacklike2_pred,attacklike2_prob,attacklike3_pred
0,CR00002202734107295745,1.0,0.665,1.0,0.989,2.0
1,CR00002786574781644801,1.0,0.81,1.0,0.976,2.0


In [48]:
attacklike.rename(columns={'attacklike1_pred': 'attacklike1_bert_pred', 'attacklike1_prob': 'attacklike1_bert_prob',
                           'attacklike2_pred': 'attacklike2_bert_pred', 'attacklike2_prob': 'attacklike2_bert_prob',
                          'attacklike3_pred': 'attacklike3_bert_pred'}, inplace=True)

In [49]:
df = df.merge(attacklike, on='ad_id', how='left')

### Attack like sentiment

In [50]:
attacklike_sent = pd.read_csv("../../../attack_like/google2022_senti.csv")

In [51]:
attacklike_sent.head(2)

Unnamed: 0,ad_id,btweet_senti,btweet_prob
0,CR00000257354440376321,POS,0.890607
1,CR00000354386341527553,POS,0.967603


In [52]:
attacklike_sent.rename(columns={'btweet_senti': 'attacklike_senti_pred', 
                                'btweet_prob': 'attacklike_senti_prob'}, inplace=True)

In [53]:
df = df.merge(attacklike_sent, on='ad_id', how='left').drop_duplicates()

## Add ABSA

In [54]:
sent = pd.read_csv("../../../ABSA/data/google_2022_ABSA_pred.csv.gz")

In [55]:
sent.head(2)

Unnamed: 0,ad_id,field,detected_entities,start,end,predicted_sentiment
0,CR12885901962544939009,advertiser_name,WMPID3138,33,39,1
1,CR18028502563606757377,advertiser_name,WMPID1566,0,7,1


In [56]:
grouped = sent.groupby(['ad_id', 'detected_entities'])
average_scores = grouped['predicted_sentiment'].sum().reset_index()
repetition_count = grouped.size().reset_index(name='number_of_mentions')
sent2 = pd.merge(average_scores, repetition_count, on=['ad_id', 'detected_entities'])

In [57]:
sent2.rename(columns={'detected_entities': 'ABSA_detected_entities', 
                      'predicted_sentiment': 'ABSA_predicted_sentiment_agg',
                      'number_of_mentions': 'ABSA_number_of_mentions'}, inplace=True)

In [58]:
agg_sent = sent2.groupby('ad_id').agg({'ABSA_number_of_mentions': lambda x: ','.join(map(str, x)),
                                       'ABSA_detected_entities': lambda x: ','.join(x),
                                       'ABSA_predicted_sentiment_agg': lambda x: ','.join(map(str, x))}).reset_index()

In [59]:
agg_sent.head(2)

Unnamed: 0,ad_id,ABSA_number_of_mentions,ABSA_detected_entities,ABSA_predicted_sentiment_agg
0,CR00000257354440376321,26,"WMPID5292,WMPID5311",-22
1,CR00001421943412621313,1,WMPID1188,-1


In [60]:
df = df.merge(agg_sent, on='ad_id', how='left')

## Add issue classifiers

In [61]:
issue = pd.read_csv('../../../issue_classifier/google_2022/data/g2022_output_processed.csv')

In [9]:
issue.head(2)

Unnamed: 0,ad_id,issue_field,issue_class
0,CR00004220544102694913,ad_text,ISSUE43
1,CR00006144689451302913,aws_ocr_img_text,ISSUE18


In [63]:
df = df.merge(issue, on='ad_id', how='left')

## Add AWS face federal

In [64]:
df['aws_face'] = df.apply(lambda row: ','.join(set(str(row['aws_face_img']).split(',')) | set(str(row['aws_face_vid']).split(','))) if (not pd.isnull(row['aws_face_img']) and not pd.isnull(row['aws_face_vid'])) else ','.join([item for item in str(row['aws_face_img']).split(',') + str(row['aws_face_vid']).split(',') if item != 'nan']) if pd.isnull(row['aws_face_img']) or pd.isnull(row['aws_face_vid']) else '', axis=1)


In [65]:
df['aws_face_federal'] = df['aws_face'].apply(lambda x: ','.join(list(set([item.strip() for item in str(x).split(',') if item.strip() in candlist]))))


In [66]:
df['aws_face_federal']= df['aws_face_federal'].replace('', np.NaN)

In [67]:
df['combined_entities_federal'] = (df['detected_entities_federal'].fillna('') + ',' + df['aws_face_federal'].fillna('')).str.strip(',')

In [68]:
df['combined_entities_federal']= df['combined_entities_federal'].replace('', np.NaN)

In [69]:
df['combined_entities_federal'] = df['combined_entities_federal'].apply(lambda x: list(set(x.split(','))) if isinstance(x, str) else np.nan)


In [70]:
df['combined_entities_federal'] = df['combined_entities_federal'].apply(lambda x: ','.join(x) if isinstance(x, list) else np.nan)

## Add federal verified

In [71]:
df['federal_verified'] = df[['aws_face_federal','detected_entities_federal']].isna().sum(axis=1).apply(lambda x: 'No' if x==2 else 'Yes')

In [72]:
df.federal_verified.value_counts(dropna=False)

federal_verified
No     128364
Yes     59554
Name: count, dtype: int64

## Save final results

In [73]:
df.columns

Index(['ad_id', 'advertiser_id', 'date_range_start', 'date_range_end',
       'num_of_days', 'impressions', 'age_targeting', 'gender_targeting',
       'geo_targeting_included', 'geo_targeting_excluded',
       'spend_range_min_usd', 'spend_range_max_usd', 'aws_face_vid',
       'aws_face_img', 'sub_bucket', 'race_of_focus', 'ad_tone_constructed',
       'ad_tone_mentionbased', 'goal_DONATE_prediction',
       'goal_DONATE_predicted_prob', 'goal_CONTACT_prediction',
       'goal_CONTACT_predicted_prob', 'goal_PURCHASE_prediction',
       'goal_PURCHASE_predicted_prob', 'goal_GOTV_prediction',
       'goal_GOTV_predicted_prob', 'goal_EVENT_prediction',
       'goal_EVENT_predicted_prob', 'goal_POLL_prediction',
       'goal_POLL_predicted_prob', 'goal_GATHERINFO_prediction',
       'goal_GATHERINFO_predicted_prob', 'goal_LEARNMORE_prediction',
       'goal_LEARNMORE_predicted_prob', 'goal_PRIMARY_PERSUADE_prediction',
       'goal_PRIMARY_PERSUADE_predicted_prob', 'goal_highest_prob',
 

In [74]:
OUTFILE = "../../g2022_adid_var.csv.gz"

df.to_csv(OUTFILE, 
        index=False,
         compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1})