### Apply party_all_entity_clf to all entities in the 1.4m dataset

In [1]:
import pandas as pd
from joblib import load

In [2]:
# Ad-level text fields, one row per ad_id (gzip-compressed CSV).
d = pd.read_csv('../fb_2020/fb_2020_140m_adid_text_clean.csv.gz')

In [3]:
# Only the ad_id -> pd_id mapping is read from the variables file.
pdid = pd.read_csv('../fb_2020/fb_2020_140m_adid_var1.csv.gz', usecols=['ad_id', 'pd_id'])

In [4]:
# Attach pd_id to every ad. A left join keeps all rows of the text table;
# ads with no match in `pdid` end up with a missing pd_id.
# NOTE: this rebinds `d`, so re-running this cell without reloading `d` merges a
# second time and yields pd_id_x / pd_id_y columns instead of pd_id.
d = pd.merge(d, pdid, how='left', on='ad_id')

In [5]:
# Preview the merged frame to confirm pd_id was attached.
d.head()

Unnamed: 0,ad_id,page_name,disclaimer,ad_creative_body,ad_snapshot_url,ad_creative_link_caption,ad_creative_link_title,ad_creative_link_description,google_asr_text,aws_ocr_text,pd_id
0,x999618737203554,Texas Organizing Project,TEXAS ORGANIZING PROJECT POLITICAL ACTION COMM...,Early voting is here and we have the special o...,https://www.facebook.com/ads/library/?id=99961...,TOPpac.org TOPpac.org TOPpac.org TOPpac.org TO...,Texas Organizing Project,,,Genesis Draper. County Criminal Court No. 12. ...,pd-124974587580398-7
1,x387950208912185,League of Conservation Voters,League of Conservation Voters Education Fund,Our communities are burning. Our air is unbrea...,https://www.facebook.com/ads/library/?id=38795...,fb.me,DEMAND Climate Action Now,"Goal: 5,000 actions by Sunday",,I,pd-16477459734-5
2,x261774408593130,BlackPAC,Black PAC,We are better than this. It is time for us to ...,https://www.facebook.com/ads/library/?id=26177...,www.blackfloridavote.com,Vote Early,Make A Plan Today,We're experiencing a moral Reckoning with raci...,,pd-504161563089343-5
3,x636161653936319,Better PA,"Better Pennsylvania, Inc.","Just like your parents told you, patience is a...",https://www.facebook.com/ads/library/?id=63616...,,,,,,pd-112378316832885-1
4,x405984460395245,Florida Democratic Party,Florida Democratic Party,,https://www.facebook.com/ads/library/?id=40598...,FloridaDems.org,,,,,pd-7833534974-3


### Prepare text for inference

In [6]:
# Text fields that are concatenated, in this order, into one string per ad.
cols = [
    'disclaimer',
    'page_name',
    'ad_creative_body',
    'ad_creative_link_caption',
    'ad_creative_link_description',
    'ad_creative_link_title',
    'aws_ocr_text',
    'google_asr_text',
]


def _join_fields(row):
    """Return the row's fields as a single space-separated string."""
    # NOTE(review): missing fields (NaN) are stringified to the literal 'nan'.
    # Presumably the saved classifier was trained on text built the same way,
    # so this is kept as-is -- confirm against the training notebook before changing.
    return ' '.join(row.values.astype(str))


d['ad_combined_text'] = d[cols].apply(_join_fields, axis=1)

In [7]:
# Row and column counts after adding ad_combined_text.
d.shape

(1404005, 12)

In [8]:
# Number of distinct pd_id values (a missing pd_id, if any, is counted once).
d['pd_id'].nunique(dropna=False)

15164

In [9]:
# Deduplicate before concatenating ad texts: identical (pd_id, text) pairs are
# collapsed so repeated ad copy is only counted once per pd_id. The last
# occurrence of each pair is the row that is kept.
is_repeat = d.duplicated(subset=['pd_id', 'ad_combined_text'], keep='last')
dd = d[~is_repeat]

In [10]:
# Row count remaining after dropping duplicate (pd_id, text) pairs.
dd.shape

(277519, 12)

In [11]:
# Preview the deduplicated rows (the original row index is retained).
dd.head()

Unnamed: 0,ad_id,page_name,disclaimer,ad_creative_body,ad_snapshot_url,ad_creative_link_caption,ad_creative_link_title,ad_creative_link_description,google_asr_text,aws_ocr_text,pd_id,ad_combined_text
0,x999618737203554,Texas Organizing Project,TEXAS ORGANIZING PROJECT POLITICAL ACTION COMM...,Early voting is here and we have the special o...,https://www.facebook.com/ads/library/?id=99961...,TOPpac.org TOPpac.org TOPpac.org TOPpac.org TO...,Texas Organizing Project,,,Genesis Draper. County Criminal Court No. 12. ...,pd-124974587580398-7,TEXAS ORGANIZING PROJECT POLITICAL ACTION COMM...
8,x1298334053850238,Rick Bennett for Senate,Rick Bennett for Senate,"Twenty years ago, I was unanimously elected St...",https://www.facebook.com/ads/library/?id=12983...,rickbennett.org,Common Sense and Common Ground,Candidate for Maine State Senate District 19,,EDMONDS MILLS TURNER. FERGUSON MITCHELL WOODCO...,pd-105383080873129-1,Rick Bennett for Senate Rick Bennett for Senat...
16,x352464732653623,Realtors for Ben McAdams,National Association of REALTORS,Utah REALTORS® have a champion in Ben McAdams....,https://www.facebook.com/ads/library/?id=35246...,realtorsformcadams.com,Vote McAdams November 3,Click here to learn more >>,,,pd-107849021067647-1,National Association of REALTORS Realtors for ...
19,x3079465968824749,Doug Collins,"COLLINS FOR SENATE, INC.",CLEVELAND: I'm hitting the road with fellow Tr...,https://www.facebook.com/ads/library/?id=30794...,Meet Trump Defender Doug Collins w/Special Gue...,Meet Trump Defender Doug Collins w/Special Gue...,"89 E Jarrard St, Cleveland, GA 30528-1228, Uni...",,Collins. TRUMP DEFENDER. STATEWIDE TOUR,pd-253791814655400-5,"COLLINS FOR SENATE, INC. Doug Collins CLEVELAN..."
23,x813228526100156,Jennifer Pawlik,Jennifer Pawlik for AZ House. Authorized by Je...,The last day to mail ballots is October 27. Yo...,https://www.facebook.com/ads/library/?id=81322...,,,,,Vote or Return Your. Ballot for Jennifer. Pawl...,pd-153755258313622-1,Jennifer Pawlik for AZ House. Authorized by Je...


In [12]:
# Collapse to one row per pd_id: all of its distinct ad texts joined by spaces.
d_pdid_txt = (
    dd.groupby('pd_id')['ad_combined_text']
    .agg(' '.join)
    .reset_index()
)

In [13]:
# Preview the per-pd_id concatenated text.
d_pdid_txt.head()

Unnamed: 0,pd_id,ad_combined_text
0,pd-1000079383451499-1,George A Jackson Defender Association of Phila...
1,pd-1000253160054572-4,CORI BUSH FOR CONGRESS Cori Bush I didn’t run ...
2,pd-1000359586797421-2,Devine For Judge Devine For Judge Judge Josh D...
3,pd-100053898949-2,MONTANA REPUBLICAN STATE CENTRAL COMMITTEE Mon...
4,pd-1000723093289140-1,These ads ran without a disclaimer Ombudsman S...


### Run inference


In [14]:
# Load the fitted classifier that was serialized with joblib.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
# Presumably this is a full text pipeline (vectorizer + classifier), since
# predict() is later called on raw strings -- TODO confirm.
mnb_clf = load('party_clf_pdid_mnb.joblib')



In [15]:
# Predict a party label from each pd_id's concatenated ad text.
pred = mnb_clf.predict(d_pdid_txt['ad_combined_text'])

In [16]:
# Store the predictions as a new column; they are attached by position,
# so `pred` must come from the current row order of d_pdid_txt.
d_pdid_txt['party_all_clf_pdid'] = pred

In [17]:
# Keep only the key and the predicted label for export.
# NOTE: this rebinds d_pdid_txt without ad_combined_text, so the predict cell
# cannot be re-run afterwards unless the groupby cell is re-run first.
d_pdid_txt = d_pdid_txt.loc[:, ['pd_id', 'party_all_clf_pdid']]

In [18]:
# Preview the pd_id -> predicted party table.
d_pdid_txt.head()

Unnamed: 0,pd_id,party_all_clf_pdid
0,pd-1000079383451499-1,DEM
1,pd-1000253160054572-4,DEM
2,pd-1000359586797421-2,REP
3,pd-100053898949-2,REP
4,pd-1000723093289140-1,DEM


In [19]:
# Distribution of predicted labels across pd_ids.
d_pdid_txt.party_all_clf_pdid.value_counts()

DEM      10223
REP       4828
OTHER      113
Name: party_all_clf_pdid, dtype: int64

In [20]:
# Final table size: the row count should equal the number of unique pd_ids counted earlier.
d_pdid_txt.shape

(15164, 2)

In [21]:
# Write the pd_id -> predicted party table to the working directory;
# index=False leaves the DataFrame row index out of the file.
d_pdid_txt.to_csv("party_all_clf_pdid_fb_2020_140m.csv", index=False)