### Apply party_all_entity_clf to all entities in the 1.4m dataset

In [1]:
import pandas as pd
from joblib import load

In [2]:
# Load the ad-level text table. Its text columns are later concatenated into
# `combined`, and it is joined to the pd_id mapping on `ad_id`.
d = pd.read_csv('../fb_2020/fb_2020_140m_adid_text_clean.csv.gz')

In [3]:
# Read only the two columns needed to map each ad_id to its pd_id.
pdid = pd.read_csv('../fb_2020/fb_2020_140m_adid_var1.csv.gz', usecols=['ad_id', 'pd_id'])

In [4]:
# Attach pd_id to every ad. The left join keeps all rows of `d`.
# `validate='many_to_one'` makes pandas raise a MergeError if `pdid` holds more
# than one row for an ad_id; without it such rows would silently duplicate ads.
# NOTE: this cell rebinds `d`, so re-running it without reloading `d` first
# would merge pd_id a second time (yielding pd_id_x / pd_id_y columns).
d = d.merge(pdid, on='ad_id', how='left', validate='many_to_one')

### Prepare text for inference

In [5]:
# Text fields that are concatenated, in this order, into one string per ad.
cols = [
    'disclaimer',
    'page_name',
    'ad_creative_body',
    'ad_creative_link_caption',
    'ad_creative_link_description',
    'ad_creative_link_title',
    'aws_ocr_text',
    'google_asr_text',
]


def _join_row_as_text(row):
    """Cast every field of one row to str and join the pieces with single spaces."""
    return ' '.join(row.values.astype(str))


# NOTE(review): casting with astype(str) renders missing values as the literal
# text 'nan', so empty fields contribute a 'nan' token to `combined`. This is
# kept as-is because the saved model may have been fitted on text built the
# same way -- confirm before changing.
d['combined'] = d[cols].apply(_join_row_as_text, axis=1).str.strip()

In [6]:
# (rows, columns) of the ad-level frame after the merge and the added `combined` column.
d.shape

(1404005, 12)

In [7]:
# Number of distinct pd_id values; dropna=False counts a missing pd_id
# (possible after the left join) as one value.
d['pd_id'].nunique(dropna=False)

15164

In [8]:
# Deduplicate before concatenating ad texts: keep only the last occurrence of
# each (pd_id, combined) pair so identical text is not repeated per pd_id.
is_repeat = d.duplicated(subset=['pd_id', 'combined'], keep='last')
dd = d[~is_repeat]

In [9]:
# (rows, columns) left after dropping repeated (pd_id, combined) pairs.
dd.shape

(277515, 12)

In [10]:
# Build one document per pd_id by joining all of its deduplicated ad texts
# with single spaces; reset_index turns pd_id back into a regular column.
texts_by_pdid = dd.groupby(['pd_id'])['combined']
d_pdid_txt = texts_by_pdid.apply(' '.join).reset_index()

In [11]:
# Preview the first rows of the per-pd_id concatenated text.
d_pdid_txt.head()

Unnamed: 0,pd_id,combined
0,pd-1000079383451499-1,George A Jackson Defender Association of Phila...
1,pd-1000253160054572-4,CORI BUSH FOR CONGRESS Cori Bush I didn’t run ...
2,pd-1000359586797421-2,Devine For Judge Devine For Judge Judge Josh D...
3,pd-100053898949-2,MONTANA REPUBLICAN STATE CENTRAL COMMITTEE Mon...
4,pd-1000723093289140-1,These ads ran without a disclaimer Ombudsman S...


### Make inference


In [12]:
# Load the previously saved classifier from disk.
# NOTE(review): the variable is named `mnb_clf` while the file name ends in
# `_rf` -- confirm which model type this artifact actually is.
# joblib.load unpickles the file, so only load model files from a trusted source.
mnb_clf = load('models/party_clf_pdid_rf.joblib')

In [13]:
# Predict one label per pd_id from its concatenated text.
# NOTE(review): presumably the loaded object accepts raw strings (e.g. a
# pipeline that includes its own vectorizer) -- confirm against the training code.
pred = mnb_clf.predict(d_pdid_txt['combined'])

In [14]:
# Store the predictions next to their pd_id; `pred` is in the same row order
# as `d_pdid_txt` because it was produced from that frame's `combined` column.
d_pdid_txt['party_all_clf_pdid'] = pred

In [15]:
# Keep only the identifier and the predicted label; the long text column is
# not needed in the output.
output_columns = ['pd_id', 'party_all_clf_pdid']
d_pdid_txt = d_pdid_txt.loc[:, output_columns]

In [16]:
# Preview the first rows of the pd_id -> predicted label table.
d_pdid_txt.head()

Unnamed: 0,pd_id,party_all_clf_pdid
0,pd-1000079383451499-1,DEM
1,pd-1000253160054572-4,DEM
2,pd-1000359586797421-2,REP
3,pd-100053898949-2,REP
4,pd-1000723093289140-1,DEM


In [17]:
# Distribution of predicted labels across pd_ids.
d_pdid_txt['party_all_clf_pdid'].value_counts()

DEM      9560
REP      5455
OTHER     149
Name: party_all_clf_pdid, dtype: int64

In [18]:
# Write the pd_id -> predicted label table to the working directory,
# without the DataFrame index.
d_pdid_txt.to_csv("party_all_clf_pdid_fb_2020_140m.csv", index=False)