In [1]:
import pandas as pd
import numpy as np

### Parameters

In [2]:
# Inputs
# fb_2020_140m_adid_var1.csv.gz is an output from repo fb_2020.
# This notebook reads its ad_id, pd_id and page_id columns.
path_fb_140m_vars = "../fb_2020/fb_2020_140m_adid_var1.csv.gz"
# WMP entities file; this notebook reads its pd_id and party_all columns.
path_wmp_ent = "../datasets/wmp_entity_files/Facebook/2020/wmp_fb_entities_v090622.csv"

In [3]:
# Outputs
# Gzipped CSV written by the last cell: the rows of df that received a
# train/test split assignment.
path_training_data = "data/facebook/140m_with_page_id_based_training_data.csv.gz"

### Ad id, pd_id, page_id

In [4]:
# Load only the ad_id, pd_id and page_id columns of the fb_2020 input file.
df = pd.read_csv(
    path_fb_140m_vars,
    usecols=["ad_id", "pd_id", "page_id"],
)

Merge with the wmp entities file

In [5]:
# Read the pd_id -> party_all mapping from the WMP entities file, then drop
# the rows whose party_all is the literal 'MISSING'. The shape is printed
# before and after the filter to show how many rows it removes.
wmp = pd.read_csv(path_wmp_ent, usecols=["pd_id", "party_all"])
print(wmp.shape)
wmp = wmp[wmp["party_all"].ne("MISSING")]
print(wmp.shape)

(22137, 2)
(8124, 2)


In [6]:
# Left join on pd_id: every row of df is kept, and rows whose pd_id is not
# present in wmp end up with a missing party_all.
df = pd.merge(df, wmp, on="pd_id", how="left")

In [7]:
# Rows left without a party label by the merge are tagged 'NOTCODED'.
df["party_all"] = df["party_all"].fillna("NOTCODED")

In [8]:
# Number of rows per party label (including the NOTCODED placeholder).
df["party_all"].value_counts()

NOTCODED    525256
DEM         443189
REP         429136
OTHER         6424
Name: party_all, dtype: int64

In [9]:
# Peek at the first five rows after attaching party_all.
df.head(5)

Unnamed: 0,ad_id,pd_id,page_id,party_all
0,x999618737203554,pd-124974587580398-7,n124974587580398,DEM
1,x387950208912185,pd-16477459734-5,n16477459734,DEM
2,x261774408593130,pd-504161563089343-5,n504161563089343,NOTCODED
3,x636161653936319,pd-112378316832885-1,n112378316832885,NOTCODED
4,x405984460395245,pd-7833534974-3,n7833534974,DEM


- Count how many distinct `party_all` values each page_id has.
- Two pd_ids under the same page_id can have different parties, so page_ids with more than one `party_all` value are excluded. NOTCODED counts as a value here, so a page that mixes NOTCODED with a party is also excluded.

In [10]:
# Build a page-by-party count table: one row per page_id, one column per
# party_all value, each cell holding the number of rows of df with that
# combination (0 where the combination never occurs).
test = (
    df.groupby("page_id")["party_all"]
    .value_counts()
    .unstack(fill_value=0)
)

In [11]:
# First five rows of the page-by-party count table.
test.head(5)

party_all,DEM,NOTCODED,OTHER,REP
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
n1000079383451499,0,1,0,0
n1000253160054572,1721,0,0,0
n1000359586797421,0,29,0,0
n100053898949,0,0,0,74
n1000723093289140,0,2,0,0


In [12]:
# (number of distinct page_ids, number of distinct party_all values)
test.shape

(13561, 4)

In [13]:
# A page is usable only when exactly one count column in its row is non-zero,
# i.e. all of its rows carry one single party_all value. NOTCODED is one of
# the count columns, so a page mixing NOTCODED with a party is not usable.
#
# The flag column itself is excluded from the count. Without that, running
# this cell a second time would count an existing True flag as an extra
# non-zero entry and silently turn every page's flag to False.
party_counts = test.drop(columns=["usable_party_all"], errors="ignore")
test["usable_party_all"] = np.count_nonzero(party_counts, axis=1) == 1

In [14]:
# How many pages have a single party_all value (True) versus several (False).
test["usable_party_all"].value_counts()

True     13392
False      169
Name: usable_party_all, dtype: int64

In [15]:
# Sample of pages flagged as not usable (more than one party_all value).
test[~test["usable_party_all"]].head(5)

party_all,DEM,NOTCODED,OTHER,REP,usable_party_all
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
n100201634866577,0,10,0,9,False
n100282731561199,0,1,0,139,False
n100362008472396,126,10,0,0,False
n100373524955306,0,50,0,3,False
n100394095067572,0,20,0,8,False


In [16]:
# Move page_id out of the index into a regular column so it can be used as a
# merge key.
test = test.reset_index()

In [17]:
# Keep only the merge key and the usability flag; the per-party counts are
# not needed any more.
test = test.loc[:, ["page_id", "usable_party_all"]]

In [18]:
# Row count should be unchanged; only page_id and usable_party_all remain.
test.shape

(13561, 2)

In [19]:
# Attach the page-level usability flag to every row of df via page_id.
df = pd.merge(df, test, how="left", on="page_id")

In [20]:
# Start the cleaned label column as a copy of party_all; the following cells
# blank out the entries that must not be used.
df = df.assign(party_all_usable=df["party_all"])

In [21]:
# Blank the label for rows whose page was flagged as not usable.
df["party_all_usable"] = df["party_all_usable"].mask(
    df["usable_party_all"].eq(False)
)

In [22]:
# 'NOTCODED' is a placeholder, not a party: treat it as a missing label too.
df["party_all_usable"] = df["party_all_usable"].mask(
    df["party_all_usable"].eq("NOTCODED")
)

### Create train and test split

In [23]:
# Label distribution after cleaning; dropna=False also shows how many rows
# ended up without a usable label.
df['party_all_usable'].value_counts(dropna=False)

NaN      553478
REP      426402
DEM      417930
OTHER      6195
Name: party_all_usable, dtype: int64

In [24]:
# Distinct page_ids that have a usable label, in order of first appearance
# in df.
page_id_with_usable_party_all = (
    df.dropna(subset=["party_all_usable"])["page_id"].unique()
)

In [25]:
# Fix NumPy's global random state so the random draw in the next cell is
# repeatable on a fresh run.
np.random.seed(seed=123)

In [26]:
# Assign each labelled page (not each row) to 'train' or 'test' with
# probability 0.7 / 0.3; the assignment is joined back on page_id afterwards,
# so all rows of one page share the same side of the split.
#
# A dedicated RandomState seeded with 123 yields the same draw as seeding the
# global generator with 123 immediately beforehand. Unlike the global state,
# it gives the same assignment every time this cell is executed, no matter how
# much global random state was consumed earlier.
split_rng = np.random.RandomState(123)
split = split_rng.choice(
    ['train', 'test'],
    size=len(page_id_with_usable_party_all),
    replace=True,
    p=[0.7, 0.3],
)

In [27]:
# Lookup table pairing every labelled page_id with its drawn split.
split_df = pd.DataFrame(
    dict(page_id=page_id_with_usable_party_all, split=split)
)

In [28]:
# Propagate the page-level split to every row; rows of pages that are not in
# split_df get a missing split.
df = pd.merge(df, split_df, how="left", on="page_id")

In [29]:
# Row counts per split, including rows that received no split.
df["split"].value_counts(dropna=False)

train    695594
NaN      553478
test     154933
Name: split, dtype: int64

In [30]:
# Keep only the rows that were assigned to train or test.
df = df.dropna(subset=["split"])

In [31]:
# Final look at the table that is about to be written out.
df.head(5)

Unnamed: 0,ad_id,pd_id,page_id,party_all,usable_party_all,party_all_usable,split
0,x999618737203554,pd-124974587580398-7,n124974587580398,DEM,True,DEM,train
1,x387950208912185,pd-16477459734-5,n16477459734,DEM,True,DEM,train
4,x405984460395245,pd-7833534974-3,n7833534974,DEM,True,DEM,train
6,x265231068212432,pd-243510442756370-2,n243510442756370,DEM,True,DEM,train
7,x1949505221867086,pd-24413227922-7,n24413227922,DEM,True,DEM,test


In [32]:
# Write the split-assigned rows as a gzipped CSV without the index column.
# The gzip options pin compresslevel to 1 and mtime to 1.
df.to_csv(
    path_training_data,
    index=False,
    compression=dict(method="gzip", compresslevel=1, mtime=1),
)