### party_clf_entity training 

In [1]:
import sklearn.model_selection as ms
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Input
path_train_test = "data/facebook/140m_with_page_id_based_training_data.csv.gz"
# fb_2020_140m_adid_text_clean.csv.gz is an output from repo fb_2020
path_train_test_text = "../fb_2020/fb_2020_140m_adid_text_clean.csv.gz"

In [3]:
# Load train/test metadata
d = pd.read_csv(path_train_test, encoding='UTF-8', keep_default_na = False)
# Load train/test text
d_text = pd.read_csv(path_train_test_text, encoding='UTF-8', keep_default_na = False)
# Merge
d = d.merge(d_text, on = "ad_id")

In [4]:
# All fields
cols = ['disclaimer', 'page_name', 'ad_creative_body', 'ad_creative_link_caption', 'ad_creative_link_description', 'ad_creative_link_title', 'aws_ocr_text', 'google_asr_text']
d['combined'] = d[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Remove duplicate texts
d = d.drop_duplicates(subset=['combined'])

# Split by pd-id (previously assigned)
train = d[d['split'] == 'train']
test = d[d['split'] == 'test']

### Create entity level dataframes

In [5]:
train_pdid = train.drop_duplicates(subset=['pd_id'], keep='last')
train_pdid = train_pdid[['pd_id', 'party_all_usable']]
print(train_pdid.shape)

(2235, 2)


In [6]:
test_pdid = test.drop_duplicates(subset=['pd_id'], keep='last')
test_pdid = test_pdid[['pd_id', 'party_all_usable']]
print(test_pdid.shape)

(959, 2)


### Aggregate text to pd id level for training and test

In [7]:
d_pdid_txt = d.groupby(['pd_id'])['combined'].apply(lambda x: ' '.join(x)).reset_index()

In [8]:
d_pdid_txt['text_length'] = d_pdid_txt['combined'].apply(len)

In [9]:
d_pdid_txt.head()

Unnamed: 0,pd_id,combined,text_length
0,pd-1000253160054572-4,CORI BUSH FOR CONGRESS Cori Bush This is not a...,53423
1,pd-100053898949-2,MONTANA REPUBLICAN STATE CENTRAL COMMITTEE Mon...,8118
2,pd-100125675072407-1,"Drew-Montez Clark for State Rep District 80, R...",426
3,pd-100131284943324-1,LETLOW FOR CONGRESS Luke Letlow For Congress M...,7724
4,pd-100140741454415-2,BISH FOR CONGRESS 2020 Chris Bish for Congress...,4172


In [10]:
train_pdid = train_pdid.merge(d_pdid_txt, how ='left', on='pd_id')

In [11]:
test_pdid = test_pdid.merge(d_pdid_txt, how ='left', on='pd_id')

### Models

In [12]:
np.random.seed(123)

#### MultinomialNB

In [13]:
mnb_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [14]:
mnb_params = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [15]:
mnb_grid = GridSearchCV(mnb_clf, mnb_params, cv=5, n_jobs=-1)

In [16]:
mnb_grid.fit(train_pdid['combined'], train_pdid['party_all_usable'])

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'clf__alpha': [1, 0.1, 0.01],
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__use_idf': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2), (2, 2)]})

In [17]:
print("Best Params: ", mnb_grid.best_params_)

Best Params:  {'clf__alpha': 0.01, 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (2, 2)}


In [18]:
print(metrics.classification_report(test_pdid['party_all_usable'], mnb_grid.predict(test_pdid['combined']), digits = 3))

              precision    recall  f1-score   support

         DEM      0.820     0.965     0.887       491
       OTHER      1.000     0.068     0.128        44
         REP      0.918     0.818     0.865       424

    accuracy                          0.859       959
   macro avg      0.913     0.617     0.627       959
weighted avg      0.872     0.859     0.842       959



In [19]:
dump(mnb_grid, 'models/party_clf_pdid_mnb.joblib')

['models/party_clf_pdid_mnb.joblib']

#### Logistic regression

In [20]:
lr_clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(max_iter=500)),
               ])

In [21]:
lr_params = [{'clf__penalty': ['l2'],
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'clf__solver': ['newton-cg']}] 

In [22]:
lr_grid = GridSearchCV(lr_clf, lr_params, cv=5, n_jobs=-1) 

In [23]:
lr_grid.fit(train_pdid['combined'], train_pdid['party_all_usable'])

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        LogisticRegression(max_iter=500))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'clf__penalty': ['l2'],
                          'clf__solver': ['newton-cg']}])

In [24]:
print("Best Params: ", lr_grid.best_params_)

Best Params:  {'clf__C': 100, 'clf__penalty': 'l2', 'clf__solver': 'newton-cg'}


In [25]:
print(metrics.classification_report(test_pdid['party_all_usable'], lr_grid.predict(test_pdid['combined']), digits = 3))

              precision    recall  f1-score   support

         DEM      0.845     0.900     0.872       491
       OTHER      0.800     0.364     0.500        44
         REP      0.856     0.840     0.848       424

    accuracy                          0.849       959
   macro avg      0.834     0.701     0.740       959
weighted avg      0.848     0.849     0.844       959



In [26]:
dump(lr_grid, 'models/party_clf_pdid_logit.joblib')

['models/party_clf_pdid_logit.joblib']

#### SVM

In [27]:
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())])

In [28]:
svm_params = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [29]:
svm_grid = GridSearchCV(svm_clf, svm_params, cv=5, n_jobs=-1)

In [30]:
svm_grid.fit(train_pdid['combined'], train_pdid['party_all_usable'])

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', SGDClassifier())]),
             n_jobs=-1,
             param_grid={'clf__alpha': (0.01, 0.001),
                         'tfidf__use_idf': (True, False),
                         'vect__ngram_range': [(1, 1), (1, 2)]})

In [31]:
print("Best Params: ", svm_grid.best_params_)

Best Params:  {'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [32]:
print(metrics.classification_report(test_pdid['party_all_usable'], svm_grid.predict(test_pdid['combined']), digits = 3))

              precision    recall  f1-score   support

         DEM      0.834     0.943     0.885       491
       OTHER      1.000     0.045     0.087        44
         REP      0.881     0.835     0.857       424

    accuracy                          0.854       959
   macro avg      0.905     0.608     0.610       959
weighted avg      0.862     0.854     0.836       959



In [33]:
dump(svm_grid, 'models/party_clf_pdid_svm.joblib')

['models/party_clf_pdid_svm.joblib']

#### Random Forest (Best classifier)

In [34]:
rf_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ("clf", RandomForestClassifier())
])

In [35]:
rf_params = {
    'clf__n_estimators': [5,50,100,500],
    'clf__max_depth': [1, 5, 10, 25],
    'clf__max_features': [*np.arange(0.1, 1.1, 0.1)],
}

In [36]:
rf_grid = GridSearchCV(rf_clf, rf_params, cv=5, n_jobs=-1)

In [37]:
rf_grid.fit(train_pdid['combined'], train_pdid['party_all_usable'])

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', CountVectorizer()),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'clf__max_depth': [1, 5, 10, 25],
                         'clf__max_features': [0.1, 0.2, 0.30000000000000004,
                                               0.4, 0.5, 0.6,
                                               0.7000000000000001, 0.8, 0.9,
                                               1.0],
                         'clf__n_estimators': [5, 50, 100, 500]})

In [38]:
print("Best Params: ", rf_grid.best_params_)

Best Params:  {'clf__max_depth': 25, 'clf__max_features': 0.1, 'clf__n_estimators': 500}


In [39]:
print(metrics.classification_report(test_pdid['party_all_usable'], rf_grid.predict(test_pdid['combined']), digits = 3))

              precision    recall  f1-score   support

         DEM      0.843     0.941     0.889       491
       OTHER      1.000     0.091     0.167        44
         REP      0.887     0.851     0.869       424

    accuracy                          0.862       959
   macro avg      0.910     0.628     0.642       959
weighted avg      0.870     0.862     0.847       959



In [40]:
dump(rf_grid, 'models/party_clf_pdid_rf.joblib')

['models/party_clf_pdid_rf.joblib']