In [90]:
import numpy as np
import pandas as pd
import sklearn.model_selection as ms
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tqdm import tqdm

## Train

In [13]:
# Load the manually coded training data: `df` is read from the 2022 file,
# `df20` from the 2020 file. Both are read as UTF-8.
df = pd.read_csv('data/fbel_prepared_2022.csv', encoding='UTF-8')
df20 = pd.read_csv('data/fbel_prepared.csv', encoding='UTF-8')

In [43]:
# Stack the two coded datasets into one training frame.
# ignore_index=True renumbers the rows 0..n-1; without it the result keeps both
# inputs' original row labels, so the same label would refer to two different rows.
# NOTE: this cell rebinds `df`, so re-running it appends `df20` a second time.
df = pd.concat([df, df20], ignore_index=True)

In [55]:
# To prevent data leakage, keep a single row per distinct ad text (the first
# occurrence) so an identical text cannot end up in both the train and test split.
df = df.drop_duplicates(subset=['text'], keep='first')

In [59]:
# from joblib import dump, load

# Fitted pipeline per goal; the inference cells look classifiers up in this dict.
models = {}

results_dir = 'performance'
model_name = 'rf'

# Other rare categories: "EVENT", "POLL", "GATHERINFO", "LEARNMORE", "CONTACT", "PURCHASE"
goals = ["PRIMARY_PERSUADE", "DONATE", "GOTV"]


def make_goal_classifier():
  """Build a new, unfitted text-classification pipeline for a single goal.

  Steps: token counts -> TF-IDF weighting -> a 500-tree random forest wrapped
  in 5-fold sigmoid probability calibration.

  Returns:
    An unfitted sklearn Pipeline. Every call returns a distinct object, so
    fitting one goal's classifier cannot overwrite another goal's.
  """
  return Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('cal', CalibratedClassifierCV(
      RandomForestClassifier(n_estimators=500, random_state=123),
      cv=5, method="sigmoid")),
  ])


for g in goals:

  print(g)

  # Only rows that carry a label for this goal take part in training/evaluation
  curr_df = df[['text', g]].dropna(subset=[g])

  # Fixed 80/20 split; random_state makes the split reproducible
  X_train, X_test, y_train, y_test = ms.train_test_split(
    curr_df['text'], curr_df[g], test_size=0.2, random_state=123)

  # Build a fresh pipeline for every goal. Refitting one shared pipeline and
  # storing that same object under each key would leave every entry of `models`
  # pointing at the classifier fitted last, so inference would score all goals
  # with the final goal's model.
  clf_rf = make_goal_classifier()
  clf_rf.fit(X_train, y_train)
  y_preds = clf_rf.predict(X_test)

  # Held-out precision / recall / F1 for this goal
  print(metrics.classification_report(y_test, y_preds))

#   df_perf = pd.DataFrame(metrics.precision_recall_fscore_support(test[g], predicted))
#   df_perf.index = ['Precision', 'Recall', 'F-Score', 'Support']
#   df_perf.to_csv(results_dir + "/" + model_name + "/" + g + '.csv')

  # Save model to disk
  # dump(clf_rf, f'models/goal_rf_{g}_2020_2022.joblib')

  # Save model on the go (each goal now holds its own fitted pipeline)
  models[g] = clf_rf

PRIMARY_PERSUADE
              precision    recall  f1-score   support

           0       0.84      0.75      0.79       460
           1       0.87      0.93      0.90       872

    accuracy                           0.86      1332
   macro avg       0.86      0.84      0.85      1332
weighted avg       0.86      0.86      0.86      1332

DONATE
              precision    recall  f1-score   support

         0.0       0.98      0.99      0.99      1029
         1.0       0.97      0.93      0.95       245

    accuracy                           0.98      1274
   macro avg       0.98      0.96      0.97      1274
weighted avg       0.98      0.98      0.98      1274

GOTV
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      1033
         1.0       0.78      0.63      0.70       241

    accuracy                           0.90      1274
   macro avg       0.85      0.80      0.82      1274
weighted avg       0.89      0.90      0.89  

## Inference: Meta 2022

In [83]:
# Input data
path_fb22_acb = 'data/fb2022_prepared.csv.gz'
# dir_models = 'models/goal_rf_'
# Output data
path_predictions_gz = 'data/ad_goal_rf_fb2022.csv.gz'

# Load the ads and keep only rows whose text is neither missing nor empty
inference = pd.read_csv(path_fb22_acb)
has_text = inference['text'].notna() & inference['text'].ne("")
inference = inference[has_text]

goals = ["PRIMARY_PERSUADE", "DONATE", "GOTV"]

for g in goals:

  # Classifier for this goal, taken from the in-memory `models` dict
  clf = models[g]
  # Load saved model
  # clf = load(dir_models + g + '.joblib')

  # One row of class probabilities per ad
  predicted_prob = clf.predict_proba(inference['text'])
  # argmax yields the index of the most probable column; this equals the class
  # label only because the training reports show labels 0 and 1
  predicted = predicted_prob.argmax(axis=1)

  inference[f'goal_{g}_prediction'] = predicted
  # Column 1 is stored as the goal's probability (presumably the positive class)
  inference[f'goal_{g}_predicted_prob'] = predicted_prob[:, 1]

# Name of the goal whose probability column is largest for each ad, with the
# 'goal_' prefix and '_predicted_prob' suffix stripped from the column name
prob_cols = [col for col in inference.columns if "predicted_prob" in col]
inference['goal_highest_prob'] = (
  inference[prob_cols]
  .idxmax(1)
  .str.replace('_predicted_prob', '')
  .str.replace('goal_', '')
)

# Save only columns of prediction results: drop the text and creative fields
cols_to_drop = [
  'text', 'ad_creative_bodies', 'ad_snapshot_url',
  'ad_creative_link_captions', 'ad_creative_link_titles',
  'ad_creative_link_descriptions', 'checksum',
]
inference = inference.drop(columns=cols_to_drop)

# Fixed gzip mtime keeps the compressed bytes independent of when the cell runs
inference.to_csv(
  path_predictions_gz,
  index=False,
  compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1},
)

## Inference: Google 2022

In [91]:
# Input data
path_google_2022 = 'data/google_2022_prepared.csv.gz'
# dir_models = 'models/goal_rf_'

# Output data
path_predictions_gz = 'data/ad_goal_rf_google_2022.csv.gz'

# Load the ads and keep only rows whose text is neither missing nor empty
inference = pd.read_csv(path_google_2022)
has_text = inference['text'].notna() & inference['text'].ne("")
inference = inference[has_text]

goals = ["PRIMARY_PERSUADE", "DONATE", "GOTV"]

# tqdm shows progress across the goals being scored
for g in tqdm(goals):

  # Classifier for this goal, taken from the in-memory `models` dict
  clf = models[g]

  # Load saved model
  # clf = load(dir_models + g + '.joblib')

  # One row of class probabilities per ad
  predicted_prob = clf.predict_proba(inference['text'])
  # argmax yields the index of the most probable column; this equals the class
  # label only because the training reports show labels 0 and 1
  predicted = predicted_prob.argmax(axis=1)

  inference[f'goal_{g}_prediction'] = predicted
  # Column 1 is stored as the goal's probability (presumably the positive class)
  inference[f'goal_{g}_predicted_prob'] = predicted_prob[:, 1]


# Name of the goal whose probability column is largest for each ad, with the
# 'goal_' prefix and '_predicted_prob' suffix stripped from the column name
prob_cols = [col for col in inference.columns if "predicted_prob" in col]
inference['goal_highest_prob'] = (
  inference[prob_cols]
  .idxmax(1)
  .str.replace('_predicted_prob', '')
  .str.replace('goal_', '')
)

# Save only columns of goal predictions: drop the ad metadata fields listed here
cols_to_drop = [
  'wmp_creative_id', 'ad_type', 'csum_agg',
  'advertiser_id', 'aws_face_vid', 'aws_face_img', 'impressions',
  'age_targeting', 'gender_targeting', 'geo_targeting_included',
  'geo_targeting_excluded', 'spend_range_min_usd', 'spend_range_max_usd',
]

inference = inference.drop(columns=cols_to_drop)

# Fixed gzip mtime keeps the compressed bytes independent of when the cell runs
inference.to_csv(
  path_predictions_gz,
  index=False,
  compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1},
)

100%|████████████████████████████████████████████| 3/3 [08:25<00:00, 168.62s/it]
