In [None]:
# --- Per-run settings (edit these) ---
# CSV to run inference on, the column holding the text, and where to write results
inference_dataset = "../data/fb_22_for_inf.csv"
text_field = "text"
output_file = "../data/fb_22_output_multi.csv.gz"

# --- Label names, one per output column (leave as is) ---
label_file = "../data/issue_labels_25.txt"

# --- Trained model weights and config (rarely need changing) ---
model_pytorch_model = "../models/multilabel_trf_v1/model.safetensors"
model_config = "../models/multilabel_trf_v1/config.json"

In [None]:
# Optional dependency setup: every line below is a commented-out pip install.
# Uncomment the ones you need to install the listed versions.
# All packages are version-pinned except ipywidgets.
#!pip install pandas==2.2.2
#!pip install scikit-learn==1.0.2
#!pip install numpy==1.26.4
#!pip install joblib==1.4.2
#!pip install torch==2.3.1
#!pip install tqdm==4.66.4
#!pip install transformers==4.41.2
#!pip install datasets==2.20.0
#!pip install ipywidgets

In [2]:
from tqdm import tqdm
import os
import shutil

import numpy as np
import pandas as pd
import torch
import transformers

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

In [3]:
#----
# Stage the model artefacts in a local 'models' directory (created on demand)
os.makedirs('models', exist_ok=True)
for src_path, dst_path in (
    (model_pytorch_model, 'models/model.safetensors'),
    (model_config, 'models/config.json'),
):
    shutil.copyfile(src_path, dst_path)

# Stage the inference data (here the test set) in the working directory.
# Kept as the last expression so the cell still displays the destination path.
shutil.copyfile(inference_dataset, './inference_dataset.csv')


'./inference_dataset.csv'

In [4]:
# Pick the compute device for inference: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [5]:
#----
# Restore the trained classifier from the local 'models' directory,
# then move it onto the selected device
model = AutoModelForSequenceClassification.from_pretrained('models')
model = model.to(device)

# The tokenizer is loaded from the "distilbert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
#----
# Load the inference dataset
df = pd.read_csv('./inference_dataset.csv')
df = df.dropna(subset = [text_field]) # remove NAs
df = df[df[text_field] != '_error'] # remove errors
df = df.reset_index(drop = True)


In [17]:
# Load the variable labels (one label per line) from the configured label file.
# Blank lines are skipped: a file that ends with a newline (as the creation
# snippet below produces) would otherwise yield a trailing empty-string label,
# making the label count differ from the number of prediction columns.
with open(label_file, 'r') as reader:
    labels = [line.strip() for line in reader.read().splitlines() if line.strip()]
# They were created like this:
# df = pd.read_csv('data/issues_tv_fb_18_20.csv')
# with open('data/issue_labels_65.txt', 'w') as writer:
#   for i in df.columns[2:].tolist():
#     writer.write(i + '\n')

In [8]:
# Preview the first five rows of the filtered inference frame
df.head(n=5)

Unnamed: 0,text,id
0,',x_4971277182972491__ad_creative_link_descripti...
1,' YOUR SOURCE FOR @GANDERNEWSROOM And I very m...,x_605605801303782__aws_ocr_text_vid
2,'21. JAN. 1.4%. BIDEN'S INFLATION. FEB. 1.7%. ...,x_1062742507750431__aws_ocr_text_img|x_1018172...
3,"'22. Just this year,. Alaska's budget invested...",x_652922416203645__aws_ocr_text_img|x_56077544...
4,'2A' Second Amendment 1791 Hat,x_628894858798509__ad_creative_link_title|x_50...


In [9]:
#----
# Inference

# Split the texts into consecutive chunks of at most `batch_size` items
texts = df[text_field].to_list()
batch_size = 16
list_df = [
    texts[start:start + batch_size]
    for start in range(0, len(texts), batch_size)
]

# Tokenize each chunk up front (truncated, padded within the chunk, PyTorch tensors)
batched_examples = [
    tokenizer.batch_encode_plus(
        text_chunk, truncation=True, padding=True, return_tensors="pt"
    )
    for text_chunk in tqdm(list_df, desc="Tokenizing batches", unit="batch")
]


Tokenizing batches: 100%|██████████| 21643/21643 [02:30<00:00, 143.92batch/s]


In [10]:
# Run the model over every tokenized batch.
# Gradients are not needed for inference, so the whole loop runs under
# torch.no_grad(), which is faster (seems about 10x faster on CPU, and 2x faster on GPU or so)
outputs_list = []

with torch.no_grad():
    for batch in tqdm(batched_examples):
        # Move every tensor of the batch onto the model's device
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs_list.append(model(**on_device))

100%|██████████| 21643/21643 [2:19:29<00:00,  2.59it/s]  


In [11]:
# Turn each batch's logits into 0/1 predictions:
# sigmoid -> threshold at 0.5 -> integer array
preds_l = [
    (output.logits.sigmoid().cpu().numpy() > 0.5).astype(int)
    for output in outputs_list
]

# Stack the per-batch arrays into one (rows x labels) array
outputs = np.vstack(preds_l)

In [19]:
# Wrap the predictions in a DataFrame whose columns are the label names
df_preds = pd.DataFrame(outputs, columns=list(labels))
# Put the prediction columns next to the input rows
df_results = pd.concat((df, df_preds), axis=1)
# Drop rows containing any missing value, then drop exact duplicate rows
df_results2 = df_results.dropna()
df_results3 = df_results2.drop_duplicates()

In [20]:
# Show the de-duplicated results frame as the cell output
df_results3

Unnamed: 0,text,id,ISSUE215,ISSUE10,ISSUE30,ISSUE40,ISSUE212,ISSUE12,ISSUE16,ISSUE209,...,ISSUE91,ISSUE56,ISSUE53,ISSUE90,ISSUE65,ISSUE45,ISSUE60,ISSUE58,ISSUE22,ISSUE32
0,',x_4971277182972491__ad_creative_link_descripti...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,' YOUR SOURCE FOR @GANDERNEWSROOM And I very m...,x_605605801303782__aws_ocr_text_vid,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,'21. JAN. 1.4%. BIDEN'S INFLATION. FEB. 1.7%. ...,x_1062742507750431__aws_ocr_text_img|x_1018172...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"'22. Just this year,. Alaska's budget invested...",x_652922416203645__aws_ocr_text_img|x_56077544...,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,'2A' Second Amendment 1791 Hat,x_628894858798509__ad_creative_link_title|x_50...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346281,"陳介飛接受了徵召, 在海軍中為我們的國家服務. 現在他想進入國會來為我們的國家服務，計劃降低...",x_459727066083741__ad_creative_body|x_39298650...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346282,"陳介飛是一位退伍軍人, 一位小企業主，也是個台灣移民家庭的兒子.\n\n他正在競選國會議員，...",x_1021976488471789__ad_creative_body|x_1397130...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346283,陳介飛曾自豪地為我們的國家服務，並不是為了看到由特殊利益集團利用華盛頓來謀取私利.\n\n他...,x_481402380513471__ad_creative_body|x_10289971...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346284,陳介飛知道華盛頓並沒有為橙縣的家庭謀福利。他在海軍中為我們的國家服務，並不是要看到政客們將特...,x_1142790589664752__ad_creative_body|x_1851109...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Write the results to the configured output path as a gzip-compressed
# UTF-8 CSV, without the index column
df_results3.to_csv(
    output_file,
    encoding='utf-8',
    compression='gzip',
    index=False,
)


In [None]:
#reupload
#df_inf = pd.read_csv('google_22_output.csv', encoding='utf-8')

In [None]:
#df_inf.shape