In [None]:
# --- Settings that change per run ---
# Directory holding the input CSV, the label list and the output CSV
data_dir = '../data'
# CSV to run inference on, and the column in it that holds the text
inference_dataset = f'{data_dir}/google_22_for_inf.csv'
text_field = 'text'
# Destination CSV for the predictions
output_file = f'{data_dir}/google_22_output_25.csv'

# --- Label names, one per line (don't change) ---
label_file = f'{data_dir}/issue_labels_25.txt'

# --- Trained model artifacts (usually don't change) ---
model_dir = '../models/multilabel_trf_v1'
model_pytorch_model = f'{model_dir}/model.safetensors'
model_config = f'{model_dir}/config.json'

In [2]:
from tqdm import tqdm
import os
import shutil

import numpy as np
import pandas as pd
import torch
import transformers

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

In [4]:
#----
# Stage the model files in a local 'models' directory (created if missing)
os.makedirs('models', exist_ok=True)
for source_path, staged_name in (
    (model_pytorch_model, 'model.safetensors'),
    (model_config, 'config.json'),
):
    shutil.copyfile(source_path, 'models/' + staged_name)

# Stage the inference data in the working directory. Kept as the cell's last
# expression so the destination path is still displayed as the cell output.
shutil.copyfile(inference_dataset, './inference_dataset.csv')


'./inference_dataset.csv'

In [5]:
# Pick the compute device: CUDA GPU first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [6]:
#----
# Load the trained classifier from the local 'models' directory,
# then move it onto the selected device
model = AutoModelForSequenceClassification.from_pretrained('models')
model = model.to(device)

# The tokenizer is loaded from the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
#----
# Load the inference dataset
df = pd.read_csv('./inference_dataset.csv')
df = df.dropna(subset = [text_field]) # remove NAs
df = df[df[text_field] != '_error'] # remove errors
df = df.reset_index(drop = True) # positional index, so predictions can be concatenated row-by-row later

# Load the variable labels from the configured `label_file`
# (the path used to be hard-coded here, so editing `label_file` had no effect).
with open(label_file, 'r') as reader:
    # One label per line. Blank lines are skipped: a file that ends with a
    # newline would otherwise yield an empty extra label and the number of
    # labels would no longer match the number of model outputs.
    labels = [line.rstrip('\n') for line in reader if line.strip()]
# They were created like this (note that every label, including the last one,
# is followed by a newline):
# df = pd.read_csv('data/issues_tv_fb_18_20.csv')
# with open('data/issue_labels_65.txt', 'w') as writer:
#   for i in df.columns[2:].tolist():
#     writer.write(i + '\n')

In [8]:
#----
# Inference

# Batch the text Series (batch size 16)
texts = df[text_field].to_list()
batch_size = 16
list_df = [texts[i:i+batch_size] for i in range(0,len(texts),batch_size)]

# Use the tokenizer to encode the Series in batches
batched_examples = []
for text_chunk in tqdm(list_df, desc="Tokenizing batches", unit="batch"):
    batched_examples.append(tokenizer.batch_encode_plus(
        text_chunk, truncation=True, padding=True, return_tensors="pt"
    ))


Tokenizing batches:   0%|          | 0/2937 [00:00<?, ?batch/s]

Tokenizing batches: 100%|██████████| 2937/2937 [00:23<00:00, 126.02batch/s]


In [9]:
# Batch inference
# Gradients are not needed for inference, so the whole loop runs under
# torch.no_grad() (seems about 10x faster on CPU, and 2x faster on GPU or so)
outputs_list = []

with torch.no_grad():
    for batch in tqdm(batched_examples):
        # Move every tensor of the encoded batch onto the model's device
        batch_on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs_list.append(model(**batch_on_device))

100%|██████████| 2937/2937 [24:37<00:00,  1.99it/s]


In [10]:
# Turn each batch's logits into 1s and 0s: sigmoid, then threshold at 0.5
preds_l = [
    (batch_output.logits.sigmoid().cpu().numpy() > 0.5).astype(int)
    for batch_output in outputs_list
]

# Stack the per-batch arrays vertically into a single array
outputs = np.vstack(preds_l)

In [11]:
# Build the prediction frame with one column per label,
# then attach it column-wise to the input rows
df_preds = pd.DataFrame(outputs, columns=list(labels))
df_results = pd.concat([df, df_preds], axis=1)

# Drop rows containing any missing value, then drop exact duplicate rows
df_results2 = df_results.dropna()
df_results3 = df_results2.drop_duplicates()

In [12]:
# Preview the first five rows of the final results
df_results3.head(5)

Unnamed: 0,text,id,ISSUE215,ISSUE10,ISSUE30,ISSUE40,ISSUE212,ISSUE12,ISSUE16,ISSUE209,...,ISSUE91,ISSUE56,ISSUE53,ISSUE90,ISSUE65,ISSUE45,ISSUE60,ISSUE58,ISSUE22,ISSUE32
0,'30 MASSEY- HARRIS PAID FOR BY DAVID TRONE FOR...,CR01131952237317193729__aws_ocr_video_text|CR0...,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"'HER WAY' """"КОТЕК CROSSED THE LINE"""" OPB.ORG N...",CR01885433262180925441__aws_ocr_video_text|CR0...,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,'No one wants to be a cop these days' Across ...,CR10112608181852045313__aws_ocr_video_text,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,'Peace of mind': For Arizona veterans exposed ...,CR09381537402985644033__aws_ocr_video_text|CR1...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,'S GO O T E Paid for by Future Majority BRIEFI...,CR11691957679105769473__aws_ocr_video_text,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Write the final results to `output_file` as UTF-8 CSV, without the index column
df_results3.to_csv(path_or_buf=output_file, index=False, encoding='utf-8')

In [13]:
# Optional: reload a saved output file to inspect it
#df_inf = pd.read_csv('google_22_output.csv', encoding='utf-8')

In [14]:
#df_inf.shape