In [4]:
# Inference dataset (change)
# Path of the CSV to run inference on, the name of the column that holds the
# text to classify, and the path the predictions CSV is written to.
inference_dataset = 'data/google_22_for_inf.csv'
text_field = 'text'
output_file = 'data/google_22_output_multi.csv'

# Variable label file (don't change)
# Path of the text file of label names, one name per line.
label_file = '../data/issue_labels_65.txt'

# Model files (usually don't change)
# Weights and config of the trained model; a later cell copies both into a
# local 'models' directory before the model is loaded from there.
model_pytorch_model = '../models/multilabel_trf_v1/pytorch_model.bin'
model_config = '../models/multilabel_trf_v1/config.json'

In [3]:
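# Uncomment on first run if `transformers` is not installed
# (%pip installs into the environment of the running kernel).
# %pip install transformers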

In [2]:
from tqdm import tqdm
import os
import shutil

import numpy as np
import pandas as pd
import torch
import transformers

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

In [5]:
#----
# Stage the model artifacts in a local 'models' directory
# (the directory is created on demand; an existing one is reused).
os.makedirs('models', exist_ok=True)
for src_path, dst_path in (
    (model_pytorch_model, 'models/pytorch_model.bin'),
    (model_config, 'models/config.json'),
):
    shutil.copyfile(src_path, dst_path)

# Stage the inference data next to the notebook (here the test set)
shutil.copyfile(inference_dataset, './inference_dataset.csv')


'./inference_dataset.csv'

In [6]:
# Choose the device to run on: CUDA if available, otherwise the MPS backend
# if available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [7]:
#----
# Restore the trained classifier from the local 'models' directory,
# then move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained('models')
model = model.to(device)

# The tokenizer is loaded by name from the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [8]:
#----
# Load the inference dataset
df = pd.read_csv('./inference_dataset.csv')
df = df.dropna(subset = [text_field]) # remove NAs
df = df[df[text_field] != '_error'] # remove errors
df = df.reset_index(drop = True)

# Load the variable labels
with open('../data/issue_labels_65.txt', 'r') as reader:
  labels = reader.read().split('\n')
# They were created like this:
# df = pd.read_csv('data/issues_tv_fb_18_20.csv')
# with open('data/issue_labels_65.txt', 'w') as writer:
#   for i in df.columns[2:].tolist():
#     writer.write(i + '\n')

In [9]:
#----
# Inference

# Batch the text Series (batch size 16)
texts = df[text_field].to_list()
batch_size = 16
list_df = [texts[i:i+batch_size] for i in range(0,len(texts),batch_size)]

# Use the tokenizer to encode the Series in batches
batched_examples = []
for text_chunk in list_df:
    batched_examples.append(tokenizer.batch_encode_plus(text_chunk, truncation=True, padding=True, return_tensors="pt"))


In [10]:
# Batch inference
# Gradients are not needed for inference, so the whole loop runs under
# torch.no_grad(); the original author observed this to be roughly 10x faster
# on CPU and about 2x faster on GPU.
outputs_list = []

with torch.no_grad():
    for batch in tqdm(batched_examples):
        # Move every tensor of the encoded batch onto the model's device
        batch_on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs_list.append(model(**batch_on_device))

100%|████████████████████████████████████████████████████████████████████████████| 5893/5893 [1:20:18<00:00,  1.22it/s]


In [11]:
# Turn each batch of logits into 0/1 predictions: apply a sigmoid, mark values
# above 0.5 as 1 and everything else as 0, then stack all batches row-wise.
preds_l = [
    (output.logits.sigmoid().cpu().numpy() > 0.5).astype(int)
    for output in outputs_list
]

outputs = np.vstack(preds_l)

In [12]:
# Convert to pd DataFrame and save
df_preds = pd.DataFrame(outputs)
df_preds.columns = labels[:-1]
df_results = pd.concat([df, df_preds], axis = 1)
df_results2 = df_results.dropna()
df_results3 = df_results2.drop_duplicates()

In [13]:
# Display the final results table as a visual sanity check
df_results3

Unnamed: 0,text,id,ISSUE10,ISSUE11,ISSUE12,ISSUE13,ISSUE14,ISSUE15,ISSUE16,ISSUE17,...,ISSUE105,ISSUE106,ISSUE111,ISSUE118,ISSUE200,ISSUE208,ISSUE210,ISSUE212,ISSUE218,ISSUE221
0,"Demand, I will say for computer coding...",CR18032445206145531905__google_asr_text,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Good afternoon. Good afternoon. We're ...,CR02245745215467945985__google_asr_text,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,FOR PUBLIC EDUCATION PAID FOR BY JIM PORT...,CR16409446952694972417__ad_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Welcome back, Madam Secretary mr. Deputy ...",CR03842772866831482881__google_asr_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Clark Tina Peters did her job and that is ...,CR12955430473680551937__google_asr_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94273,对 2022 年环境债券法进行投票\n债券法案将于11月8日进行投票。\n您可以选择缺席投票...,CR14512177241682608129__ad_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94274,選票提案 1,CR05144676822612443137__ad_title,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94275,"입법부가 명시한 대로 NY 천연 자원을 보존, 강화 및 복원하기 위해 통과되었습니다.",CR05087187757642547201__ad_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94276,제안 1에 투표,CR05087187757642547201__ad_title,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Write the final results to `output_file` as a UTF-8 CSV, without the index column
df_results3.to_csv(
    output_file,
    encoding='utf-8',
    index=False,
)

In [45]:
#reupload
#df_inf = pd.read_csv('google_22_output.csv', encoding='utf-8')

In [47]:
#df_inf.shape

(177328, 67)