In [5]:
import sklearn.model_selection as ms
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from joblib import dump, load
from tqdm import tqdm

In [2]:
# Directory the per-label classifiers are loaded from
# (one 'issues_rf_<label>.joblib' file per label).
dir_models = "../models/binary_rf_v1"

# CSV read as the inference set, and CSV the predictions are written to.
path_inference_set = "google_22_for_inf.csv"
path_inference_set_output = "google_22_output_binary.csv"

In [3]:
# ----
# Load the inference dataset.
# Rows whose text is missing, or equal to the '_error' sentinel, are dropped;
# the index is then renumbered 0..n-1 so that positions and index labels agree.
text_field = 'text'
df = (
    pd.read_csv(path_inference_set)
    .dropna(subset=[text_field])
    .loc[lambda frame: frame[text_field] != '_error']
    .reset_index(drop=True)
)

# Load the variable labels, one label per line.
# Blank lines are skipped instead of unconditionally dropping the last entry:
# the old `split('\n')[:-1]` silently lost the final label whenever the file
# did not end with a newline, and kept empty labels for interior blank lines.
# Surrounding whitespace is stripped because each label is later embedded in a
# model file name and used as a column name.
with open('data/issue_labels_65.txt', 'r', encoding='utf-8') as reader:
    labels = [label for label in (line.strip() for line in reader) if label]


In [14]:
# One entry per label, in the same order as `labels`. Each entry is a 1-D
# array holding that label's predictions for every row of `df`.
# (Previously every batch was `extend`-ed into a single flat list, so the
# per-label grouping was lost and zipping the result with `labels` paired each
# label with a single scalar.)
preds_l = []

# Batch size for predictions
batch_size = 512

# Extract the texts once. Slicing a plain array is purely positional, and the
# column lookup is no longer repeated for every batch of every label.
texts = df[text_field].to_numpy()

# A single progress bar spans both loops: one tick per (label, row) pair.
total_labels = len(labels)
total_rows = len(texts)

with tqdm(total=total_labels * total_rows, desc='Making Predictions') as pbar:
    for l in labels:
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # model files from a trusted location.
        clf_rf = load(dir_models + '/issues_rf_' + l + '.joblib')

        label_preds = []
        for i in range(0, total_rows, batch_size):
            batch_transcripts = texts[i:i + batch_size]
            label_preds.append(clf_rf.predict(batch_transcripts))
            pbar.update(len(batch_transcripts))

        # Stitch the batches back into one array covering all rows.
        # np.concatenate rejects an empty list, so handle an empty dataset.
        if label_preds:
            preds_l.append(np.concatenate(label_preds))
        else:
            preds_l.append(np.empty(0, dtype=int))

# The per-batch updates sum to exactly `total_labels * total_rows`, so the bar
# is already at 100% when the `with` block closes it; no manual adjustment of
# the (closed) bar is needed afterwards.

Making Predictions: 100%|███████████████████████████████████████████████| 11526320/11526320 [5:20:04<00:00, 600.18it/s]


In [17]:
# Arrange the predictions as an (n_labels, n_rows) matrix so that every label
# gets its own full column of per-row predictions.
# `preds_l` is ordered label by label (all rows for the first label, then all
# rows for the second, ...). Zipping a flat sequence of predictions directly
# with `labels` would hand each label a single scalar, which pandas then
# broadcasts down the entire column. The reshape below accepts either a flat
# label-major sequence or one equal-length array per label.
preds_matrix = np.asarray(preds_l)
expected_size = len(labels) * len(df)
if preds_matrix.size != expected_size:
    raise ValueError(
        f'Expected {expected_size} predictions '
        f'({len(labels)} labels x {len(df)} rows), got {preds_matrix.size}'
    )
preds_matrix = preds_matrix.reshape(len(labels), len(df))

# Transpose to rows x labels; reusing df.index keeps the concat aligned.
df_preds = pd.DataFrame(preds_matrix.T, columns=labels, index=df.index)
df_results = pd.concat([df, df_preds], axis=1)

df_results.to_csv(path_inference_set_output)

In [19]:
# Display the combined frame: the inference-set columns followed by one
# prediction column per label.
df_results

Unnamed: 0,text,id,ISSUE10,ISSUE11,ISSUE12,ISSUE13,ISSUE14,ISSUE15,ISSUE16,ISSUE17,...,ISSUE105,ISSUE106,ISSUE111,ISSUE118,ISSUE200,ISSUE208,ISSUE210,ISSUE212,ISSUE218,ISSUE221
0,"Demand, I will say for computer coding...",CR18032445206145531905__google_asr_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Good afternoon. Good afternoon. We're ...,CR02245745215467945985__google_asr_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,FOR PUBLIC EDUCATION PAID FOR BY JIM PORT...,CR16409446952694972417__ad_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Welcome back, Madam Secretary mr. Deputy ...",CR03842772866831482881__google_asr_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Clark Tina Peters did her job and that is ...,CR12955430473680551937__google_asr_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177323,对 2022 年环境债券法进行投票\n债券法案将于11月8日进行投票。\n您可以选择缺席投票...,CR14512177241682608129__ad_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
177324,選票提案 1,CR05144676822612443137__ad_title,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
177325,"입법부가 명시한 대로 NY 천연 자원을 보존, 강화 및 복원하기 위해 통과되었습니다.",CR05087187757642547201__ad_text,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
177326,제안 1에 투표,CR05087187757642547201__ad_title,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
