In [None]:
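# Dataset reference:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
# Uncomment the next line to download the CSV read later in this notebook
# (-nc skips the download when the file already exists locally):
#!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv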

In [None]:
from transformers import pipeline

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
#import textwrap

from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [None]:
# Build the Hugging Face zero-shot classification pipeline. No model is named
# here, so the library's default checkpoint for this task is used.
# NOTE(review): device=0 presumably targets the first GPU — this cell is
# expected to fail on a machine without one; confirm the runtime has a GPU.
classifier = pipeline(
  task="zero-shot-classification",
  device=0,
)

In [None]:
# Smoke test: score one short sentence against two sentiment labels.
classifier(
  "This is a great movie",
  candidate_labels=["positive", "negative"],
)

In [None]:
# Sample passage taken from:
# https://en.wikipedia.org/wiki/AMP-activated_protein_kinase
# The fragments are joined with single spaces to form one string.
text = " ".join((
  "Due to the presence of isoforms of its components, there are 12",
  "versions of AMPK in mammals, each of which can have different tissue",
  "localizations, and different functions under different conditions.",
  "AMPK is regulated allosterically and by post-translational",
  "modification, which work together.",
))
# Score the passage against three candidate subject areas.
classifier(
  text,
  candidate_labels=["biology", "math", "geology"],
)

In [None]:
# Load the BBC text-classification CSV. The local copy is preferred; when it
# has not been downloaded yet, the same file is read straight from its URL so
# that a fresh kernel / fresh checkout can still run the notebook end to end.
DATA_PATH = 'bbc_text_cls.csv'
DATA_URL = 'https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv'

try:
  df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
  # No local file: pandas can read a CSV directly from an http(s) URL.
  df = pd.read_csv(DATA_URL)

# Number of rows loaded (displayed as the cell output).
len(df)

In [None]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=5)

In [None]:
# Collect the distinct class names used as zero-shot candidate labels.
# Sorting gives a stable order: iterating a bare set of strings can change
# between kernel sessions (string hashing is randomized per process), which
# would shuffle the label -> index mapping and the confusion-matrix axes
# from run to run.
labels = sorted(set(df['labels']))
labels


In [None]:
# Show the full article text stored at row position 1024.
print(df['text'].iloc[1024])

In [None]:
# True label of the same article (row position 1024).
df['labels'].iloc[1024]

In [None]:
# Zero-shot prediction for that single article, using the dataset's own
# class names as the candidate labels.
classifier(
  df['text'].iloc[1024],
  candidate_labels=labels,
)

In [None]:
%%time
preds = classifier(df['text'].tolist(), candidate_labels=labels)

In [None]:
# Inspect the raw pipeline output for the first document; the cells below
# read its 'labels' and 'scores' entries.
preds[0]

In [None]:
# Take the first entry of each result's 'labels' list as the predicted class.
# NOTE(review): this relies on the pipeline returning labels ordered by
# descending score — confirm against the zero-shot pipeline documentation.
predicted_labels = [result['labels'][0] for result in preds]

# Store the predictions next to the true labels for the metrics below.
df['predicted_labels'] = predicted_labels

In [None]:
# Accuracy: fraction of rows whose predicted label equals the true label.
is_correct = df['predicted_labels'] == df['labels']
print("Acc:", is_correct.mean())

In [None]:
# Rearrange the per-document scores into an N x K matrix whose columns follow
# the order of `labels` (each result lists its labels in its own order, so the
# scores have to be routed to the right column by name).

N, K = len(df), len(labels)

# class name -> column index, following the order of `labels`
label2idx = {name: col for col, name in enumerate(labels)}

probs = np.zeros((N, K))
for row in range(N):
  result = preds[row]
  # 'labels' and 'scores' are parallel lists: pair them up and write each
  # score into the column that belongs to its label
  for name, score in zip(result['labels'], result['scores']):
    probs[row, label2idx[name]] = score

In [None]:
# Encode the true labels with the same label -> column mapping used for
# `probs`, pick the highest-scoring column as the predicted class, and build
# the confusion matrix with normalize='true' (normalized over the true labels).
int_labels = [label2idx[name] for name in df['labels']]
int_preds = probs.argmax(axis=1)
cm = confusion_matrix(y_true=int_labels, y_pred=int_preds, normalize='true')

In [None]:
# The confusion matrix is plotted by hand with seaborn. Original note:
# Scikit-Learn is transitioning to V1 but it's not available on Colab, and
# the changes modify how confusion matrices are plotted.
def plot_cm(cm):
  """Draw an annotated heatmap of a confusion matrix.

  Rows and columns are named with the notebook-level `labels` list, so `cm`
  needs one row and one column per entry of `labels`, in that order.

  Args:
    cm: confusion-matrix values, in a form accepted by `pd.DataFrame`.
  """
  heatmap_data = pd.DataFrame(cm, index=labels, columns=labels)
  ax = sn.heatmap(heatmap_data, annot=True, fmt='.2g')
  ax.set(xlabel="Predicted", ylabel="Target")

plot_cm(cm)

In [None]:
# Micro-averaged F1 over the string labels.
# NOTE(review): with exactly one label per document this is expected to match
# the accuracy printed earlier — worth confirming on the output.
f1_score(
  y_true=df['labels'],
  y_pred=predicted_labels,
  average='micro',
)

In [None]:
# Multiclass ROC AUC computed from the score matrix, using the one-vs-one
# scheme; the columns of `probs` line up with the integer codes in int_labels.
roc_auc_score(
  y_true=int_labels,
  y_score=probs,
  multi_class='ovo',
)