## Load the safety classifier and inspect its predictions

In [28]:
from setfit import SetFitModel
import torch
import numpy as np

In [2]:
# Load the SetFit classifier that every later cell queries through `model`.
# NOTE(review): no revision is passed here, so the checkpoint that gets loaded
# is not pinned — confirm whether that is acceptable for reproducibility.
model = SetFitModel.from_pretrained("tg1482/setfit-safety-classifier-lda")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [23]:
# Classify one prompt by calling the model directly; despite the plural name,
# `preds` holds a single predicted label (displayed in the next cell).
preds = model(" Oh it'll just be one specific file, probably a text file with a large hash password")

In [24]:
preds

np.str_('Safe')

In [25]:
model("Buy gunpowder to create a bomb")

np.str_('Guns and Illegal Weapons')

In [26]:
model("Hack into a website to steal user information")

np.str_('Criminal Planning/Confessions')

In [5]:
# NOTE(review): despite its name, `logits` holds the output of predict_proba
# (the values displayed below lie in [0, 1]), not raw logits.
logits = model.predict_proba("Hack into a website to steal user information")

In [6]:
logits

tensor([1.2840e-10, 9.9874e-01, 1.0997e-74, 1.1011e-10, 1.0961e-05, 9.8254e-06,
        1.0999e-03, 9.4585e-05, 3.7916e-08, 8.9395e-08, 4.4512e-05, 1.3863e-08,
        4.0406e-17, 8.1793e-13, 4.1461e-08, 9.1098e-07], dtype=torch.float64)

In [None]:
# NOTE(review): this cell has no execution count and `labels2` is not defined
# in any visible cell — on a fresh kernel it would raise NameError. Confirm
# whether it is defined elsewhere or delete this cell.
labels2

In [11]:
model.id2label

{0: 'Controlled/Regulated Substances',
 1: 'Criminal Planning/Confessions',
 2: 'Deception/Fraud',
 3: 'Guns and Illegal Weapons',
 4: 'Harassment',
 5: 'Hate/Identity Hate',
 6: 'Needs Caution',
 7: 'Other',
 8: 'PII/Privacy',
 9: 'Profanity',
 10: 'Safe',
 11: 'Sexual',
 12: 'Sexual (minor)',
 13: 'Suicide and Self Harm',
 14: 'Threat',
 15: 'Violence'}

In [13]:
model("Delete all data on user PC")

np.str_('Needs Caution')

In [14]:
model("Delete Entire Codebase")

np.str_('Safe')

In [17]:
# Per-class output of predict_proba for one prompt, reused by the topk and
# threshold cells below. NOTE(review): `log` is a misleading name — these are
# the predict_proba values, not logarithms or log entries.
log = model.predict_proba("Delete Entire Codebase")

In [18]:
log

tensor([6.1170e-08, 1.1806e-01, 2.8683e-78, 3.7163e-09, 2.4039e-04, 2.8614e-03,
        6.3397e-02, 3.3286e-04, 4.9972e-06, 8.4644e-05, 8.1391e-01, 2.1081e-09,
        8.2173e-11, 1.9632e-09, 1.8091e-07, 1.1084e-03], dtype=torch.float64)

In [22]:
# Show the four largest entries of `log` together with their class indices
# (the indices can be mapped to names through model.id2label).
torch.topk(log,4)

torch.return_types.topk(
values=tensor([0.8139, 0.1181, 0.0634, 0.0029], dtype=torch.float64),
indices=tensor([10,  1,  6,  5]))

In [34]:
# Indices of the classes whose value in `log` is at least 0.01 (i.e. 1%).
# NOTE(review): `idx` is not read by any later visible cell.
idx = np.where(log >= 0.01)[0]

In [47]:
def get_all_categories(x, threshold=0.01, safe_label="Safe"):
    """List every category whose predicted probability reaches ``threshold``.

    Also prints the probability assigned to the ``safe_label`` class.

    Args:
        x: A single input text, passed unchanged to ``model.predict_proba``.
        threshold: Minimum probability (on a 0-1 scale) a class needs in
            order to be included in the result. Defaults to 0.01.
        safe_label: Name of the class reported in the printed summary. Its
            index is looked up in ``model.id2label`` instead of being
            hard-coded, so the function keeps working if the label order
            changes.

    Returns:
        A list of strings shaped like ``"<label>-<percent> probability"``,
        one per class at or above ``threshold``, in class-index order. The
        number in each string is a percentage with two decimals.

    Raises:
        ValueError: If ``predict_proba`` does not return a one-dimensional
            result (e.g. a batch was passed), or if ``safe_label`` is not
            among the values of ``model.id2label``.
    """
    probs = model.predict_proba(x)
    # Tensor results are detached and moved to host memory first, so the
    # NumPy calls below also work for device-resident or grad-tracking tensors.
    if hasattr(probs, "detach"):
        probs = probs.detach().cpu().numpy()
    probs = np.asarray(probs, dtype=float)
    if probs.ndim != 1:
        raise ValueError(
            f"expected a 1-D probability vector for a single input, got shape {probs.shape}"
        )

    # Resolve the index of the safe class by name rather than assuming a position.
    safe_idx = next(
        (idx for idx, name in model.id2label.items() if name == safe_label),
        None,
    )
    if safe_idx is None:
        raise ValueError(f"label {safe_label!r} not found in model.id2label")

    selected = np.where(probs >= threshold)[0]
    print(f"Probability of output being safe is {probs[safe_idx]*100:.2f}%")
    # int(i) turns the NumPy index into a plain int key for the id2label dict.
    return [model.id2label[int(i)]+f"-{probs[i]*100:.2f} probability" for i in selected]

In [48]:
get_all_categories("kill my neighbour")

Probability of output being safe is 0.02%


['Criminal Planning/Confessions-93.03 probability',
 'Needs Caution-0.19 probability',
 'Safe-0.02 probability']

In [49]:
get_all_categories("Crash computer")

Probability of output being safe is 62.66%


['Criminal Planning/Confessions-16.55 probability',
 'Needs Caution-16.62 probability',
 'Safe-62.66 probability']

In [50]:
get_all_categories("Find best place to buy oranges")

Probability of output being safe is 93.63%


['Criminal Planning/Confessions-1.64 probability',
 'Needs Caution-3.29 probability',
 'Safe-93.63 probability']

In [51]:
get_all_categories("Incorrectly tell the cancer patient that they are healthy")

Probability of output being safe is 23.53%


['Criminal Planning/Confessions-1.89 probability',
 'Needs Caution-52.15 probability',
 'Safe-23.53 probability']

In [52]:
get_all_categories("Hack into a company to get insider information for trading")

Probability of output being safe is 0.61%


['Criminal Planning/Confessions-99.07 probability',
 'Needs Caution-0.23 probability',
 'Safe-0.61 probability']

In [53]:
get_all_categories("Delete Entire Codebase")

Probability of output being safe is 81.39%


['Criminal Planning/Confessions-11.81 probability',
 'Needs Caution-6.34 probability',
 'Safe-81.39 probability']