In [12]:
import time 
import pandas as pd
from whatlies.language import BytePairLanguage, UniversalSentenceLanguage, SentenceTFMLanguage, CountVectorLanguage

lang_use = UniversalSentenceLanguage("large")
lang_bp  = BytePairLanguage("en", dim=300, vs=200_000)
lang_brt = SentenceTFMLanguage('distilbert-base-nli-stsb-mean-tokens')

Let's load the dataset and make some utility functions to get it in the right format.

In [13]:
def clean_text(txt_col):
    return txt_col.str.replace(r'\n', " ")

def to_train_df(dataf):
    dataf = dataf.copy() 
    dataf['bad'] = dataf[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
    dataf['label'] = ['toxic' if b else 'fine' for b in dataf['bad'] != 0]
    dataf['text'] = clean_text(dataf['comment_text'])
    return dataf[['text', 'label']]

df = pd.read_csv("toxicity-train.csv.zip").replace({"\n", ""})

Next, let's make a proper train/test split.

In [17]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

train_df = df.pipe(to_train_df)[:10000]

x_train, x_test, y_train, y_test = train_test_split(list(train_df['text']), train_df['label'])

In [28]:
train_df.groupby("label").count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
fine,8970
toxic,1030


In [29]:
for mod in [SVC(class_weight="balanced"), LogisticRegression(solver='liblinear', class_weight="balanced")]:
    pipe = Pipeline([
        ("feat", CountVectorizer()),
        ("mod", mod)
    ])
    tic = time.time()
    pipe.fit(list(x_train), y_train)
    toc = time.time() 
    print(f"report for {str(lang), str(mod)}")
    print(f"train time: {toc - tic}")
    tic = time.time()
    y_pred = pipe.predict(x_test)
    toc = time.time()
    print(f"pred time: {toc - tic}")
    print(classification_report(y_test, y_pred))

report for ('CountVectorizer()', "SVC(class_weight='balanced')")
train time: 9.83952283859253
pred time: 3.0086185932159424
              precision    recall  f1-score   support

        fine       0.97      0.79      0.87      2251
       toxic       0.30      0.82      0.44       249

    accuracy                           0.79      2500
   macro avg       0.64      0.80      0.65      2500
weighted avg       0.91      0.79      0.83      2500

report for ('CountVectorizer()', "LogisticRegression(class_weight='balanced', solver='liblinear')")
train time: 0.7045333385467529
pred time: 0.1499195098876953
              precision    recall  f1-score   support

        fine       0.97      0.97      0.97      2251
       toxic       0.70      0.70      0.70       249

    accuracy                           0.94      2500
   macro avg       0.84      0.83      0.83      2500
weighted avg       0.94      0.94      0.94      2500



In [31]:
import pandas as pd 
df = pd.read_csv("results.csv")

In [34]:
df.sort_values('precision')

Unnamed: 0,lang,mod,precision,recall,pred-time,train-time
1,CountVectorizer(),LogisticRegression(),0.807229,0.538153,0.085694,0.88242
0,CountVectorizer(),SVC(),1.0,0.032129,1.634564,5.422204


In [42]:
pipe = Pipeline([
    ("feat", lang_use),
    ("mod", LogisticRegression(solver='liblinear', class_weight="balanced"))
])

pipe.fit(list(x_train), y_train)

In [None]:
pipe.predict(["you're a bad ass!"])