In [1]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from nltk.stem.snowball import SnowballStemmer
import pickle

df = pd.read_csv("datasets/train_15000.csv")
stop_words = stopwords.words('english')

In [2]:
def clean_text(sentence):
    sentence = re.sub(r"what's", "what is ", sentence)
    sentence = re.sub(r"\'s", " ", sentence)
    sentence = re.sub(r"\'ve", " have ", sentence)
    sentence = re.sub(r"can't", "can not ", sentence)
    sentence = re.sub(r"n't", " not ", sentence)
    sentence = re.sub(r"i'm", "i am ", sentence)
    sentence = re.sub(r"\'re", " are ", sentence)
    sentence = re.sub(r"\'d", " would ", sentence)
    sentence = re.sub(r"\'ll", " will ", sentence)
    sentence = re.sub(r"\'scuse", " excuse ", sentence)
    sentence = re.sub('\W', ' ', sentence)
    sentence = re.sub('\s+', ' ', sentence)
    sentence = re.sub(r'[?|!|\'|"|#–—-]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)
    sentence = ' '.join([word for word in sentence.split() if word not in stop_words])
    sentence = sentence.strip()
    sentence = sentence.replace("\n"," ")
    stemmer = SnowballStemmer("english")
    stemmedSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemmedSentence += stem
        stemmedSentence += " "
    stemmedSentence = stemmedSentence.strip()
    return stemmedSentence

df['comment_text'] = df['comment_text'].map(lambda x : clean_text(x))

In [3]:
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train, test = sklearn.model_selection.train_test_split(df, random_state=42, test_size=0.1, shuffle=True)
X_train = train.comment_text
X_test = test.comment_text

In [4]:
models = []
for i in range(len(categories)):
    models.append(Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsOneClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ]))
for i in range(len(categories)):
    models[i].fit(X_train, train[categories[i]])

In [5]:
for i in range(len(categories)):
    prediction = models[i].predict(X_test)
    print('accuracy_score:', (accuracy_score(test[categories[i]], prediction)))
    print('f1_score:', (f1_score(test[categories[i]], prediction)))

accuracy_score: 0.9590800852237122
f1_score: 0.7391130643228125
accuracy_score: 0.990913648326858
f1_score: 0.3438914027149321
accuracy_score: 0.9775661110414839
f1_score: 0.746458923512748
accuracy_score: 0.9977440782052889
f1_score: 0.2173913043478261
accuracy_score: 0.9703596941972679
f1_score: 0.6347490347490348
accuracy_score: 0.991978944729916
f1_score: 0.2727272727272727


In [6]:
def predict_categories(models, categories, text):
    text = clean_text(text)
    predictions = {}
    for i in range(6):
        predictions[format(categories[i])] = models[i].predict([text])[0]
    return predictions

In [7]:
prediction = predict_categories(models, categories, "")
prediction

{'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [8]:
for i in range(len(categories)):
    pickle.dump(models[i], open("sklearn_toxicity_models/model" + str(i) + ".sav", "wb"))