In [42]:
import pandas as pd
import spacy

In [43]:

nlp = spacy.load('en_core_web_sm')
df = pd.read_csv('train.csv')
df_clean = df
df_clean['isToxic'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].any(axis=1).astype(int)
df_clean = df_clean[['comment_text', 'isToxic']].copy()
df_clean.rename(columns={'comment_text': 'Text'}, inplace=True)

In [44]:
df_toxic = df_clean[df_clean['isToxic'] == 1]
df_non_toxic = df_clean[df_clean['isToxic'] == 0].sample(n=len(df_toxic))
df_equilibre = pd.concat([df_toxic, df_non_toxic])

In [45]:
def treat_comment(comment):
    spacy_comment = nlp(comment, disable=["parser", "tagger", "ner", "textcat"])
    treated_tokens = [w.text for w in spacy_comment if w.is_alpha and not w.is_stop]
    return " ".join(treated_tokens)
df_equilibre['Text'] = df_equilibre['Text'].map(treat_comment)
df_equilibre.head()



Unnamed: 0,Text,isToxic
6,COCKSUCKER PISS WORK,1
12,Hey talk exclusive group WP TALIBANS good dest...,1
16,Bye look come think comming Tosser,1
42,gay antisemmitian Archangel WHite Tiger Meow G...,1
43,FUCK FILTHY MOTHER ASS DRY,1


In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [47]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_equilibre['Text'])
Y = df_equilibre['isToxic']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25,random_state=42)
model = LogisticRegression()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
accuracy_score(Y_test, y_pred)

0.893134475533095

In [48]:
import joblib

In [49]:
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(model, 'model.joblib') 

['model.joblib']

In [50]:
from sklearn.metrics import confusion_matrix, classification_report

In [51]:
matrice = confusion_matrix(Y_test, y_pred)
print(matrice)
print(classification_report(y_pred, Y_test))

[[3769  272]
 [ 595 3477]]
              precision    recall  f1-score   support

           0       0.93      0.86      0.90      4364
           1       0.85      0.93      0.89      3749

    accuracy                           0.89      8113
   macro avg       0.89      0.90      0.89      8113
weighted avg       0.90      0.89      0.89      8113

