In [1]:
# Directory holding the CSV files. The trailing slash is required because the
# file names are appended to this value by plain string concatenation.
path='dataset/'

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the training and test CSV files from the dataset directory, in that order.
train, test = (pd.read_csv(path + file_name) for file_name in ("train.csv", "test.csv"))

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
import re

In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
# Lazily-filled cache so the stop-word set and the stemmer are built once,
# not once per comment.
_CLEAN_RESOURCES = {}


def _clean_resources():
  """Return the (stop-word set, stemmer) pair, building it on first use only."""
  if not _CLEAN_RESOURCES:
    stop_words = frozenset(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Store both together so a failure above never leaves a half-filled cache.
    _CLEAN_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
  return _CLEAN_RESOURCES['stop_words'], _CLEAN_RESOURCES['stemmer']


def clean(comment):
  """Normalise one raw comment into a space-separated string of stemmed tokens.

  Steps: lower-case the text, replace every non-letter character with a
  space, split on whitespace, drop English stop words and any token of 30 or
  more characters, stem the remaining tokens with the Porter stemmer, and
  join them with single spaces.

  Args:
    comment: the raw comment text (a ``str``).

  Returns:
    The cleaned comment as a single string; empty if no token survives.

  Raises:
    AttributeError: if ``comment`` has no ``lower`` method (e.g. a missing
      value that is not a string).
  """
  stop_words, stemmer = _clean_resources()
  comment = comment.lower()
  comment = re.sub('[^a-zA-Z]', ' ', comment)
  # split() with no argument already ignores leading/trailing whitespace.
  words = comment.split()
  return ' '.join(
      stemmer.stem(word)
      for word in words
      if word not in stop_words and len(word) < 30
  )

In [10]:
def vectorise(comments, max_features=5000, dense=True, return_vectorizer=False):
  """Fit a bag-of-words ``CountVectorizer`` on ``comments`` and transform them.

  With the default arguments this behaves as before: a dense count array
  limited to 5000 features is returned.

  Args:
    comments: iterable of text documents to fit on and transform.
    max_features: value passed to ``CountVectorizer(max_features=...)``.
    dense: when True (default) the result is converted with ``toarray()``;
      when False the matrix returned by ``fit_transform`` is returned as is,
      which avoids materialising a documents-by-features dense array.
    return_vectorizer: when True, also return the fitted vectorizer so other
      text (for example a test set) can be transformed with the same
      vocabulary.

  Returns:
    The count matrix, or a ``(matrix, vectorizer)`` tuple when
    ``return_vectorizer`` is True.
  """
  vectorizer = CountVectorizer(max_features=max_features)
  X = vectorizer.fit_transform(comments)
  if dense:
    X = X.toarray()
  if return_vectorizer:
    return X, vectorizer
  return X

In [12]:
# Extract the comment column as a plain Python list and show how many entries it has.
raw_comments = train['comment_text'].tolist()
len(raw_comments)

159571

In [14]:
# Clean every raw comment, printing a progress marker every 10000 rows.
comments=[]
for i,com in enumerate(raw_comments):
  if(i%10000==0): print(i)
  try:
    comments.append(clean(com))
  except Exception:
    # Catch Exception rather than using a bare except, so that interrupting
    # the kernel still stops the loop. Report the offending row, then append
    # an empty placeholder: skipping the row would leave `comments` shorter
    # than `raw_comments` and shift every later comment against its labels.
    print(i,com)
    comments.append('')

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000


In [15]:
# Convert the cleaned comments into a bag-of-words count matrix via vectorise()
# and display its (rows, features) shape.
x=vectorise(comments)
x.shape

(159571, 5000)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

In [17]:
# Select the target columns by position: everything from the third column on.
# NOTE(review): this relies on the CSV column order (id, comment_text, then the
# labels) shown in the train.head() preview; confirm if the file layout changes.
labels=train.iloc[:,2:]
labels.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [18]:
# Names of the label columns, in column order, as a plain list.
classes = labels.columns.tolist()
classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [19]:
# Display the first ten label rows again (same view as the earlier cell).
labels.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [20]:
# Build the (n_samples, n_labels) target array in one vectorised step instead
# of iterating over the frame row by row in Python. np.array() copies, so `y`
# is an independent array, and an empty frame still yields a 2-D result.
y = np.array(labels.values)

In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(x,y,test_size=0.2,random_state=1)

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [23]:
# Train one independent binary Multinomial Naive Bayes classifier per label
# column and report train/validation accuracy, the validation confusion matrix
# and precision / recall / F1 for each label, followed by the means.

# Fitted classifier for every label, keyed by label name, so each one remains
# available after the loop (for example to predict on other data).
models = {}

train_acc = []
val_acc = []

preds_train = []
preds_val = []

f1_val = []

for i, label_name in enumerate(classes):
    print('Class: ',label_name)
    # Use a fresh estimator per label; refitting a single shared instance
    # would overwrite the previous label's classifier. After the loop `model`
    # still refers to the classifier of the last label, as before.
    model = MultinomialNB()
    model.fit(X_train,y_train[:,i])
    models[label_name] = model

    preds_train_class = model.predict(X_train)
    train_acc_class = accuracy_score(y_train[:,i], preds_train_class)
    print('Train Accuracy:', train_acc_class)
    train_acc.append(train_acc_class)
    preds_train.append(preds_train_class)

    preds_val_class = model.predict(X_val)
    val_acc_class = accuracy_score(y_val[:,i], preds_val_class)
    print('Val Accuracy:', val_acc_class)
    val_acc.append(val_acc_class)
    preds_val.append(preds_val_class)

    # Force a 2x2 matrix: without explicit labels the matrix collapses to 1x1
    # when only one class is present, and the four-way unpacking below fails.
    # Assumes the targets are encoded as 0/1, as in the label preview.
    cm = confusion_matrix(y_val[:,i], preds_val_class, labels=[0, 1])
    print(cm)

    tn, fp, fn, tp = cm.ravel()
    # The small epsilon keeps the ratios defined when a denominator is zero.
    precision= tp/(tp+fp+1e-5)
    recall= tp/(tp+fn+1e-5)
    f1_score= (2*precision*recall)/(precision+recall+1e-5)
    print('precision {:.4f} recall {:.4f} F1 score {:.4f}:'.format(precision,recall,f1_score))
    f1_val.append(f1_score)

    print()
    
print('mean train accuracy : ', np.mean(train_acc))
print('mean val accuracy :', np.mean(val_acc))
print('mean val F1 score :', np.mean(f1_val))

Class:  toxic
Train Accuracy: 0.950382277370433
Val Accuracy: 0.9492088359705467
[[28095   717]
 [  904  2199]]
precision 0.7541 recall 0.7087 F1 score 0.7307:

Class:  severe_toxic
Train Accuracy: 0.9826643479350755
Val Accuracy: 0.9821087263042456
[[31123   489]
 [   82   221]]
precision 0.3113 recall 0.7294 F1 score 0.4363:

Class:  obscene
Train Accuracy: 0.9703656702387666
Val Accuracy: 0.9667241109196303
[[29577   651]
 [  411  1276]]
precision 0.6622 recall 0.7564 F1 score 0.7061:

Class:  threat
Train Accuracy: 0.9876073196716175
Val Accuracy: 0.9865580448065173
[[31429   385]
 [   44    57]]
precision 0.1290 recall 0.5644 F1 score 0.2099:

Class:  insult
Train Accuracy: 0.9644043366547597
Val Accuracy: 0.9618047939840201
[[29578   763]
 [  456  1118]]
precision 0.5944 recall 0.7103 F1 score 0.6472:

Class:  identity_hate
Train Accuracy: 0.9807999623989472
Val Accuracy: 0.9796647344508852
[[31100   529]
 [  120   166]]
precision 0.2388 recall 0.5804 F1 score 0.3384:

mean train