In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix

In [2]:
# Load the dataset from 'clean.csv' (path is relative to the working directory).
train_file = pd.read_csv('clean.csv')

In [3]:
# Show the column names and the (rows, columns) shape of the loaded frame.
train_file.columns, train_file.shape

(Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate', 'label'],
       dtype='object'),
 (159513, 8))

In [4]:
# Convert every label to a string and left-pad it with '0' up to
# `max_label_length` characters (e.g. 100 -> '000100').
# str.rjust returns the string unchanged when it is already that long or
# longer, so no separate length check is required.
# NOTE(review): presumably one digit per toxicity column - confirm against the
# step that produced 'clean.csv'.
max_label_length = 6
new_label = [str(each).rjust(max_label_length, '0') for each in train_file['label']]

In [5]:
# Overwrite the label column with the zero-padded string labels in new_label.
train_file['label'] = new_label

In [6]:
# The six per-category indicator columns are removed; only 'comment_text' and
# 'label' remain.
category_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_file = train_file.drop(columns=category_columns)

In [7]:
# A label of '000000' marks a comment with no toxicity flag set; every other
# label goes to the toxic frame. Both frames are re-indexed from 0.
has_no_flags = train_file['label'] == '000000'
non_toxic = train_file[has_no_flags].reset_index(drop=True)
all_toxic = train_file[~has_no_flags].reset_index(drop=True)

In [8]:
# Show how many rows ended up in each frame (toxic first, then non-toxic).
all_toxic.shape, non_toxic.shape

((16225, 2), (143288, 2))

In [9]:
# Drop the reference to the full frame now that it has been split into
# all_toxic and non_toxic.
del train_file

In [10]:
# One binary target per row: 1 for every toxic comment, 0 for every non-toxic one.
toxic_labels = [1] * len(all_toxic)
non_toxic_labels = [0] * len(non_toxic)

In [11]:
# Replace the string label in each frame with its binary target
# (toxic_labels is all 1s, non_toxic_labels is all 0s).
all_toxic['label'] = toxic_labels
non_toxic['label'] = non_toxic_labels

In [12]:
# Preview the first four rows of each frame to check the new binary labels.
non_toxic.head(4), all_toxic.head(4)

(                                        comment_text  label
 0  explanation edits made username hardcore metal...      0
 1  aww matches background colour seemingly stuck ...      0
 2  hey man really trying edit war guy constantly ...      0
 3  make real suggestions improvement wondered sec...      0,
                                         comment_text  label
 0                        cocksucker piss around work      1
 1  hey talk exclusive group wp talibans good dest...      1
 2            bye look come think comming back tosser      1
 3  gay antisemmitian archangel white tiger meow g...      1)

In [13]:
# Stack the non-toxic rows on top of the toxic rows; ignore_index=True gives
# the combined frame a fresh 0..n-1 index.
whole_frame = pd.concat([non_toxic, all_toxic], ignore_index=True)

In [14]:
# Two stratified splits with the same settings: first 90% train / 10% holdout,
# then the holdout is split again 90% validation / 10% test. Stratifying on
# 'label' keeps the class ratio in every part.
split_options = dict(test_size=0.1, random_state=42, shuffle=True)

train, non_train = train_test_split(
    whole_frame, stratify=whole_frame['label'], **split_options
)
val, test = train_test_split(
    non_train, stratify=non_train['label'], **split_options
)
train.shape, val.shape, test.shape

((143561, 2), (14356, 2), (1596, 2))

In [15]:
del non_toxic
del all_toxic


def _features_and_labels(frame):
    """Return the (comment_text, label) Series of `frame`, each re-indexed from 0."""
    return (
        frame['comment_text'].reset_index(drop=True),
        frame['label'].reset_index(drop=True),
    )


x_train, y_train = _features_and_labels(train)
x_val, y_val = _features_and_labels(val)
x_test, y_test = _features_and_labels(test)

# The split frames are no longer needed once text and labels are extracted.
del train, val, test

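#### The corpus is x_train.
#### The output shape will be (number of data points, features).
#### features - the tf-idf score of each term. TfidfVectorizer() is used with its default ngram_range=(1, 1), so the terms are unigrams only (pass ngram_range=(1, 2) to add bigrams).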

In [16]:
# Learn the vocabulary and IDF weights from the training comments only, then
# reuse the fitted vectorizer for the validation and test comments so that all
# three matrices share the same feature columns.
# fit_transform is documented as equivalent to fit followed by transform, but
# more efficiently implemented, so the training corpus is not processed twice.
# TfidfVectorizer() is used with its defaults (ngram_range=(1, 1)), i.e. the
# features are single-word terms.
tf_idf = TfidfVectorizer()
X_train = tf_idf.fit_transform(x_train)
X_val = tf_idf.transform(x_val)
X_test = tf_idf.transform(x_test)
X_train.shape, X_val.shape, X_test.shape

((143561, 158647), (14356, 158647), (1596, 158647))

In [17]:
# Each display name is kept next to its estimator so the two cannot drift out
# of sync; `models` and `models_name` are derived from the same mapping and
# therefore share the same order.
named_models = {
    'random forest': RandomForestClassifier(n_estimators=60, max_depth=3, random_state=0),
    'support vector machine': LinearSVC(),
    'naive bayes': MultinomialNB(),
    'logistic regression': LogisticRegression(random_state=0),
}
models = list(named_models.values())
models_name = list(named_models.keys())

#### Trained models are saved in the `trained_model` list.

In [18]:
# Fit every candidate on the training matrix and print its confusion matrix
# and metrics on the validation split. Each fitted estimator is appended to
# `trained_model`, in the same order as `models`.
# NOTE(review): zero_division=1 scores an undefined metric (e.g. the precision
# of a class that is never predicted) as 1, which can inflate the weighted
# average for a model that predicts a single class.
weighted_metrics = (
    ('f1 score', f1_score),
    ('recall', recall_score),
    ('precision', precision_score),
)

trained_model = []
for index, model in enumerate(models):
    print(models_name[index])

    m1 = model.fit(X_train, y_train)
    trained_model.append(m1)

    preds = m1.predict(X_val)
    conf_mat = confusion_matrix(y_val, preds)
    print('Confusion matrix')
    print(conf_mat)

    for metric_label, metric_fn in weighted_metrics:
        print(metric_label, metric_fn(y_val, preds, average='weighted', zero_division=1))
    print('accuracy', accuracy_score(y_val, preds))
    print('********************************')

    # Optional heatmap of the confusion matrix (disabled):
    # fig, ax = plt.subplots(figsize=(5,5))
    # sns.heatmap(conf_mat, annot=True)
    # plt.ylabel('Actual')
    # plt.xlabel('Predicted')
    # plt.show()

random forest
Confusion matrix
[[12895     0]
 [ 1461     0]]
f1 score 0.8500741213236016
recall 0.8982307049317358
precision 0.9085876943504273
accuracy 0.8982307049317358
********************************
support vector machine
Confusion matrix
[[12749   146]
 [  433  1028]]
f1 score 0.9576939481755302
recall 0.9596684313179159
precision 0.9578389295535076
accuracy 0.9596684313179159
********************************
naive bayes
Confusion matrix
[[12892     3]
 [ 1147   314]]
f1 score 0.8958244920999688
recall 0.9198941209250487
precision 0.9256505585088531
accuracy 0.9198941209250487
********************************
logistic regression
Confusion matrix
[[12817    78]
 [  573   888]]
f1 score 0.9504557196605109
recall 0.9546531067149624
precision 0.9533445008954489
accuracy 0.9546531067149624
********************************
