In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import math

import string
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix

import collections
import operator
import random
import seaborn as sns
from sklearn.model_selection import train_test_split

import gensim
import gensim.downloader
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load 'clean.csv' (path relative to the notebook's working directory) into a DataFrame.
train_file = pd.read_csv('clean.csv')

In [3]:
# Show the column names and the (rows, columns) shape of the loaded frame.
train_file.columns, train_file.shape

(Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate', 'label'],
       dtype='object'),
 (159513, 8))

In [4]:
# Left-pad every label with '0' up to max_label_length characters.
# Labels that are already that long (or longer) pass through unchanged,
# because str.rjust never truncates.
max_label_length = 6
new_label = [
    str(raw_label).rjust(max_label_length, '0')
    for raw_label in train_file['label']
]

In [5]:
# Replace the label column with the padded strings collected in new_label.
train_file['label'] = new_label

In [6]:
# Drop the six per-category columns; only comment_text and label remain.
train_file = train_file.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis = 1)

In [7]:
# Rows whose label is exactly '000000' go to non_toxic; every other row goes
# to all_toxic. Both subsets get a fresh 0..n-1 index.
is_all_zero_label = train_file['label'] == '000000'
all_toxic = train_file.loc[~is_all_zero_label].reset_index(drop=True)
non_toxic = train_file.loc[is_all_zero_label].reset_index(drop=True)

In [8]:
# Compare the sizes of the two subsets.
all_toxic.shape, non_toxic.shape

((16225, 2), (143288, 2))

In [9]:
# Release the full frame; the cells below work only with all_toxic / non_toxic.
del train_file

In [10]:
# Binary targets: one 0 per non_toxic row and one 1 per all_toxic row.
non_toxic_labels = [0] * non_toxic.shape[0]
toxic_labels = [1] * all_toxic.shape[0]

In [11]:
# Overwrite the string label column with the binary target (0 = non-toxic, 1 = toxic).
non_toxic['label'] = non_toxic_labels
all_toxic['label'] = toxic_labels

In [12]:
# Preview the first four rows of each subset to confirm the relabelling.
non_toxic.head(4), all_toxic.head(4)

(                                        comment_text  label
 0  explanation edits made username hardcore metal...      0
 1  aww matches background colour seemingly stuck ...      0
 2  hey man really trying edit war guy constantly ...      0
 3  make real suggestions improvement wondered sec...      0,
                                         comment_text  label
 0                        cocksucker piss around work      1
 1  hey talk exclusive group wp talibans good dest...      1
 2            bye look come think comming back tosser      1
 3  gay antisemmitian archangel white tiger meow g...      1)

In [13]:
# Stack both subsets (non-toxic rows first) into one frame with a fresh 0..n-1 index.
whole_frame = pd.concat([non_toxic, all_toxic], ignore_index=True)

In [14]:
# Hold out 10% of the data (stratified on label), then split that hold-out
# again with the same settings, so validation gets 90% of it and test 10%.
# NOTE(review): the test set therefore ends up at roughly 1% of all rows
# (1,596 in the recorded output) -- confirm this is the intended size.
split_options = dict(test_size=0.1, random_state=42, shuffle=True)
train, non_train = train_test_split(whole_frame, stratify=whole_frame['label'], **split_options)
val, test = train_test_split(non_train, stratify=non_train['label'], **split_options)
train.shape, val.shape, test.shape

((143561, 2), (14356, 2), (1596, 2))

In [15]:
# The per-class frames are not needed once the splits exist.
del non_toxic, all_toxic

# Separate the text (x_*) and the target (y_*) of every split, re-indexing
# each Series from 0.
x_train, y_train = (train[column].reset_index(drop=True) for column in ('comment_text', 'label'))
x_val, y_val = (val[column].reset_index(drop=True) for column in ('comment_text', 'label'))
x_test, y_test = (test[column].reset_index(drop=True) for column in ('comment_text', 'label'))

# Drop the split frames now that their columns have been extracted.
del train, val, test

In [16]:
# Print the names of the pretrained models listed by gensim's downloader.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


#### Download the desired model

In [17]:
# NOTE(review): despite the variable name, this loads the
# 'word2vec-google-news-300' vectors, not a fastText model (the fastText entry
# in the list printed above is 'fasttext-wiki-news-subwords-300').
# The name is kept because later cells refer to it.
fasttext_vectors = gensim.downloader.load('word2vec-google-news-300')

#### Split the sentence and look up the embedding of every word; out-of-vocabulary (OOV) words are simply skipped.
#### Finally, take the average of the embeddings of the remaining words.

In [18]:
def sentence_vectors(datapoint, vector_type, vectors=None):
    """Embed a space-separated sentence using pretrained word vectors.

    Parameters
    ----------
    datapoint : str
        Sentence to embed; it is tokenised with ``split(' ')``.
    vector_type : str
        ``'sent'`` returns the average of the word vectors; any other value
        returns the individual word vectors plus the token count.
    vectors : mapping, optional
        Word -> vector lookup that raises ``KeyError`` for unknown words.
        Defaults to the notebook-level ``fasttext_vectors``.

    Returns
    -------
    numpy.ndarray or numpy.float64
        For ``'sent'``: the element-wise mean of the in-vocabulary word
        vectors, or the 0-d scalar ``np.float64(0)`` when no word is in the
        vocabulary.
    tuple
        Otherwise: ``(word_vectors, len(words))`` where ``word_vectors`` is a
        2-D array of the in-vocabulary vectors and ``len(words)`` counts every
        token (including skipped ones), or ``(0, 0)`` when no word is in the
        vocabulary.
    """
    if vectors is None:
        vectors = fasttext_vectors

    words = datapoint.split(' ')
    word_vectors = []
    for each in words:
        try:
            word_vectors.append(vectors[each])
        except KeyError:
            # Out-of-vocabulary word: skip it.
            continue

    # Nothing left after skipping OOV words. The previous check compared an
    # int with a list (`shape[0] == []`), which is never true, so np.mean ran
    # on an empty array and produced NaN plus a RuntimeWarning.
    if not word_vectors:
        if vector_type == 'sent':
            # A NumPy scalar (shape == ()) rather than a plain int, so callers
            # that inspect `.shape` to detect empty sentences keep working.
            return np.float64(0)
        return 0, 0

    word_vectors = np.array(word_vectors)
    if vector_type == 'sent':
        # Average over the word axis -> one vector per sentence.
        return np.mean(word_vectors, axis = 0)
    return word_vectors, len(words)

In [19]:
# Embed every comment of each split as one averaged sentence vector.
train_sentence_vector, val_sentence_vector, test_sentence_vector = (
    texts.apply(sentence_vectors, vector_type='sent')
    for texts in (x_train, x_val, x_test)
)

  return _methods._mean(a, axis=axis, dtype=dtype,


In [20]:
# Each embedded Series should have the same length as its target Series.
train_sentence_vector.shape, y_train.shape, val_sentence_vector.shape, y_val.shape, test_sentence_vector.shape, y_test.shape


((143561,), (143561,), (14356,), (14356,), (1596,), (1596,))

#### `sentence_vectors` returns a scalar instead of a 300-dimensional vector for sentences with no in-vocabulary words.
#### Simply drop those inputs together with their labels.

In [21]:
 def get_index_zerovectors(train_sentence_vector):
     """Return the positions of entries that are scalars rather than vectors.

     Parameters
     ----------
     train_sentence_vector : iterable
         Sentence embeddings; an entry is either an array or a scalar
         placeholder for a sentence that produced no embedding.

     Returns
     -------
     list of int
         Positions (in iteration order) of the 0-dimensional entries.
     """
     # np.ndim works for NumPy scalars and for plain Python numbers alike,
     # whereas `vector.shape` raises AttributeError on a plain int such as 0.
     return [
         index
         for index, vector in enumerate(train_sentence_vector)
         if np.ndim(vector) == 0
     ]

In [22]:
# Positions of the scalar placeholders in each split.
train_remove_indices, val_remove_indices, test_remove_indices = map(
    get_index_zerovectors,
    (train_sentence_vector, val_sentence_vector, test_sentence_vector),
)

In [23]:
def _drop_positions(values, positions):
    """Return ``values`` as a list without the elements at ``positions``."""
    # Set membership is O(1); testing against the original list rescanned it
    # for every element of ``values``.
    skip = set(positions)
    return [value for index, value in enumerate(values) if index not in skip]


# Remove the flagged positions from the embeddings and from the matching
# targets so that inputs and labels stay aligned.
# NOTE: this cell overwrites its own inputs, so re-running it without first
# re-running the cells above would drop further (valid) rows.
train_sentence_vector = _drop_positions(train_sentence_vector, train_remove_indices)
y_train = _drop_positions(y_train, train_remove_indices)

val_sentence_vector = _drop_positions(val_sentence_vector, val_remove_indices)
y_val = _drop_positions(y_val, val_remove_indices)

test_sentence_vector = _drop_positions(test_sentence_vector, test_remove_indices)
y_test = _drop_positions(y_test, test_remove_indices)

In [24]:
# Turn the filtered Python lists into NumPy arrays, one (features, targets)
# pair per split.
train_sentence_vector, y_train = np.array(train_sentence_vector), np.array(y_train)
val_sentence_vector, y_val = np.array(val_sentence_vector), np.array(y_val)
test_sentence_vector, y_test = np.array(test_sentence_vector), np.array(y_test)

# Feature and target arrays of a split must agree on their first dimension.
train_sentence_vector.shape, y_train.shape, val_sentence_vector.shape, y_val.shape, test_sentence_vector.shape, y_test.shape


((143453, 300), (143453,), (14349, 300), (14349,), (1594, 300), (1594,))

In [26]:
# Depth-limited random forest trained on the averaged sentence embeddings.
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(train_sentence_vector, y_train)

# Mean accuracy on the validation and test splits.
val_accuracy = clf.score(val_sentence_vector, y_val)
test_accuracy = clf.score(test_sentence_vector, y_test)
print(val_accuracy, test_accuracy)

# Validation-set confusion matrix (rows = true class, columns = predicted class).
preds = clf.predict(val_sentence_vector)
conf_mat = confusion_matrix(y_val, preds)
print('Confusion matrix')
print(conf_mat)

# NOTE(review): with average='weighted' the recall equals the plain accuracy
# printed above, so these numbers are dominated by the majority class. The
# recorded confusion matrix shows only 283 of 1,461 toxic validation comments
# being detected -- consider reporting per-class metrics as well.
weighted_options = dict(average='weighted', zero_division=1)
for metric_name, metric_fn in (
    ('f1 score', f1_score),
    ('recall', recall_score),
    ('precision', precision_score),
):
    print(metric_name, metric_fn(y_val, preds, **weighted_options))


0.9167886263851139 0.9209535759096612
Confusion matrix
[[12872    16]
 [ 1178   283]]
f1 score 0.8911141239166358
recall 0.9167886263851139
precision 0.9192449263234633
