In [25]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import math

import string
from collections import Counter
from sklearn import preprocessing
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix

import collections
import operator
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from sklearn.preprocessing import LabelBinarizer

import gensim
import gensim.downloader
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read clean.csv (resolved relative to the notebook's working directory) into a DataFrame.
train_file = pd.read_csv('clean.csv')

In [3]:
# Display the column index and the (rows, columns) shape of the loaded frame.
train_file.columns, train_file.shape

(Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate', 'label'],
       dtype='object'),
 (159513, 8))

In [4]:
# Every label string must be this wide; shorter ones are left-padded with '0'.
max_label_length = 6

# str.rjust returns the string unchanged when it is already max_label_length
# characters (or longer), so one call covers both the padded and the unpadded case.
new_label = [
    str(raw_label).rjust(max_label_length, '0')
    for raw_label in train_file['label']
]

In [5]:
# Overwrite the label column with the zero-padded strings collected in new_label.
train_file['label'] = new_label

In [6]:
# Drop the six per-flag columns; the remaining columns are comment_text and label.
flag_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_file = train_file.drop(columns=flag_columns)

In [7]:
# Count the rows carrying each distinct label string; the result is displayed below.
value_counts_label = train_file['label'].value_counts()
value_counts_label

000000    143288
100000      5666
101010      3800
101000      1758
100010      1215
111010       989
101011       618
001000       317
000010       301
111011       265
001010       181
111000       158
100001       136
100011       134
101110       131
100100       113
111110        64
101111        56
000001        54
110000        41
101001        35
111111        31
000011        28
000100        22
001011        18
100110        16
110010        14
101100        11
110100        11
100101         7
110011         7
111001         6
111100         4
000110         3
100111         3
110001         3
001001         3
001100         2
001110         2
110110         1
110101         1
Name: label, dtype: int64

#### Some classes contain only a single data point. Discard them, because a stratified train_test_split requires at least 2 data points per class.

In [8]:
# A stratified train_test_split needs at least two rows per class, so every label
# that occurs only once is removed before splitting.
# The labels are derived from the data instead of being hard-coded: the cell stays
# correct if clean.csv changes, and re-running it is harmless (a second run finds none).
label_counts = train_file['label'].value_counts()
labels_one_dp = label_counts[label_counts < 2].index.tolist()

# One boolean filter replaces the per-label copy-pasted drops; like DataFrame.drop,
# it leaves the surviving rows' index values untouched.
train_file = train_file[~train_file['label'].isin(labels_one_dp)]
train_file.shape

(159511, 2)

In [9]:
# Hold out 10% of the rows for validation, stratified on the label column and
# seeded so the split is reproducible.
train, val = train_test_split(
    train_file,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=train_file['label'],
)


In [10]:


# Pull the text and label columns out of each split and renumber the rows from 0
# (the split frames still carry the index values of train_file).
x_train, y_train = (train[col].reset_index(drop=True) for col in ('comment_text', 'label'))
x_val, y_val = (val[col].reset_index(drop=True) for col in ('comment_text', 'label'))

# The intermediate split frames are not used again.
del train, val


In [11]:
# Print the names under the 'models' key of gensim's downloader catalogue.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


#### download the desired model

In [12]:
# NOTE: despite the variable name, the model loaded here is 'word2vec-google-news-300',
# not a fastText model. The name is kept because sentence_vectors reads this global.
fasttext_vectors = gensim.downloader.load('word2vec-google-news-300')

#### Split the sentence and look up the embedding of every word; if a word is out of vocabulary (OOV), simply skip it.
#### Finally, take the average of the embeddings of all remaining words.

In [13]:
def sentence_vectors(datapoint, vector_type, embeddings=None):
    """Embed one sentence using pretrained word vectors.

    Parameters
    ----------
    datapoint : str
        Sentence to embed. It is split on single space characters.
    vector_type : str
        'sent' returns the average of the word vectors; any other value
        returns the individual word vectors together with the token count.
    embeddings : mapping, optional
        Word -> vector lookup that raises KeyError for unknown words.
        Defaults to the notebook-level ``fasttext_vectors``.

    Returns
    -------
    vector_type == 'sent'
        The element-wise mean of the in-vocabulary word vectors, or a 0-d
        ``np.float64(0)`` when no word of the sentence is in the vocabulary.
    otherwise
        ``(word_vectors, len(words))`` where ``word_vectors`` is the array of
        in-vocabulary word vectors and ``len(words)`` counts every token
        (OOV ones included), or ``(0, 0)`` when no word is in the vocabulary.
    """
    if embeddings is None:
        # Fall back to the vectors loaded earlier in the notebook.
        embeddings = fasttext_vectors

    words = datapoint.split(' ')
    word_vectors = []
    for each in words:
        try:
            word_vectors.append(embeddings[each])
        except KeyError:
            # OOV word: skip it.
            continue

    # Test the list itself for emptiness. Comparing `shape[0]` with `[]` can never
    # be true (an int never equals a list), which would let the empty case fall
    # through to np.mean of an empty array (NaN plus a RuntimeWarning).
    if not word_vectors:
        if vector_type == 'sent':
            # A 0-d numpy scalar keeps `.shape == ()`, which is how callers
            # recognise a sentence that could not be embedded.
            return np.float64(0)
        return 0, 0

    word_vectors = np.array(word_vectors)
    if vector_type == 'sent':
        return np.mean(word_vectors, axis=0)
    return word_vectors, len(words)

In [15]:
# Run sentence_vectors on every comment with vector_type='sent' (the averaging branch).
train_sentence_vector = x_train.apply(sentence_vectors, vector_type = 'sent')
val_sentence_vector = x_val.apply(sentence_vectors, vector_type = 'sent')


In [16]:
# Display the shapes of the embedded inputs and their labels for both splits.
train_sentence_vector.shape, y_train.shape, val_sentence_vector.shape, y_val.shape


((143559,), (143559,), (15952,), (15952,))

#### sentence_vectors returns a scalar instead of a vector for sentences in which no word has an embedding.
#### Simply delete those inputs and their corresponding outputs.

In [17]:
 def get_index_zerovectors(train_sentence_vector):
     """Return the positions of entries that are scalars rather than vectors.

     Parameters
     ----------
     train_sentence_vector : iterable
         Sequence whose items are either arrays or scalars.

     Returns
     -------
     list of int
         Zero-based positions of the zero-dimensional items, in ascending order.
     """
     # np.ndim also accepts plain Python numbers (which have no `.shape`
     # attribute), so a bare 0 is detected the same way as a 0-d numpy scalar.
     return [
         index
         for index, vector in enumerate(train_sentence_vector)
         if np.ndim(vector) == 0
     ]

In [18]:
# Positions (per split) of the entries that get_index_zerovectors flags for removal.
train_remove_indices = get_index_zerovectors(train_sentence_vector)
val_remove_indices = get_index_zerovectors(val_sentence_vector)


In [19]:
# Membership tests against a list are linear in its length, which makes each filter
# below quadratic; converting the indices to sets keeps every filter a single pass.
train_drop = set(train_remove_indices)
val_drop = set(val_remove_indices)

# CAUTION: these lines rebind the inputs they read. Re-running this cell without first
# re-running the cells that build the vectors and indices would drop the wrong rows.
train_sentence_vector = [vec for pos, vec in enumerate(train_sentence_vector) if pos not in train_drop]
y_train = [label for pos, label in enumerate(y_train) if pos not in train_drop]

val_sentence_vector = [vec for pos, vec in enumerate(val_sentence_vector) if pos not in val_drop]
y_val = [label for pos, label in enumerate(y_val) if pos not in val_drop]


In [20]:
# Convert the filtered Python lists into numpy arrays, one split at a time.
train_sentence_vector, y_train = np.array(train_sentence_vector), np.array(y_train)
val_sentence_vector, y_val = np.array(val_sentence_vector), np.array(y_val)

# Displayed as the cell output: inputs and labels of a split must have equal length.
train_sentence_vector.shape, y_train.shape, val_sentence_vector.shape, y_val.shape


((143454, 300), (143454,), (15940, 300), (15940,))

#### Build one-hot encoded labels

In [21]:
# Fit the binarizer on the training labels only, then encode both splits with it.
# fit() returns the binarizer itself, so construction and fitting can be chained.
lb = LabelBinarizer().fit(y_train)
Y_train, Y_val = lb.transform(y_train), lb.transform(y_val)


In [26]:
# Display names and estimators live in two parallel lists: position i of
# models_name labels position i of models.
models_name = ['random forest', 'SVM - one vs rest']
models = [
    RandomForestClassifier(random_state=0, n_estimators=60, max_depth=3),
    OneVsRestClassifier(LinearSVC(random_state=0)),
]

In [28]:
# Fit every model on the training embeddings and report validation metrics.
trained_model = []

# zip pairs each display name with its estimator directly, instead of indexing
# one parallel list with the position in the other.
for name, model in zip(models_name, models):
    print(name)

    # The estimators are trained on the binarized labels (Y_train).
    fitted = model.fit(train_sentence_vector, Y_train)
    trained_model.append(fitted)

    # Predictions come back in the binarized encoding; map them to label strings
    # so they can be compared with y_val.
    preds = lb.inverse_transform(fitted.predict(val_sentence_vector))
    conf_mat = confusion_matrix(y_val, preds)

    print(conf_mat)
    # zero_division=1 scores a metric as 1 wherever its denominator is zero.
    print('f1 score', f1_score(y_val, preds, average='weighted', zero_division=1))
    print('recall', recall_score(y_val, preds, average='weighted', zero_division=1))
    print('precision', precision_score(y_val, preds, average='weighted', zero_division=1))
    print('accuracy', accuracy_score(y_val, preds))
    print('********************************')
    # Optional visual check: sns.heatmap(conf_mat, annot=True) on a
    # plt.subplots(figsize=(15, 15)) figure, with 'Actual' on y and 'Predicted' on x.

random forest
f1 score 0.8503644433122343
recall 0.8984316185696362
precision 0.9087477546760201
accuracy 0.8984316185696362
********************************
SVM - one vs rest
f1 score 0.8549527851185107
recall 0.8999372647427855
precision 0.8520606198146976
accuracy 0.8999372647427855
********************************
