In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelBinarizer

from sklearn.metrics import confusion_matrix

In [2]:
train_file = pd.read_csv('clean.csv')

In [3]:
train_file.columns, train_file.shape

(Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate', 'label'],
       dtype='object'),
 (159513, 8))

In [4]:
max_label_length = 6
new_label = []
for each in train_file['label']:
    temp = str(each)
    if len(temp) != max_label_length:
        zeros = max_label_length - len(str(each))
        res = temp.rjust(zeros + len(temp), '0')
        new_label.append(res)
    else:
        new_label.append(temp)    

In [5]:
train_file['label'] = new_label

In [6]:
train_file = train_file.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis = 1)

In [7]:
value_counts_label = train_file['label'].value_counts()
value_counts_label

000000    143288
100000      5666
101010      3800
101000      1758
100010      1215
111010       989
101011       618
001000       317
000010       301
111011       265
001010       181
111000       158
100001       136
100011       134
101110       131
100100       113
111110        64
101111        56
000001        54
110000        41
101001        35
111111        31
000011        28
000100        22
001011        18
100110        16
110010        14
101100        11
110100        11
100101         7
110011         7
111001         6
111100         4
000110         3
100111         3
110001         3
001001         3
001100         2
001110         2
110110         1
110101         1
Name: label, dtype: int64

In [8]:
labels_one_dp = ['110110', '110101']
inds = train_file[ (train_file['label'] == '110110') ].index
train_file.drop(inds , inplace=True)
train_file.shape

(159512, 2)

In [9]:
inds = train_file[ (train_file['label'] == '110101') ].index
train_file.drop(inds , inplace=True)
train_file.shape

(159511, 2)

In [10]:
train, val = train_test_split(train_file, test_size=0.1, random_state=42, stratify = train_file['label'], shuffle = True)


In [11]:


x_train = train['comment_text'].reset_index(drop = True)
y_train = train['label'].reset_index(drop = True)

x_val = val['comment_text'].reset_index(drop = True)
y_val = val['label'].reset_index(drop = True)


del train
del val


#### Corpus will be x_train
#### the output shape will be (number of data points, features)
#### features - representing the tf-idf score for different unigrams and bigrams.

In [12]:
tf_idf = TfidfVectorizer()
tf_idf.fit(x_train)
X_train = tf_idf.transform(x_train)
X_val = tf_idf.transform(x_val)
X_train.shape, X_val.shape

((143559, 158878), (15952, 158878))

In [13]:
lb = LabelBinarizer()
lb.fit(y_train)
Y_train = lb.transform(y_train)
Y_val = lb.transform(y_val)

In [14]:
models = [
    RandomForestClassifier(n_estimators=60, max_depth=3, random_state=0),
    OneVsRestClassifier(LinearSVC(random_state=0))
]
models_name = ['random forest', 'SVM - one vs rest']

In [15]:
trained_model = []
for index, model in enumerate(models):
    print(models_name[index])
    m1 = model.fit(X_train, Y_train)
    trained_model.append(m1)
    preds = m1.predict(X_val)
    preds = lb.inverse_transform(preds)
    conf_mat = confusion_matrix(y_val, preds)
    
    print('f1 score',f1_score(y_val, preds, average='weighted', zero_division = 1))
    print('recall',recall_score(y_val, preds, average='weighted', zero_division = 1))
    print('precision',precision_score(y_val, preds, average='weighted', zero_division =1))
    print('accuracy',accuracy_score(y_val, preds))
    print('********************************')
    '''
    fig, ax = plt.subplots(figsize=(15,15))
    sns.heatmap(conf_mat, annot=True)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    '''

random forest
f1 score 0.8502030926012301
recall 0.8983199598796389
precision 0.908658790438517
accuracy 0.8983199598796389
********************************
SVM - one vs rest
f1 score 0.8690322637398731
recall 0.9044633901705116
precision 0.8652460371854777
accuracy 0.9044633901705116
********************************
