In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data_raw = pd.read_csv('dataset1_comments.csv')
#data_raw = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data.shape
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [4]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [5]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['comment_text'] = data['comment_text'].apply(removeStopWords)
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['comment_text'] = data['comment_text'].apply(stemming)

In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data,test_size=0.30, shuffle=True)

train_text = train['comment_text']
test_text = test['comment_text']


from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2',max_features=5000)
vectorizer.fit(train_text)
vectorizer.fit(test_text)

X_train = vectorizer.transform(train_text).toarray()
y_train = train.drop(labels = ['id','comment_text'], axis=1).values

X_test = vectorizer.transform(test_text).toarray()
y_test = test.drop(labels = ['id','comment_text'], axis=1).values

In [9]:
print("Train_X: ",X_train.shape)
print("Train_Y: ",y_train.shape)
print("Test_X: ",X_test.shape)
print("Test_Y: ",y_test.shape)

Train_X:  (1400, 5000)
Train_Y:  (1400, 6)
Test_X:  (600, 5000)
Test_Y:  (600, 6)


In [10]:
def hamming_accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

## Bidirectional Model


In [11]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.backend.tensorflow_backend import clear_session
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [12]:
ENS_COUNT = 4
clear_session()

In [13]:
def create_c1node(X_feed,y_now):
    '''
    C1 node Architecture:
    attribute:512:256:1 [saperate for each class]
    loss: Binary crossentropy
    '''
    model = Sequential()
    model.add(Dense(512,activation='relu',input_shape=(X_feed.shape[1],),kernel_initializer='glorot_uniform'))
    model.add(Dropout(0.4))
    model.add(Dense(256,activation='relu',kernel_initializer='glorot_uniform'))
    model.add(Dropout(0.4))
    model.add(Dense(y_now.shape[1],activation='sigmoid',kernel_initializer='glorot_uniform'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    
    return model

In [14]:
ensembles_fwd = []
for j in range(ENS_COUNT//2):
    print("TRAINING ENSEMBLE {} :\n\n\n".format(j))
    #Model chain
    chain_forward = []

    #Training
    x_t,_,y_t,_ = train_test_split(X_train,y_train,test_size=0.4)
    X_feed = x_t.copy()
    for i in range(y_t.shape[1]):
        print("Training chain node ",i)
        y_now = y_t[:,[i,]].copy()
        print("Shapes:\n X = {} \n Y = {}".format(X_feed.shape,y_now.shape))

        node = create_c1node(X_feed,y_now)
        node.fit(X_feed,y_now,epochs=5,batch_size=50)
        print("Training of node {} complete\n\n".format(i))  
        #Checking accuracy of the node
        #---
        #Adding node to chain 
        chain_forward.append(node)
        #Updating X_feed
        X_feed = np.append(X_feed,y_now,axis=1)
    ensembles_fwd.append(chain_forward)

TRAINING ENSEMBLE 0 :



Training chain node  0
Shapes:
 X = (840, 5000) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training of node 0 complete


Training chain node  1
Shapes:
 X = (840, 5001) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training of node 1 complete


Training chain node  2
Shapes:
 X = (840, 5002) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training of node 2 complete


Training chain node  3
Shapes:
 X = (840, 5003) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training of node 3 complete


Training chain node  4
Shapes:
 X = (840, 5004) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training of node 4 complete


Training chain node  5
Shapes:
 X = (840, 5005) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training of node 5 complete


TRAINING ENSEMBLE 1 :



Training chain node  0
Shapes:
 X = (840, 5000) 
 Y = (840, 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [15]:
ensembles_rev = []
for j in range(ENS_COUNT//2):
    print("TRAINING ENSEMBLE {} :\n\n\n".format(j))
    #Model chain
    chain_reverse = []
    #Training
    x_t,_,y_t,_ = train_test_split(X_train,y_train,test_size=0.4)
    X_feed = x_t.copy()
    for i in range(1,y_t.shape[1]+1):
        print("Training chain node ",i)
        y_now = y_t[:,[-i,]].copy()
        print("Shapes:\n X = {} \n Y = {}".format(X_feed.shape,y_now.shape))

        node = create_c1node(X_feed,y_now)
        node.fit(X_feed,y_now,epochs=10,batch_size=50)
        print("Training of node {} complete\n\n".format(i))  
        #Checking accuracy of the node
        #---
        #Adding node to chain 
        chain_reverse.append(node)
        #Updating X_feed
        X_feed = np.append(X_feed,y_now,axis=1)
    ensembles_rev.append(chain_reverse)

TRAINING ENSEMBLE 0 :



Training chain node  1
Shapes:
 X = (840, 5000) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 1 complete


Training chain node  2
Shapes:
 X = (840, 5001) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 2 complete


Training chain node  3
Shapes:
 X = (840, 5002) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 3 complete


Training chain node  4
Shapes:
 X = (840, 5003) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 4 complete


Training chain node  5
Shapes:
 X = (840, 5004) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/

Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 2 complete


Training chain node  3
Shapes:
 X = (840, 5002) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 3 complete


Training chain node  4
Shapes:
 X = (840, 5003) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 4 complete


Training chain node  5
Shapes:
 X = (840, 5004) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 5 complete


Training chain node  6
Shapes:
 X = (840, 5005) 
 Y = (840, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training of node 6 complete




In [16]:
y_full_fw = []
y_full_re = []

for k in range(ENS_COUNT//2):
    print("\n\nENSEMBLE {}:\n\n".format(k+1))
    X_feed_fw = X_test.copy()
    X_feed_re = X_test.copy()
    j = 0
    #Model chain
    y_pred_fw = []
    y_pred_re = []
    for i in range(len(chain_forward)):
        node1 = ensembles_fwd[k][i]
        node2 = ensembles_rev[k][i]
        print("Getting op from chain node ",j)
        node_name = "Node" + str(i)
        print("Shapes:\n X = {}".format(X_feed_fw.shape))
        output_fw = node1.predict(X_feed_fw)
        output_fw_bin = output_fw.round().astype(int)
        output_rev = node2.predict(X_feed_re)
        output_rev_bin = output_rev.round().astype(int)
        y_pred_fw.append(output_fw)
        y_pred_re.append(output_rev)
        X_feed_fw = np.append(X_feed_fw,output_fw_bin,axis=1)
        X_feed_re = np.append(X_feed_re,output_rev_bin,axis=1)
        j+=1
    y_full_fw.append(y_pred_fw)
    y_full_re.append(y_pred_re)
        #Updating X_feed
            



ENSEMBLE 1:


Getting op from chain node  0
Shapes:
 X = (600, 5000)
Getting op from chain node  1
Shapes:
 X = (600, 5001)
Getting op from chain node  2
Shapes:
 X = (600, 5002)
Getting op from chain node  3
Shapes:
 X = (600, 5003)
Getting op from chain node  4
Shapes:
 X = (600, 5004)
Getting op from chain node  5
Shapes:
 X = (600, 5005)


ENSEMBLE 2:


Getting op from chain node  0
Shapes:
 X = (600, 5000)
Getting op from chain node  1
Shapes:
 X = (600, 5001)
Getting op from chain node  2
Shapes:
 X = (600, 5002)
Getting op from chain node  3
Shapes:
 X = (600, 5003)
Getting op from chain node  4
Shapes:
 X = (600, 5004)
Getting op from chain node  5
Shapes:
 X = (600, 5005)


In [17]:
cross_chain_group = []
for j in range(ENS_COUNT//2):
    prediction = []
    y_pred_fw = y_full_fw[j].copy()
    y_pred_re = y_full_re[j].copy()
    fw = np.array(y_pred_fw)[:,:,0].transpose().copy()
    re = np.array(y_pred_re)[:,:,0].transpose().copy()
    for i in range(np.array(y_pred_fw)[:,:,0].transpose().shape[0]):
        fw_i = fw[i,:].reshape(1,-1)[0]
        re_i = re[i,:].reshape(1,-1)[0][::-1] 
        final = (fw_i+re_i)/2
        prediction.append(final)
    cross_chain_group.append(np.array(prediction))

In [18]:
#Adding all 5 results
total = None
for x in cross_chain_group:
    if total is None:
        total = x
    else:
        total = total + x
#Calculating the average
predictions = (total/(ENS_COUNT//2)).round().astype(int)

In [19]:
from sklearn.metrics import accuracy_score,hamming_loss, log_loss, f1_score

In [20]:
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("Hamming accuracy = ",hamming_accuracy_score(y_test,predictions))
print("Hamming loss = ",hamming_loss(y_test,predictions))
print("Log loss = ",log_loss(y_test,predictions))
print("F1 score = ",f1_score(y_test,predictions,average='macro'))

Accuracy =  0.906666666667
Hamming accuracy =  0.912611111111
Hamming loss =  0.03333333333333333
Log loss =  1.6733330222
F1 score =  0.0731120731121
