In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data_raw = pd.read_csv('dataset1_comments.csv')
#data_raw = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data = data_raw
data = data_raw.loc[np.random.choice(data_raw.index, size=2000)]
data.shape
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [3]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext


def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned


def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [4]:
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
def removeStopWords(sentence):
    global re_stop_words
    return re_stop_words.sub(" ", sentence)

data['comment_text'] = data['comment_text'].apply(removeStopWords)
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

data['comment_text'] = data['comment_text'].apply(stemming)

In [27]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data,test_size=0.30, shuffle=True)

train_text = train['comment_text']
test_text = test['comment_text']


from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2',max_features=5000)
vectorizer.fit(train_text)
vectorizer.fit(test_text)

x_train = vectorizer.transform(train_text).toarray()
y_train = train.drop(labels = ['id','comment_text'], axis=1).values

x_test = vectorizer.transform(test_text).toarray()
y_test = test.drop(labels = ['id','comment_text'], axis=1).values

In [28]:
print(x_train.shape)
print(y_train.shape)
print(y_test.shape)

(1400, 5000)
(1400, 6)
(600, 6)


## Accuracy Function

In [29]:
def hamming_accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    http://stackoverflow.com/q/32239577/395857
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        #print('\nset_true: {0}'.format(set_true))
        #print('set_pred: {0}'.format(set_pred))
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float( len(set_true.union(set_pred)) )
        #print('tmp_a: {0}'.format(tmp_a))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

## MODEL 1 : Chain Classifiers of LR

In [30]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.svm import SVC
from sklearn.metrics import hamming_loss,log_loss,f1_score,accuracy_score

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(SVC())

# Training logistic regression model on train data
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

In [31]:
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions.toarray()))
print("Hamming accuracy = ",hamming_accuracy_score(y_test,predictions.toarray()))
print("Hamming loss = ",hamming_loss(y_test,predictions))
print("Log loss = ",log_loss(y_test,predictions.toarray()))
print("F1 score = ",f1_score(y_test,predictions.toarray(),average='macro'))

Accuracy =  0.888333333333
Hamming accuracy =  0.888333333333
Hamming loss =  0.03777777777777778
Log loss =  0.406132146358
F1 score =  0.0


## MODEL 2 : Multi-lable Lazy Learning

In [32]:
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions = classifier_new.predict(x_test)

In [33]:
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions.toarray()))
print("Hamming accuracy = ",hamming_accuracy_score(y_test,predictions.toarray()))
print("Hamming loss = ",hamming_loss(y_test,predictions))
print("Log loss = ",log_loss(y_test,predictions.toarray()))
print("F1 score = ",f1_score(y_test,predictions.toarray(),average='macro'))

Accuracy =  0.875
Hamming accuracy =  0.894361111111
Hamming loss =  0.03222222222222222
Log loss =  1.16305492832
F1 score =  0.276626909395


## MODEL 3 : Classic ANN

In [34]:
import os
os.environ['CUDA_VISIBLE_DIVICES'] = '-1'
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.backend.tensorflow_backend import clear_session
clear_session()
model = Sequential()
model.add(Dense(512,activation='relu',input_shape=(x_train.shape[1],),kernel_initializer='glorot_uniform'))
model.add(Dropout(0.4))
model.add(Dense(256,activation='relu',kernel_initializer='glorot_uniform'))
model.add(Dropout(0.4))
model.add(Dense(y_train.shape[1],activation='sigmoid',kernel_initializer='glorot_uniform'))
model.compile(loss='binary_crossentropy', optimizer='adagrad')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               2560512   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 1542      
Total params: 2,693,382
Trainable params: 2,693,382
Non-trainable params: 0
_________________________________________________________________


In [35]:
model.fit(x=x_train,y=y_train,epochs=5,batch_size=50)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1a81a797128>

In [36]:
#Predictions
predictions = model.predict(x_test)
predictions = np.array([list(np.round(x).astype(int)) for x in predictions])
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("Hamming accuracy = ",hamming_accuracy_score(y_test,predictions))
print("Hamming loss = ",hamming_loss(y_test,predictions))
print("Log loss = ",log_loss(y_test,predictions))
print("F1 score = ",f1_score(y_test,predictions,average='macro'))

Accuracy =  0.88
Hamming accuracy =  0.894472222222
Hamming loss =  0.02861111111111111
Log loss =  1.40591035821
F1 score =  0.279526940183
