In [1]:
from toxic.model import get_model
from toxic.nltk_utils import tokenize_sentences
from toxic.train_utils import train_folds
from toxic.embedding_utils import read_embedding_list, clear_embedding_list, convert_tokens_to_ids

import argparse
import numpy as np
import os
import pandas as pd

#viz
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns

from keras.layers import Dense, Embedding, Input, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate, Reshape
from keras.layers import Bidirectional, Dropout, CuDNNGRU, GRU
from keras.models import Model
from keras.optimizers import RMSprop


Using TensorFlow backend.


In [2]:
# Placeholder tokens: UNKNOWN_WORD and END_WORD are later given their own
# entries in the embedding dictionary; NAN_WORD replaces missing comment text.
UNKNOWN_WORD, END_WORD, NAN_WORD = "_UNK_", "_END_", "_NAN_"

# Label columns read from the training frame and written to the submission.
CLASSES = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Exponent applied to the blended fold predictions before they are saved.
PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4

In [3]:
# Input locations. Both directories default to the original local layout and
# can be redirected without editing the notebook by setting the
# TOXIC_DATA_DIR / TOXIC_EMBEDDING_DIR environment variables.
DATA_DIR = os.environ.get(
    "TOXIC_DATA_DIR",
    "/Users/sdivakarla/bbanalytics-telemetry-research/satish/SentimentAnalysis/data/WikiToxicity/KaggleToxicDataset",
)
EMBEDDING_DIR = os.environ.get(
    "TOXIC_EMBEDDING_DIR",
    "/Users/sdivakarla/bbanalytics-telemetry-research/satish/SentimentAnalysis/model/crawl",
)
train_file_path = os.path.join(DATA_DIR, "train.csv")
test_file_path = os.path.join(DATA_DIR, "test.csv")
embedding_path = os.path.join(EMBEDDING_DIR, "crawl-300d-2M.vec")

print("Loading data...")
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Comment text as NumPy arrays, with missing comments replaced by NAN_WORD.
list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
# Target matrix: one column per entry of CLASSES.
y_train = train_data[CLASSES].values

Loading data...


In [4]:
# Mark comments that carry none of the CLASSES labels as "clean".
# The label columns are selected by name (the same way y_train is built)
# rather than by position, so the totals do not depend on column order.
rowsums = train_data[CLASSES].sum(axis=1)
train_data['clean'] = (rowsums == 0)

print("Total comments = ",len(train_data))
print("Total clean comments = ",train_data['clean'].sum())
print("Total tags =",rowsums.sum())


Total comments =  159571
Total clean comments =  143346
Total tags = 35098


In [None]:
from langdetect import detect
def detect_language(row):
    """Return the language code that ``detect`` reports for ``row``.

    Falls back to ``"en"`` when detection raises an ordinary error, so a
    single problematic comment does not abort a whole ``Series.apply`` pass.
    """
    try:
        return detect(row)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must propagate so a long-running detection pass can be interrupted.
        return "en"

In [None]:
# Tag every training comment with its detected language code.
train_comment_languages = train_data['comment_text'].apply(detect_language)
train_data['language'] = train_comment_languages

In [None]:
# Tag every test comment with its detected language code.
test_comment_languages = test_data['comment_text'].apply(detect_language)
test_data['language'] = test_comment_languages

In [None]:
from tools.extend_dataset import translate

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

In [None]:
# Distinct language codes found in the training comments.
train_data['language'].unique()

In [None]:
from joblib import Parallel, delayed
from textblob import TextBlob
from textblob.translate import NotTranslated

import argparse
import os
import numpy as np

# Same value as the NAN_WORD assigned earlier in this notebook; nothing in
# this cell reads it.
NAN_WORD = "_NAN_"


def translate_to_english(comment, language):
    """Return ``comment`` as English text.

    Args:
        comment: The comment text. Objects exposing ``decode`` (e.g.
            ``bytes``) are decoded as UTF-8 first.
        language: Source-language code of ``comment``.

    Returns:
        ``comment`` unchanged when ``language`` is ``"en"``; otherwise the
        string form of the TextBlob translation. If TextBlob raises
        ``NotTranslated`` the untranslated text is returned as a string.
    """
    if hasattr(comment, "decode"):
        comment = comment.decode("utf-8")

    # English comments are returned as-is, without going through TextBlob.
    if language == "en":
        return comment

    text = TextBlob(comment)
    try:
        text = text.translate(from_lang=language, to="en")
    except NotTranslated:
        # Best effort: keep the untranslated text.
        pass
    return str(text)

In [None]:
# Translate each comment using that row's own detected language.
# translate_to_english needs both `comment` and `language`, so it is wrapped
# in a row-wise lambda; it cannot be handed to DataFrame.apply directly.
train_data['new_comment_text'] = train_data.apply(
    lambda row: translate_to_english(row.comment_text, row.language),
    axis=1,
)

In [None]:
# Row-wise translation: the lambda reads the fields of the row it is given.
# Passing the whole `comment_text` / `language` columns instead would turn
# `language == "en"` into a Series comparison with an ambiguous truth value.
train_data['new_comment_text'] = train_data.apply(
    lambda row: translate_to_english(row['comment_text'], row['language']),
    axis=1,
)

In [None]:
# Translate the non-English comments to English, overwriting `comment_text`
# in both frames.
# NOTE(review): `args=(<frame>['language'],)` passes the entire language column
# as the extra argument of every call, not the language of the comment being
# processed — confirm that this is what `translate` expects.
train_data['comment_text'] = train_data['comment_text'].apply(translate, args=(train_data['language'],))
test_data['comment_text'] = test_data['comment_text'].apply(translate, args=(test_data['language'],))

In [None]:
# Keep the training rows whose detected language is not English and report
# the row count before and after the filter.
print(len(train_data))
is_non_english_train = train_data['language'] != 'en'
train_data_non_english = train_data[is_non_english_train]
print(len(train_data_non_english))

In [None]:
# Keep the test rows whose detected language is not English and report the
# row count before and after the filter.
print(len(test_data))
is_non_english_test = test_data['language'] != 'en'
test_data_non_english = test_data[is_non_english_test]
print(len(test_data_non_english))

In [None]:
# Write the non-English test rows to a CSV file in the working directory.
non_english_test_csv = "test_data_non_english.csv"
test_data_non_english.to_csv(non_english_test_csv)

In [None]:
# Column totals over the non-English training rows (positional columns 2..8).
x = train_data_non_english.iloc[:, 2:9].sum()

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn >= 0.12 accepts the data vectors only as keyword arguments.
sns.barplot(x=x.index, y=x.values, alpha=0.8, ax=ax)
ax.set_title("# per class")
ax.set_ylabel('# of Occurrences', fontsize=12)
ax.set_xlabel('Type ', fontsize=12)

# Write each bar's total just above the bar.
for rect, label in zip(ax.patches, x.values):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()

In [None]:
# The vocabulary starts empty for the train set; the dictionary returned by
# that call is then handed to the test-set call.
words_dict = {}

print("Tokenizing sentences in train set...")
tokenized_sentences_train, words_dict = tokenize_sentences(list_sentences_train, words_dict)

print("Tokenizing sentences in test set...")
tokenized_sentences_test, words_dict = tokenize_sentences(list_sentences_test, words_dict)

In [None]:
print(embedding_path)
# The unknown-word placeholder takes the next free id in the vocabulary.
words_dict[UNKNOWN_WORD] = len(words_dict)

print("Loading embeddings...")
loaded_embeddings = read_embedding_list(file_path=embedding_path)
embedding_list, embedding_word_dict = loaded_embeddings
# Vector length, taken from the first loaded embedding.
first_embedding = embedding_list[0]
embedding_size = len(first_embedding)

In [None]:
print("Preparing data...")
embedding_list, embedding_word_dict = clear_embedding_list(embedding_list, embedding_word_dict, words_dict)

# Append one constant-valued vector per placeholder token: zeros for
# UNKNOWN_WORD, minus ones for END_WORD. Each token's id is the size of
# embedding_word_dict at the moment the token is added.
for placeholder, fill_value in ((UNKNOWN_WORD, 0.), (END_WORD, -1.)):
    embedding_word_dict[placeholder] = len(embedding_word_dict)
    embedding_list.append([fill_value] * embedding_size)

embedding_matrix = np.array(embedding_list)

In [None]:
# Show the dimensions of the assembled embedding matrix.
print(embedding_matrix.shape)

In [None]:
# Run configuration.
result_path = "toxic_results"
batch_size = 256
sentences_length = 500
recurrent_units = 64
dropout_rate = 0.3
dense_size = 32
fold_count = 10

# Invert the vocabulary (word -> id becomes id -> word), then pass both
# tokenized corpora through convert_tokens_to_ids with the same settings.
id_to_word = {word_id: word for word, word_id in words_dict.items()}
train_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_train, id_to_word, embedding_word_dict, sentences_length)
test_list_of_token_ids = convert_tokens_to_ids(
    tokenized_sentences_test, id_to_word, embedding_word_dict, sentences_length)

X_train = np.array(train_list_of_token_ids)
X_test = np.array(test_list_of_token_ids)

In [None]:
def get_model_func():
    """Call ``get_model`` with the notebook's current hyper-parameter globals.

    The globals are looked up at call time, so later changes to them are
    picked up by subsequent calls.
    """
    return get_model(
        embedding_matrix,
        sentences_length,
        dropout_rate,
        recurrent_units,
        dense_size,
    )

In [None]:
# Network built directly in the notebook (fold training goes through
# get_model_func instead): non-trainable embeddings initialised from
# embedding_matrix, two bidirectional GRUs with dropout in between, and
# global max + average pooling feeding six sigmoid outputs.
input_layer = Input(shape=(sentences_length,))
embedding_layer = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)(input_layer)

first_recurrent = Bidirectional(GRU(recurrent_units, return_sequences=True))(embedding_layer)
first_recurrent = Dropout(dropout_rate)(first_recurrent)
second_recurrent = Bidirectional(GRU(recurrent_units, return_sequences=True))(first_recurrent)

# No intermediate Dense(dense_size) layer is applied before the output layer.
pooled_features = concatenate([
    GlobalMaxPool1D()(second_recurrent),
    GlobalAveragePooling1D()(second_recurrent),
])
output_layer = Dense(6, activation="sigmoid")(pooled_features)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(clipvalue=1, clipnorm=1),
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model built in the notebook.
model.summary()

In [None]:
print("Starting to train models...")
# train_folds receives the model factory (get_model_func), not a built model;
# the returned `models` collection is iterated fold by fold when predicting.
# NOTE(review): presumably one model is trained per fold (fold_count folds) —
# confirm in toxic.train_utils.
models = train_folds(X_train, y_train, fold_count, batch_size, get_model_func)

In [None]:
# Directory that receives the per-fold weights, predictions and the submission.
# The name must be a string literal; the bare identifier is undefined.
result_path = "toxic_results"
# exist_ok makes the cell safe to re-run.
os.makedirs(result_path, exist_ok=True)

In [None]:
print("Predicting results...")
# The output directory must exist before np.save is called.
os.makedirs(result_path, exist_ok=True)

test_predicts_list = []
# Settings come from the notebook globals `result_path` and `batch_size`;
# no `args` namespace exists in the notebook.
for fold_id, fold_model in enumerate(models):
    # Store the fold's weights as a 1-D object array with one entry per weight
    # tensor. The tensors generally differ in shape, and recent NumPy releases
    # refuse to convert such a ragged list implicitly.
    fold_weights = fold_model.get_weights()
    weights_array = np.empty(len(fold_weights), dtype=object)
    for position, weight in enumerate(fold_weights):
        weights_array[position] = weight
    model_path = os.path.join(result_path, "model{0}_weights.npy".format(fold_id))
    np.save(model_path, weights_array)

    # Predict the test set with this fold's model and keep a copy on disk.
    test_predicts = fold_model.predict(X_test, batch_size=batch_size)
    test_predicts_list.append(test_predicts)
    test_predicts_path = os.path.join(result_path, "test_predicts{0}.npy".format(fold_id))
    np.save(test_predicts_path, test_predicts)

In [None]:
# Blend the folds with a geometric mean: multiply the per-fold predictions
# element-wise inside the loop, then take the n-th root once, after the loop.
test_predicts = np.ones(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts *= fold_predict
test_predicts **= (1. / len(test_predicts_list))
test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

# Assemble the submission frame: the id column followed by one column per class.
test_ids = test_data["id"].values
test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]

# Written under the notebook-level `result_path`; no `args` namespace exists here.
os.makedirs(result_path, exist_ok=True)
submit_path = os.path.join(result_path, "submit")
test_predicts.to_csv(submit_path, index=False)