<p style="background-color:RoyalBlue; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 100px;"><b>Importing Libraries & Data</b></p> 

In [1]:
import sys
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import gensim

from wordcloud import WordCloud, STOPWORDS
import nltk
nltk.download('stopwords')
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Flatten, Dropout, Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import confusion_matrix, accuracy_score

ModuleNotFoundError: No module named 'gensim'

## Use TPU

In [50]:
# import tensorflow as tf

# import os
# import tensorflow_datasets as tfds

### TPU initialization

In [51]:
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# tf.config.experimental_connect_to_cluster(resolver)
# # This is the TPU initialization code that has to be at the beginning.
# tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))

In [52]:
# a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
# b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# with tf.device('/TPU:0'):
#   c = tf.matmul(a, b)

# print("c device: ", c.device)


In [53]:
# strategy = tf.distribute.TPUStrategy(resolver)
# print(strategy)

In [54]:
# Load the Jigsaw toxic-comment training set and preview its first rows.
train_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
dff = pd.read_csv(train_csv_path)
dff.head()

## Getting the data together (test data)

In [55]:
# The test comments and their labels are stored in two separate files.
test_comments = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
df_test_l = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")

## Left-join the labels onto the comments by `id` so each row carries both.
df_test = test_comments.merge(df_test_l, on='id', how='left')
df_test.head()

<p style="background-color:RoyalBlue; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 100px;"><b>Quick EDA</b></p> 

In [56]:
## Histogram of the comment lengths (in characters) to check how they are distributed.
comment_lengths = dff['comment_text'].map(len)
plt.hist(comment_lengths, bins=500)
plt.show()

In [57]:
dff.shape

In [58]:
dff.isna().sum()

In [59]:
dff.describe()

In [60]:
dff.severe_toxic.value_counts()

In [61]:
# A comment counts as toxic (1) when the sum of its six label columns is positive.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_total = dff[label_columns].sum(axis=1)
dff['toxicity'] = (label_total > 0).astype(int)

# Keep only the comment text and the binary target, renaming the text column.
dff = dff[['comment_text', 'toxicity']].rename(columns={'comment_text': 'text'})
dff.sample(5)

In [62]:
dff.describe()

In [63]:
dff.head()

In [64]:
dff.toxicity.value_counts()

The data is not balanced, so the non-toxic class is undersampled below to match the number of toxic comments.

In [65]:
# Balance the classes by undersampling: keep every toxic row and draw an
# equally sized random sample of the non-toxic rows.
min_len = (dff['toxicity'] == 1).sum()
df_undersample = dff[dff['toxicity'] == 0].sample(n=min_len, random_state=201)
dff = pd.concat([df_undersample, dff[dff['toxicity'] == 1]])
# Seed the shuffle too: the sample above and the later train/validation split
# are seeded, but an unseeded shuffle would still change the row order (and
# therefore the split and the results) on every run.
dff = shuffle(dff, random_state=201)

In [66]:
# Flatten multi-line comments by swapping every newline for a single space.
dff['text'] = dff['text'].str.replace('\n', ' ', regex=False)
dff['text'][:2]

In [67]:
# Split the frame by label so each class can be inspected on its own.
toxic = dff.loc[dff['toxicity'].eq(1)]
not_toxic = dff.loc[dff['toxicity'].eq(0)]

## Word Cloud

In [68]:
# Word cloud built from all toxic comments joined into one string.
# WordCloud keeps up to 200 words unless told otherwise, so max_words=100 is
# passed explicitly to make the picture match what its title states.
wordcloud = WordCloud(width=1400, height=700, background_color='white',
                      max_words=100).generate(' '.join(toxic.text.tolist()))
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud)
plt.axis('off')
plt.title('The most 100 frequent words in the toxic comments', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

In [69]:
# Word cloud built from all non-toxic comments joined into one string.
# WordCloud keeps up to 200 words unless told otherwise, so max_words=100 is
# passed explicitly to make the picture match what its title states.
wordcloud = WordCloud(width=1400, height=700, background_color='white',
                      max_words=100).generate(' '.join(not_toxic.text.tolist()))
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud)
plt.axis('off')
plt.title('The most 100 frequent words in the normal comments', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

<p style="background-color:RoyalBlue; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 100px;"><b>Text Pre-Processing</b></p> 

In [70]:
# Separate the binary target from the remaining (text) column.
y = dff['toxicity']
x = dff.drop(columns='toxicity')

In [71]:
# Fresh copy of the features with a clean 0..n-1 index (the old index is dropped).
texts = x.reset_index(drop=True)
texts.head()

In [72]:
print(sys.getrecursionlimit())

In [73]:
# NOTE(review): raises Python's recursion limit to 6000. Nothing visible in
# this notebook shows which step needs it — confirm it is still required.
sys.setrecursionlimit(6000)

In [74]:
# Clean the text: replace non-letters with spaces, lower-case, drop English
# stop words and reduce every remaining word to its Porter stem, so different
# inflections of a word collapse onto one root form.
ps = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every single word and then scanned it
# linearly; a set gives the same membership answers with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

corpus = []

for raw_text in tqdm(texts['text']):
    words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [75]:
# Size of each word vector; the embedding matrix is built with the same width.
DIM = 100

# Split every cleaned comment on whitespace and train Word2Vec on the token lists.
X = [document.split() for document in corpus]
w2v_model = gensim.models.Word2Vec(
    sentences=X,
    vector_size=DIM,
    window=10,
    min_count=1,
)

In [76]:
len(w2v_model.wv.key_to_index.keys()) 

In [77]:
w2v_model.wv.most_similar('toxic')

In [78]:
# Fit a Keras Tokenizer on the tokenised corpus so it builds its word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X) 

In [79]:
# Replace every token with its integer id from the tokenizer's word index.
# Note: X is rebound here from token lists to id lists.
X = tokenizer.texts_to_sequences(X)
# X[:3]

In [80]:
# Bring every sequence to exactly 20 ids, padding on the left where needed.
sequence_length = 20
X = pad_sequences(X, maxlen=sequence_length, padding='pre')
X[:3]

In [81]:
# Word -> id mapping learned by the tokenizer; one extra slot is added to the
# size so that index 0 (not present in word_index) also gets a row.
vocab = tokenizer.word_index
vocab_size = len(vocab) + 1

In [82]:
def get_weights_matrix(model) :
    """Build the initial embedding matrix from a trained Word2Vec model.

    Row ``i`` receives the Word2Vec vector of the word whose tokenizer id is
    ``i``. Rows that are never assigned (index 0, and any word the model has
    no vector for) stay all-zero.

    Reads the module-level ``vocab`` (word -> id), ``vocab_size`` and ``DIM``.

    Args:
        model: trained gensim Word2Vec model; its ``wv`` attribute is queried.

    Returns:
        numpy array of shape ``(vocab_size, DIM)``.
    """
    weights_matrix = np.zeros((vocab_size, DIM))

    for word, i in vocab.items():
        # A tokenizer word can be missing from Word2Vec (e.g. if min_count is
        # raised above 1); leave its row at zero instead of raising KeyError.
        if word in model.wv.key_to_index:
            weights_matrix[i] = model.wv[word]

    return weights_matrix


embedding_vectors = get_weights_matrix(w2v_model)

<p style="background-color:RoyalBlue; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 100px;"><b>Modeling & Training</b></p> 

In [83]:
# # with strategy.scope():
# from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout

# model = Sequential()

# model.add(Embedding(vocab_size, output_dim = DIM, weights = [embedding_vectors], input_length = 20)) 
# model.add(Dropout(0.2))

# model.add(Bidirectional(LSTM(64, return_sequences = True)))
# model.add(GlobalMaxPool1D()) 
# # model.add(LSTM(64))
# model.add(Dropout(0.2))

# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))

# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))

# model.add(Dense(1, activation = 'linear'))

In [84]:
## 2 LSTM
# Import from tensorflow.keras (not the standalone keras package) so these
# layers come from the same library as Sequential / Embedding / Dense.
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout

model = Sequential()

# Embedding initialised with the Word2Vec vectors; inputs are 20 token ids long.
model.add(Embedding(vocab_size, output_dim = DIM, weights = [embedding_vectors], input_length = 20))
model.add(Dropout(0.2))

# Return only the last LSTM output. With return_sequences=True the Dense stack
# below is applied to every time step, so the model would emit 20 scores per
# comment instead of the single score the labels (and the submission) need.
model.add(LSTM(64))
# model.add(Bidirectional(LSTM(64, return_sequences = True)))
# model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

# Single sigmoid unit: one score per comment.
model.add(Dense(1, activation = 'sigmoid'))



In [None]:
## Model Architecture  https://www.kaggle.com/kaushikholla/using-rnn-and-lstm-with-gpu-0-707
# Two stacked bidirectional LSTMs on top of the Word2Vec-initialised embedding.
# input_length is taken from the padded training matrix X: it was hard-coded
# to 1000 while X is padded to 20, so the declared input shape did not match
# the data the model is trained on.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, output_dim = DIM, weights = [embedding_vectors], input_length = X.shape[1]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [90]:
#   with strategy.scope():
# Compile with MSE loss and track accuracy (metrics passed as a list, the
# documented form).
model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['accuracy'])
# model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = 'accuracy')
model.summary()

# img of model
# Use the public tf.keras.utils.plot_model: the private keras.utils.vis_utils
# module path is not available in newer Keras releases, and this keeps the
# notebook on tensorflow.keras throughout.
tf.keras.utils.plot_model(model, to_file='./model.png', show_shapes=False, show_layer_names=True)

In [91]:
# Hold out 20% of the samples for validation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# from keras.callbacks import ModelCheckpoint
# checkpoint = ModelCheckpoint('./', monitor='loss', 
#                              verbose=1, save_best_only=True, 
#                              mode='min')

def schedule(epoch):
    """Return the learning rate to use for the given epoch.

    The rate drops by a factor of ten at epochs 50, 300, 600 and 900:
    0.001 below epoch 50, down to 1e-7 from epoch 900 onwards.
    """
    # (exclusive upper epoch bound, learning rate) in ascending order.
    steps = (
        (50, .001),
        (300, .0001),
        (600, .00001),
        (900, .000001),
    )
    for upper_bound, rate in steps:
        if epoch < upper_bound:
            return rate
    return .0000001
# Instantiate the LearningRateScheduler callback with the schedule function.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule)

# Stop after 3 epochs without improvement in validation accuracy and restore
# the best weights. The metric is registered as 'accuracy', so Keras logs the
# validation value as 'val_accuracy' (the key the history plots read); with
# the old name 'val_acc' the monitored value is never found and early
# stopping never triggers.
early = EarlyStopping(patience=3,
                   monitor='val_accuracy',
                   restore_best_weights=True,
                   mode='max',
                   verbose=1)

callback_list = [lr_scheduler, early]

# train the model
# with strategy.scope():
hist = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 20,
                 callbacks = callback_list, batch_size = 32, shuffle=True)

In [87]:
# loss: 0.0050 - accuracy: 0.9970 - val_loss: 0.1164 - val_accuracy: 0.8764
plt.style.use('fivethirtyeight')

# One train-vs-validation figure per tracked quantity: accuracy first, then loss.
history_keys = (
    ('accuracy', 'val_accuracy'),
    ('loss', 'val_loss'),
)
for train_key, val_key in history_keys:
    plt.plot(hist.history[train_key])
    plt.plot(hist.history[val_key])
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc = 'best')
    plt.show()

### Alternative training run for comparison

In [None]:
# hist = model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 50,
#                  callbacks=es, batch_size = 128, shuffle=True)
# plt.style.use('fivethirtyeight')

# # visualize the models accuracy
# plt.plot(hist.history['accuracy'])
# plt.plot(hist.history['val_accuracy'])
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'val'], loc = 'upper left')
# plt.show()  

<p style="background-color:RoyalBlue; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 10px 100px;"><b>Submission</b></p> 

In [None]:
# Load the comments that have to be scored for the submission.
submission_csv_path = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
sub = pd.read_csv(submission_csv_path)

In [None]:
# Give the submission comments the same cleaning as the training corpus
# (letters only, lower-case, stop words removed, Porter-stemmed). The
# tokenizer was fitted on cleaned stems, so raw text would miss most of its
# vocabulary and unknown words would be silently dropped.
submission_stopwords = set(stopwords.words('english'))
submission_tokens = [
    [ps.stem(word)
     for word in re.sub('[^a-zA-Z]', ' ', str(text)).lower().split()
     if word not in submission_stopwords]
    for text in sub.text
]
new_text = tokenizer.texts_to_sequences(submission_tokens)
# Pad to the same length as the training matrix X (it was 1000 here while the
# model is trained on sequences of length 20).
new_text = pad_sequences(new_text, padding = 'pre', maxlen = X.shape[1])

In [None]:
# Scale the model's predictions by 1000 and store them as the score column.
predictions = model.predict(new_text)
sub['score'] = predictions * 1000
sub.head()

In [None]:
# Write only the id and score columns, without the index, as the submission file.
submission = sub[['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)