# Embeddings + LSTM

In [38]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Sequential
from keras.callbacks import EarlyStopping

In [2]:
# Label columns: each one is a separate binary target for the classifier.
tags = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the labelled training set and the unlabelled test set.
data = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

# X holds the raw comment text, Y the six label columns listed in `tags`.
X = data['comment_text']
Y = data.loc[:, tags]

# Preview the first ten training rows.
data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [34]:
# Tokenise the comments and pad them to a fixed length.
# NOTE: these two values must stay in sync with the Embedding layer of the
# model (its input_dim and input_length use the same numbers).
MAX_WORDS = 20000  # vocabulary cap passed to the tokenizer (most frequent words)
MAX_LEN = 100      # every comment is padded/truncated to this many tokens

# A missing comment is read by pandas as NaN (a float), which the tokenizer
# cannot process as text. Replace such entries with an empty string, which
# tokenises to an empty sequence and is padded to all zeros.
train_texts = X.fillna('').tolist()
test_texts = test.comment_text.fillna('').tolist()

# The vocabulary is learned from the training comments only; the test
# comments are encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_texts)

train_tokenized = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LEN)
test_tokenized = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_LEN)

In [35]:
# Sanity check: report the dimensions of the padded train and test matrices.
print('Train shape: {}'.format(train_tokenized.shape))
print('Test shape: {}'.format(test_tokenized.shape))

Train shape: (159571, 100)
Test shape: (153164, 100)


In [36]:
# Bidirectional LSTM classifier over learned word embeddings.
model = Sequential()

# 20000-word vocabulary mapped to 100-dimensional vectors, for sequences of 100 tokens.
model.add(Embedding(input_dim=20000, output_dim=100, input_length=100))

# return_sequences=True keeps the output of every timestep so it can be pooled.
model.add(Bidirectional(LSTM(units=100, return_sequences=True)))

# Collapse the time axis by taking the maximum of each feature over all timesteps.
model.add(GlobalMaxPooling1D())

model.add(Dense(units=100, activation="relu"))
model.add(Dropout(rate=0.1))

# Six independent sigmoid outputs, one per label column.
model.add(Dense(units=6, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 100)          2000000   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 100, 200)          160800    
_________________________________________________________________
global_max_pooling1d_17 (Glo (None, 200)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_19 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 6)                 606       
Total params: 2,181,506
Trainable params: 2,181,506
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Train on the full training set for two passes; no validation data is held out.
model.fit(x=train_tokenized, y=Y, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f553b03a630>

In [41]:
# Predict one value per label for every test comment.
predicted = model.predict(test_tokenized, verbose=1)

# Arrange the predictions as a table: one row per test id, one column per label.
submission = pd.DataFrame(predicted, index=test["id"], columns=tags)

# Preview the first five rows.
submission.head()



Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.998396,0.38986,0.967518,0.07044257,0.926057,0.3367495
0000247867823ef7,6.9e-05,5.579686e-09,1.4e-05,8.107273e-08,5e-06,2.105901e-07
00013b17ad220c46,0.004311,6.00425e-06,0.000933,5.286001e-05,0.000648,5.334762e-05
00017563c3f7919a,0.000293,5.630267e-08,6.2e-05,9.748851e-07,3.1e-05,1.481763e-06
00017695ad8997eb,0.000238,6.07194e-08,5e-05,8.871141e-07,2.4e-05,1.439776e-06


In [42]:
# Save the predictions; the frame's index (the test ids) is written as the first CSV column.
submission_path = '../Submissions/embeddings-lstm.csv'
submission.to_csv(submission_path)

# Result: 0.9752