# Нека отново разделим чистите коментари на две половини и натренираме две LSTM мрежи – по една за всяка половина, като токсичните коментари участват и в двете

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.models import Sequential
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Training and test comment tables.
data = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

# Label columns that the models predict for every comment.
tags = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Raw comment text and the matching label columns.
X = data['comment_text']
Y = data[tags]

data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [5]:
# A comment counts as toxic when at least one of its label columns is set.
has_any_tag = data[tags].sum(axis=1) > 0
all_toxic = data[has_any_tag]
all_clear = data[~has_any_tag]

# Split the clean comments into two consecutive halves by position.
# Slicing with iloc also works when the number of clean rows is odd, where
# np.split(..., 2) would raise because the split is not an equal division.
half = len(all_clear) // 2
first_clear = all_clear.iloc[:half]
second_clear = all_clear.iloc[half:]

# Every toxic comment is added to both halves.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
first = pd.concat([first_clear, all_toxic])
second = pd.concat([second_clear, all_toxic])

In [6]:
# Fit a single tokenizer (num_words=20000) on all training comments so that
# both halves and the test set share the same word indices.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X.tolist())

# Convert each set of comments to index sequences fixed at length 100.
first_tokenized, second_tokenized, test_tokenized = (
    pad_sequences(tokenizer.texts_to_sequences(frame.comment_text), maxlen=100)
    for frame in (first, second, test)
)

In [8]:
def train_lstm(modelArr, x = X, y = Y, epochs = 5):
    """Build, compile and fit a Keras ``Sequential`` model.

    Args:
        modelArr: List of Keras layers, passed as-is to ``Sequential``.
        x: Training inputs handed to ``model.fit``. The default is the
            notebook-level ``X``, which holds raw comment text, so the calls
            in this notebook pass tokenized, padded sequences explicitly.
        y: Training targets handed to ``model.fit``; defaults to the
            notebook-level ``Y``.
        epochs: Number of epochs passed to ``model.fit``.

    Returns:
        The fitted model.
    """
    model = Sequential(modelArr)

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # summary() prints the layer table itself and returns None, so wrapping it
    # in print() would emit a stray "None" line after the table.
    model.summary()

    model.fit(x, y, epochs=epochs)

    return model

In [13]:
def submission(model, csv_name, test_comments = test_tokenized):
    """Predict on the test comments and save the predictions as a CSV file.

    Args:
        model: Object whose ``predict`` method is called on ``test_comments``.
        csv_name: File name of the CSV, written under ``../Submissions/``.
        test_comments: Inputs for ``model.predict``; defaults to the
            notebook-level ``test_tokenized``.

    Returns:
        DataFrame of predictions with one column per entry of ``tags``,
        indexed by the ``id`` column of ``test``.
    """
    scores = model.predict(test_comments, verbose=1)

    # Named `frame` rather than `submission` to avoid shadowing this function.
    frame = pd.DataFrame(data=scores, columns=tags, index=test["id"])
    frame.to_csv('../Submissions/' + csv_name)

    return frame

# Първата половина:

In [11]:
# Train on the first half: embedding -> bidirectional LSTM -> max pooling ->
# dense head with six sigmoid outputs (one per entry of `tags`).
first_model = train_lstm(
    modelArr=[
        Embedding(20000, 100, input_length=100),
        Bidirectional(LSTM(100, return_sequences=True)),
        GlobalMaxPooling1D(),
        Dense(100, activation="relu"),
        Dropout(0.1),
        Dense(6, activation="sigmoid"),
    ],
    x=first_tokenized,
    y=first[tags],
    epochs=3,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          2000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 200)          160800    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 606       
Total params: 2,181,506
Trainable params: 2,181,506
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Predict on the test set with the first model and save the result as a CSV.
first_submission = submission(model=first_model, csv_name='embeddings-lstm-split1.csv')
first_submission.head(10)



Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.995561,0.3448385,0.922943,0.2026089,0.877785,0.485172
0000247867823ef7,0.000103,6.20919e-08,1.8e-05,8.820862e-07,4e-06,1e-06
00013b17ad220c46,0.002547,3.944434e-06,0.000441,3.888881e-05,0.000125,3.8e-05
00017563c3f7919a,0.000518,2.636454e-07,5.8e-05,4.513998e-06,2.2e-05,6e-06
00017695ad8997eb,0.000421,2.200488e-07,5.1e-05,3.50713e-06,1.7e-05,5e-06
0001ea8717f6de06,0.006192,2.795319e-06,0.000425,6.406605e-05,0.000361,8.4e-05
00024115d4cbde0f,0.001276,8.776398e-07,0.000129,1.236272e-05,7e-05,2.6e-05
000247e83dcc1211,0.760456,0.003237332,0.088422,0.01606537,0.166727,0.013986
00025358d4737918,0.180474,8.999523e-06,0.004399,0.0001873898,0.009451,0.000542
00026d1092fe71cc,0.001012,3.990881e-07,8.7e-05,8.740042e-06,4.6e-05,1.4e-05


## Резултат: 0.9728

# Сега и втората част:

In [16]:
# Train on the second half: embedding -> bidirectional LSTM -> max pooling ->
# dense head with six sigmoid outputs (one per entry of `tags`).
second_model = train_lstm(
    modelArr=[
        Embedding(20000, 100, input_length=100),
        Bidirectional(LSTM(100, return_sequences=True)),
        GlobalMaxPooling1D(),
        Dense(100, activation="relu"),
        Dropout(0.1),
        Dense(6, activation="sigmoid"),
    ],
    x=second_tokenized,
    y=second[tags],
    epochs=3,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          2000000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 200)          160800    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 606       
Total params: 2,181,506
Trainable params: 2,181,506
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Predict on the test set with the second model and save the result as a CSV.
second_submission = submission(model=second_model, csv_name='embeddings-lstm-split2.csv')
second_submission.head(10)



Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.995701,0.3486625,0.942447,0.09373182,0.899342,0.3955037
0000247867823ef7,0.000134,9.104841e-09,3.7e-05,1.41537e-07,7e-06,7.095074e-07
00013b17ad220c46,0.002522,2.973896e-06,0.000883,2.567595e-05,0.000257,4.776106e-05
00017563c3f7919a,0.000599,1.238338e-07,0.000211,1.576055e-06,5.4e-05,5.956906e-06
00017695ad8997eb,0.012473,6.523748e-06,0.001571,0.0001006798,0.000979,0.000215128
0001ea8717f6de06,0.004204,1.258053e-06,0.000586,2.186434e-05,0.000395,6.246925e-05
00024115d4cbde0f,0.001846,1.379919e-07,0.000202,2.449993e-06,0.000136,1.567924e-05
000247e83dcc1211,0.707416,0.0003663225,0.037864,0.00141343,0.079806,0.002258993
00025358d4737918,0.049086,1.402758e-06,0.001779,1.987362e-05,0.003844,0.0001572483
00026d1092fe71cc,0.000213,9.521102e-09,5.1e-05,1.477032e-07,1.5e-05,9.507321e-07


## Резултат: 0.9740

# Да комбинираме двата модела

In [24]:
# Reload the saved predictions of the first model from disk.
first_half = pd.read_csv('../Submissions/embeddings-lstm-split1.csv')
first_half.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.995561,0.3448385,0.922943,0.2026089,0.877785,0.485172
1,0000247867823ef7,0.000103,6.20919e-08,1.8e-05,8.820862e-07,4e-06,1e-06
2,00013b17ad220c46,0.002547,3.944434e-06,0.000441,3.888881e-05,0.000125,3.8e-05
3,00017563c3f7919a,0.000518,2.636454e-07,5.8e-05,4.513998e-06,2.2e-05,6e-06
4,00017695ad8997eb,0.000421,2.200488e-07,5.1e-05,3.50713e-06,1.7e-05,5e-06
5,0001ea8717f6de06,0.006192,2.795319e-06,0.000425,6.406605e-05,0.000361,8.4e-05
6,00024115d4cbde0f,0.001276,8.776398e-07,0.000129,1.236272e-05,7e-05,2.6e-05
7,000247e83dcc1211,0.760456,0.003237333,0.088422,0.01606537,0.166727,0.013986
8,00025358d4737918,0.180474,8.999523e-06,0.004399,0.0001873898,0.009451,0.000542
9,00026d1092fe71cc,0.001012,3.990881e-07,8.7e-05,8.740042e-06,4.6e-05,1.4e-05


In [25]:
# Reload the saved predictions of the second model from disk.
second_half = pd.read_csv('../Submissions/embeddings-lstm-split2.csv')
second_half.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.995701,0.3486625,0.942447,0.09373182,0.899342,0.3955037
1,0000247867823ef7,0.000134,9.104841e-09,3.7e-05,1.41537e-07,7e-06,7.095074e-07
2,00013b17ad220c46,0.002522,2.973897e-06,0.000883,2.567595e-05,0.000257,4.776106e-05
3,00017563c3f7919a,0.000599,1.238338e-07,0.000211,1.576055e-06,5.4e-05,5.956906e-06
4,00017695ad8997eb,0.012473,6.523748e-06,0.001571,0.0001006798,0.000979,0.000215128
5,0001ea8717f6de06,0.004204,1.258053e-06,0.000586,2.186434e-05,0.000395,6.246925e-05
6,00024115d4cbde0f,0.001846,1.379919e-07,0.000202,2.449992e-06,0.000136,1.567924e-05
7,000247e83dcc1211,0.707416,0.0003663225,0.037864,0.00141343,0.079806,0.002258993
8,00025358d4737918,0.049086,1.402758e-06,0.001779,1.987362e-05,0.003844,0.0001572483
9,00026d1092fe71cc,0.000213,9.521102e-09,5.1e-05,1.477032e-07,1.5e-05,9.507321e-07


In [26]:
# Index both prediction tables by comment id so the average is taken per id
# instead of relying on the two files having the same row order.
first_scores = first_half.set_index('id')
second_scores = second_half.set_index('id')

# Fail loudly instead of silently averaging predictions of different comments.
assert first_scores.index.equals(second_scores.index), \
    "both submissions must list the same ids in the same order"

# Element-wise mean of the two models' predictions; the result stays indexed by id.
combined = (first_scores + second_scores) / 2
combined[:10]

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.995631,0.3467505,0.932695,0.1481704,0.888563,0.440338
0000247867823ef7,0.000118,3.559837e-08,2.7e-05,5.118116e-07,5e-06,1e-06
00013b17ad220c46,0.002535,3.459165e-06,0.000662,3.228238e-05,0.000191,4.3e-05
00017563c3f7919a,0.000559,1.937396e-07,0.000135,3.045026e-06,3.8e-05,6e-06
00017695ad8997eb,0.006447,3.371898e-06,0.000811,5.209347e-05,0.000498,0.00011
0001ea8717f6de06,0.005198,2.026686e-06,0.000505,4.296519e-05,0.000378,7.3e-05
00024115d4cbde0f,0.001561,5.078158e-07,0.000165,7.406356e-06,0.000103,2.1e-05
000247e83dcc1211,0.733936,0.001801827,0.063143,0.0087394,0.123267,0.008122
00025358d4737918,0.11478,5.20114e-06,0.003089,0.0001036317,0.006648,0.000349
00026d1092fe71cc,0.000613,2.043046e-07,6.9e-05,4.443873e-06,3.1e-05,7e-06


In [27]:
# Save the averaged predictions as a CSV file.
combined.to_csv(path_or_buf='../Submissions/embeddings-lstm-halves-combined.csv')

# Резултат: 0.9757 (подобрение с 0.0005 спрямо LSTM, обучен върху всички данни)