In [None]:
#####
# Description: Main training script for the bidirectional LSTM toxicity classifier
# Author: Tirupal Rao Ravilla
# Date: 04/2019
#####
import numpy as np
import pandas as pd 
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, CuDNNLSTM, Bidirectional
from keras.layers.embeddings import Embedding
# from tensorflow.keras.backend import set_session
import keras.backend as K
import nltk
import string
import re
from tqdm import tqdm

import os
# Show what is available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training set into a DataFrame.
train_csv_path = '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
def binary(y):
  """Threshold a score: return 1 when ``y`` is at least 0.5, otherwise 0."""
  return 1 if y >= 0.5 else 0
def preprocess(text):
  """Lower-case ``text`` and collapse each run of non-word characters to one space.

  Missing values (``None`` or the float NaN that pandas produces for empty CSV
  fields) are returned as an empty string instead of raising ``AttributeError``
  on ``.lower()``. Any other non-string value is converted with ``str`` first.
  """
  # NaN is the only float that is not equal to itself.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)
  return re.sub(r'\W+', ' ', text.lower())

In [None]:
# Dimensions (rows, columns) of the loaded training frame.
df.shape

In [None]:
# Normalise the comment text with the `preprocess` helper.
x = df['comment_text'].apply(preprocess)
# Binarise the target score at 0.5. The vectorised comparison yields the same
# 0/1 integer labels as calling `binary` on every row (NaN compares False and
# therefore maps to 0) without a Python-level call per row.
y = (df['target'] >= 0.5).astype('int64')

In [None]:
# Peek at the cleaned comments and the binary labels, then show both lengths.
for series in (x, y):
  print(series.head())
print(len(x), len(y))

In [None]:
# Fit the tokenizer on the cleaned training comments; its word_index is used
# below for the vocabulary size and the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

In [None]:
# One more than the number of distinct words seen by the tokenizer.
# NOTE(review): the +1 presumably accounts for Keras reserving index 0 (word
# indices starting at 1) — confirm against the Tokenizer documentation.
vocabulary_size = len(tokenizer.word_index)+1
print(vocabulary_size)

In [None]:
# Encode every training comment as a list of word indices, then pad/truncate
# each one to a fixed length of 220.
sequences = tokenizer.texts_to_sequences(x)
tr_data = pad_sequences(sequences, maxlen=220)

# Number of encoded training rows.
print(len(tr_data))

In [None]:
# Parse the GloVe text file into {word: float32 vector}. Each line holds the
# token followed by its space-separated coefficients.
embedding_values = {}
# The file is expected to be UTF-8 text, so pass the encoding explicitly rather
# than relying on the platform default codec, and let the context manager close
# the handle (the bare open() left it open for the life of the kernel).
with open('../input/glove6b/glove.6B.300d.txt', encoding='utf-8') as f:
  for line in tqdm(f):
    # Drop only the line terminator so the last coefficient parses cleanly.
    value = line.rstrip('\n').split(' ')
    word = value[0]
    coef = np.asarray(value[1:], dtype='float32')
    embedding_values[word] = coef

In [None]:
# Row r of the matrix holds the pretrained vector of the word whose tokenizer
# index is r; words with no pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocabulary_size, 300))
for token, row in tqdm(tokenizer.word_index.items()):
  vector = embedding_values.get(token)
  if vector is None:
    continue
  embedding_matrix[row] = vector

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the padded training data, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    tr_data,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Reset Keras' global state before building the model.
K.clear_session()

# Frozen pretrained embeddings -> two stacked bidirectional LSTMs -> dense head
# ending in a single sigmoid unit.
model = Sequential([
    Embedding(vocabulary_size, 300, input_length=220,
              weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(145, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(145, dropout=0.1, recurrent_dropout=0.1)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()
# Train for 4 epochs, evaluating on the held-out split after each one.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=4, batch_size=2000)


In [None]:
# Load the test set and run it through the same cleaning and tokenisation as
# the training comments.
test= pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
xt = test['comment_text'].apply(preprocess)
sequences = tokenizer.texts_to_sequences(xt)
# Pad to 220 tokens: the model's embedding layer is built with input_length=220
# and was trained on 220-long sequences, so the previous maxlen=145 produced
# inputs of the wrong shape for model.predict.
ts_data = pad_sequences(sequences, maxlen=220)


In [None]:
# Model output for each padded test comment (the final layer is one sigmoid unit).
yts= model.predict(ts_data)

In [None]:
# Spot-check the first ten predictions.
print(yts[:10])

In [None]:
# Two-column submission frame: the test ids plus the model's score for each row.
# Select the id column directly instead of building a one-row frame and
# transposing it, and flatten the (n, 1) prediction array (the model ends in a
# single-unit layer) so the column is assigned from a 1-D vector.
submission = test[['id']].copy()
submission['prediction'] = np.ravel(yts)
print(submission.head())

In [None]:
# Write the submission to the working directory, omitting the DataFrame index.
submission.to_csv('submission.csv', index=False)