<a href="https://colab.research.google.com/github/VGODIE/ML_kaggle_competitions/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [0]:
print(tf.__version__)

1.15.0


In [0]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"vgodie","key":"914a11543650585c542451372ade778c"}'}

In [0]:
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

In [0]:
! kaggle competitions download -c iad-deep-learning-sentiment

Downloading x_train.txt.zip to /content
 98% 581M/595M [00:04<00:00, 108MB/s]
100% 595M/595M [00:04<00:00, 133MB/s]
Downloading x_test.txt.zip to /content
 72% 49.0M/68.0M [00:00<00:00, 107MB/s]
100% 68.0M/68.0M [00:00<00:00, 139MB/s]
Downloading random_prediction.csv.zip to /content
  0% 0.00/2.08M [00:00<?, ?B/s]
100% 2.08M/2.08M [00:00<00:00, 141MB/s]
Downloading y_train.csv.zip to /content
  0% 0.00/8.37M [00:00<?, ?B/s]
100% 8.37M/8.37M [00:00<00:00, 77.0MB/s]


In [0]:
! unzip x_train.txt.zip
! unzip x_test.txt.zip
! unzip y_train.csv.zip

Archive:  x_train.txt.zip
  inflating: x_train.txt             
Archive:  x_test.txt.zip
  inflating: x_test.txt              
Archive:  y_train.csv.zip
  inflating: y_train.csv             


# Download data and preprocess

In [0]:
f = open("x_train.txt", "r")
x_train = f.readlines()
f.close()


In [0]:
y_train = pd.read_csv("y_train.csv")

In [0]:
def delete_punctuation(x):
    punctuation = list(string.punctuation)
    return ''.join([a if a not in punctuation + ['\n'] else ' ' for a in x])

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train.Probability.values)

In [0]:
max_features = 100000
max_len = 100
embedding_size = 300

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)
list_tokenized_train = tokenizer.texts_to_sequences(x_train)
list_tokenized_test = tokenizer.texts_to_sequences(x_test)
x_train_pad = pad_sequences(list_tokenized_train, maxlen=max_len)
x_test_pad = pad_sequences(list_tokenized_test, maxlen=max_len)

In [0]:
!wget http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip

--2019-12-21 22:13:51--  http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip [following]
--2019-12-21 22:13:51--  https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.42B.300d.zip [following]
--2019-12-21 22:13:51--  http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Le

In [0]:
! unzip glove.42B.300d.zip

Archive:  glove.42B.300d.zip
  inflating: glove.42B.300d.txt      


In [0]:
from tqdm import tqdm_notebook

In [0]:
f = open('./glove.42B.300d.txt')

embeddings_index = dict() 
for line in tqdm_notebook(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [0]:
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

  if self.run_code(code, result):


(0.005720101, 0.2951066)

In [0]:
word_index = tokenizer.word_index
unknown_words = set()
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    else: unknown_words.add(word)

# Definind and training model



In [0]:
from tensorflow.keras.layers import Input, Bidirectional, Dropout, Dense, GRU, Embedding, LayerNormalization, LeakyReLU
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler

In [0]:
input_layer = Input((max_len,), name = 'comment_text')
embedding_layer = Embedding(max_features, embedding_size, input_length=max_len, 
                            weights=[embedding_matrix], 
                            trainable = False)(input_layer)
x = Bidirectional(GRU(128, return_sequences=True))(embedding_layer)
x = Dropout(0.3)(x)
x = Bidirectional(GRU(128, return_sequences=False))(x)
x = Dense(64, activation=LeakyReLU(0.3))(x)
output_layer = Dense(1, activation="sigmoid")(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='binary_crossentropy',
                  optimizer=Adam(clipvalue=1, clipnorm=1, amsgrad=True),
                  metrics=['accuracy'])
print(model.summary())

def schedule(ind):
    a = [0.001, 0.001, 0.0001, 0.0001, 0.00001, 0.00001, 0.000001]
    return a[ind]

lr = LearningRateScheduler(schedule)
    
early_stop = EarlyStopping(monitor='val_loss',
                           patience=4,
                           verbose=1,
                           min_delta=1e-4)


history = model.fit(x_train_pad, y_train, batch_size=1500, epochs = 3, 
                    validation_split = 0.1, callbacks = [lr])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
comment_text (InputLayer)    [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 300)          30000000  
________

KeyboardInterrupt: ignored

In [0]:
y_train

array([1., 1., 0., ..., 1., 1., 0.])

In [0]:
np.sum(y_train)

1349818.0

In [0]:
len(y_train) - np.sum(y_train)

1350182.0

In [0]:
f = open("x_test.txt", "r")
x_test = f.readlines()
f.close()

In [0]:
list_tokenized_test = tokenizer.texts_to_sequences(x_test)
x_test_pad = pad_sequences(list_tokenized_test, maxlen=max_len)

In [0]:
preds = model.predict(x_test_pad, batch_size=2500)

In [0]:
preds.shape

(400000, 1)

In [0]:
sub = pd.DataFrame()
sub["Probability"] = preds.reshape(-1,)
sub["Id"] = sub.index + 1

In [0]:
sub.to_csv("submission.csv", index=None)

In [0]:
sub

Unnamed: 0,Probability,Id
0,0.987192,1
1,0.998586,2
2,0.001727,3
3,0.558569,4
4,0.995809,5
...,...,...
399995,0.027100,399996
399996,0.001290,399997
399997,0.129923,399998
399998,0.996316,399999
