In [None]:
# Link to competition: https://www.kaggle.com/c/nnfl-nlp-lab-2
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np 
import pandas as pd 
import os
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re


from numpy.random import seed
seed(1)
tf.random.set_seed(42)

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 100)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training CSV and preview its first rows.
train_csv_path = '../input/nnfl-nlp-lab-2/lab2_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Number of rows in the training frame.
df.shape[0]

In [None]:
# Character length of every raw training text, longest first.
mm = sorted(map(len, df.text), reverse=True)
mm

In [None]:
from nltk.corpus import stopwords

# English stop words from NLTK. They are held in a set because the filtering
# cells test membership once per word of every document; a set answers
# `word not in stop` in constant time, whereas the original list was scanned
# linearly on each test. Membership results are unchanged.
stop = set(stopwords.words('english'))

In [None]:
def _drop_stopwords(sentence):
    """Return `sentence` with every whitespace-separated word found in `stop` removed.

    Matching is exact (case-sensitive); the surviving words are re-joined
    with single spaces.
    """
    kept_words = [token for token in sentence.split() if token not in stop]
    return ' '.join(kept_words)


# Add a stop-word-free copy of each training text as a new column.
df['text_without_stopwords'] = df['text'].apply(_drop_stopwords)

In [None]:
# Tokenizer / embedding dimensions.
vocab_size, embedding_dim, max_length = 20000, 50, 250

# Both padding and truncation are meant to act on the end of a sequence.
trunc_type = padding_type = 'post'

# Placeholder token used for words outside the tokenizer's vocabulary.
oov_tok = "<OOV>"

# Mini-batch size used for training and evaluation.
batch_size = 256

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df.text.values)
word_index = tokenizer.word_index

X = tokenizer.texts_to_sequences(df.text.values)
X_pad = pad_sequences(X, padding=padding_type, maxlen=max_length)

In [None]:
df_sub = pd.read_csv('../input/nnfl-nlp-lab-2/lab2_test.csv')
df_sub['text_without_stopwords'] = df_sub['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
X_pred = tokenizer.texts_to_sequences(df_sub.text.values)
X_pred_pad = pad_sequences(X_pred, padding=padding_type, maxlen=max_length)

In [None]:
# Character length of every stop-word-free training text, longest first.
mm = sorted((len(cleaned) for cleaned in df.text_without_stopwords), reverse=True)
mm

In [None]:
# Display the `word_counts` attribute of the tokenizer fitted on the training texts.
# NOTE(review): this shows the whole mapping; consider displaying only the
# most frequent entries to keep the cell output short.
tokenizer.word_counts

In [None]:
# print(X[0])
# print(df.text.values[0])

In [None]:
# Hold out 20% of the padded training data for validation, keeping the
# class balance of `target` in both parts.
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shapes of the training and held-out parts.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

In [None]:
# Binary text classifier: learned word embeddings -> 1-D convolution and
# max-pooling -> LSTM returning the full sequence -> average over time ->
# small dense head with dropout -> single sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(512, 3, activation = 'relu'),
    tf.keras.layers.MaxPooling1D(3),
    tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(4, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss = 'binary_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() used to append a stray "None" line to the output.
model.summary()

In [None]:
# Train for 4 epochs, evaluating on the held-out split after each one.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=4,
    batch_size=batch_size,
)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()

plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
# Score the trained model on the held-out split and report both metrics.
loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
for template, value in (("Validation Accuracy: %.4f", acc), ("Loss: %.4f", loss)):
    print(template % value)

In [None]:
# Predict on the padded test sequences, threshold the sigmoid outputs at 0.5
# to get 0/1 labels, and drop the size-1 axes.
probabilities = model.predict(X_pred_pad, verbose=1)
y_pred = np.where(probabilities > 0.5, 1, 0).squeeze()
y_pred

In [None]:
# Fill the sample submission's `target` column with the predicted labels and
# write it out as the submission file.
sample = pd.read_csv('../input/nnfl-nlp-lab-2/lab2_sample.csv')
sample['target'] = y_pred
sample.to_csv('submission.csv', index=False)

# Persist the trained weights next to the submission.
# NOTE(review): newer Keras releases appear to require weight files to end in
# `.weights.h5` — confirm against the installed version.
model.save_weights('model.h5')

# Keep the preview as the last expression: a notebook cell only displays its
# final expression, so the preview was never shown while it sat above
# save_weights().
sample.head()

In [None]:
# !wget https://github.com/kmr0877/IMDB-Sentiment-Classification-CBOW-Model/raw/master/glove.6B.50d.txt.gz
# ! gunzip glove.6B.50d.txt.gz

In [None]:
# embeddings_index = {}
# f = open('glove.6B.50d.txt')
# for line in f:
#     values = line.split()
#     word = values[0]
#     coefs = np.asarray(values[1:], dtype='float32')
#     embeddings_index[word] = coefs
# f.close()

# print('Found %s word vectors in pretrained word vector model.' % len(embeddings_index))
# print('Dimensions of the vector space : ', len(embeddings_index['the']))

In [None]:
# embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

In [None]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
#     tf.keras.layers.Conv1D(512, 3, activation = 'relu'),
#     tf.keras.layers.MaxPooling1D(3),
#     tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Dense(4, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])
# print(model.summary())

# model.compile(loss = 'binary_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
# history = model.fit(X_train, y_train, batch_size = batch_size, epochs = 10, validation_data=(X_test, y_test))

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# acc = history.history['accuracy']
# val_acc = history.history['val_accuracy']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(len(acc))

# plt.plot(epochs, acc, 'r', label='Training accuracy')
# plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
# plt.title('Training and validation accuracy')
# plt.legend()
# plt.figure()

# plt.plot(epochs, loss, 'r', label='Training Loss')
# plt.plot(epochs, val_loss, 'b', label='Validation Loss')
# plt.title('Training and validation loss')
# plt.legend()

# plt.show()

In [None]:
# loss,acc = model.evaluate(X_test, y_test, batch_size = batch_size)
# print("Validation Accuracy: %.4f" % (acc))
# print("Loss: %.4f"%(loss))

In [None]:
# y_pred = model.predict(X_pred_pad, verbose=1)
# y_pred = np.where(y_pred > 0.5, 1, 0)
# y_pred = y_pred.squeeze()
# y_pred

In [None]:
# sample_glove = pd.read_csv('../input/nnfl-nlp-lab-2/lab2_sample.csv')
# sample_glove['target'] = y_pred
# sample_glove.to_csv('submission_glove.csv', index=False)
# sample_glove.head()
# model.save_weights('model_glove.h5')