In [None]:
# Link to competition: https://www.kaggle.com/c/nnfl-lab-3
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np 
import pandas as pd 
import os
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import re

# Notebook-wide pandas display settings: do not truncate cell contents
# (max_colwidth=None) and show up to 100 columns when rendering a frame.
for option, value in (
    ('display.max_colwidth', None),
    ('display.max_columns', 100),
):
    pd.set_option(option, value)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# tf.random.set_seed(1)

In [None]:
# Load the training split and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../input/nnfl-lab-3/lab3_train.csv')
df.head(n=5)

In [None]:
# Drop the first three and the last two characters of every text entry.
# (Presumably these are wrapper characters left over from how the CSV was
# produced -- verify against the raw data.)
#
# This is done as a single vectorised column assignment. The previous
# per-row `df.text[i] = ...` form is chained-indexing assignment: it triggers
# SettingWithCopyWarning and, with pandas copy-on-write enabled, silently
# leaves `df` unchanged.
df['text'] = df['text'].str[3:-2]
df['text'][0]

In [None]:
# Number of training documents.
len(df['text'])

In [None]:
# --- Vocabulary / embedding -------------------------------------------------
# vocab_size caps the tokenizer vocabulary and sizes the embedding table;
# embedding_dim is the width of each embedding vector.
vocab_size, embedding_dim = 50000, 100
oov_tok = "<OOV>"  # placeholder token for out-of-vocabulary words

# --- Sequence shaping -------------------------------------------------------
# Every document is padded/truncated to max_length token ids.
max_length = 3000
trunc_type = padding_type = 'post'

# --- Training ---------------------------------------------------------------
batch_size = 32

In [None]:
# English stop-word list from NLTK; used below to filter tokens out of the
# raw text before tokenisation.
# NOTE(review): this needs the NLTK `stopwords` corpus to be available in the
# environment -- confirm it is pre-installed where this notebook runs.
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
# Remove stop words from every training document.
#
# The stop-word list is converted to a set once so each membership test is a
# hash lookup instead of a scan over the whole list for every token.
# Matching is case-sensitive and operates on raw whitespace-separated tokens,
# so a token with attached punctuation or different casing is kept.
stop_words = set(stop)
df['text_without_stopwords'] = df['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

In [None]:
# Lengths of the raw documents, longest first. `len` is applied to each text
# entry directly, so these are character counts rather than token counts.
f = sorted((len(t) for t in df.text), reverse=True)
f

In [None]:
# Same length listing as for the raw text, but after stop-word removal
# (again per-entry `len`, longest first).
f = sorted((len(t) for t in df.text_without_stopwords), reverse=True)
f

In [None]:
# Fit the vocabulary on the stop-word-filtered training text, then convert
# each document to a fixed-length sequence of integer token ids.
train_texts = df['text_without_stopwords'].values

tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=' ',
)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

X = tokenizer.texts_to_sequences(train_texts)
# NOTE(review): `trunc_type` is defined in the config cell but is not passed
# here, so pad_sequences uses its own default truncation side -- confirm which
# side is intended, and keep it identical for the test-set padding.
X_pad = pad_sequences(X, maxlen=max_length, padding=padding_type)

In [None]:
# Load the competition test split and run it through the same preprocessing
# as the training text: stop-word removal, tokenisation with the already
# fitted tokenizer, and padding to max_length.
df_sub = pd.read_csv('../input/nnfl-lab-3/lab3_test.csv')
# NOTE(review): the training text has its first 3 / last 2 characters removed
# before stop-word filtering; no such step is applied to the test text here.
# Confirm whether the test CSV needs the same trimming.

# Set lookup instead of scanning the stop-word list for every token.
stop_words = set(stop)
df_sub['text_without_stopwords'] = df_sub['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

X_pred = tokenizer.texts_to_sequences(df_sub.text_without_stopwords.values)
# NOTE(review): `trunc_type` is not passed, so pad_sequences uses its default
# truncation side -- this must stay identical to the training-set padding.
X_pred_pad = pad_sequences(X_pred, padding=padding_type, maxlen=max_length)

In [None]:
# Hold out 10% of the padded training sequences for validation; the split is
# reproducible thanks to the fixed random_state.
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.1,
    random_state=42,
)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

In [None]:
# Text classifier: token embedding -> two strided Conv1D + max-pool stages to
# shorten the sequence -> bidirectional LSTM -> average over time -> dense
# head ending in a 6-way softmax.
# NOTE(review): the 6 output units presumably match the number of distinct
# values in df['class'] -- verify against the data.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # The third positional argument of Conv1D is `strides`; spelled out here
    # so it is not mistaken for a dilation rate or padding setting.
    tf.keras.layers.Conv1D(filters=256, kernel_size=7, strides=3),
    tf.keras.layers.MaxPooling1D(pool_size=5),
    tf.keras.layers.Conv1D(filters=128, kernel_size=5, strides=3),
    tf.keras.layers.MaxPooling1D(pool_size=5),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True)),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()


In [None]:
# Integer class labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for two epochs, tracking loss/accuracy on the held-out split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
    batch_size=batch_size,
)

In [None]:
import matplotlib.pyplot as plt
# Per-epoch metrics recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Accuracy curves. Each figure gets its own explicit Axes (rather than the
# implicit pyplot "current figure") and labelled axes so it can be read on
# its own.
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'r', label='Training accuracy')
ax_acc.plot(epochs, val_acc, 'b', label='Validation accuracy')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Loss curves.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'r', label='Training Loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation Loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

In [None]:
# Final loss/accuracy on the held-out split.
# Note: this rebinds `acc`, which the plotting cell uses for the per-epoch
# accuracy list.
score, acc = model.evaluate(x=X_test, y=y_test, batch_size=batch_size)
print("Validation Accuracy: %.4f" % acc)

In [None]:
# Predicted class index for every test document.
# `Sequential.predict_classes` has been removed from tf.keras; for a softmax
# output the equivalent is the argmax over the predicted class probabilities,
# which works on both old and current TensorFlow versions.
y_pred = np.argmax(model.predict(X_pred_pad, verbose=1), axis=-1)
y_pred

In [None]:
# Spot-check the first prediction.
y_pred[0]

In [None]:
# Start from the provided submission template and fill its `class` column
# with the model's predictions.
sample = pd.read_csv('../input/nnfl-lab-3/sample_submission.csv').assign(
    **{'class': y_pred}
)
sample

In [None]:
# Write the submission file (without the index column) and preview it.
sample.to_csv(path_or_buf='submission.csv', index=False)
sample.head(n=5)


In [None]:
# Persist only the trained weights (not the architecture) to the working dir.
model.save_weights(filepath='model.h5')

In [None]:
from IPython.display import HTML
import pandas as pd 
import numpy as np
import base64 
def create_download_link(df, title="Download CSV file", filename="data.csv"):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv(index=False)``, base64-encoded and
    embedded in a ``data:text/csv`` URI, so the link works without writing
    anything to disk.

    Parameters
    ----------
    df : object exposing ``to_csv(index=False)`` (e.g. a pandas DataFrame)
        Data to offer for download.
    title : str
        Link text. HTML-escaped before insertion, so markup characters are
        shown literally rather than interpreted.
    filename : str
        Suggested file name for the download. HTML-escaped so that quotes in
        the name cannot terminate the ``download`` attribute.

    Returns
    -------
    IPython.display.HTML
        Renderable object wrapping the ``<a>`` element.
    """
    # Function-scope import: the local name `html` is avoided on purpose so it
    # cannot be confused with the stdlib module of the same name.
    from html import escape

    csv_text = df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode()).decode()

    link = (
        f'<a download="{escape(filename)}" '
        f'href="data:text/csv;base64,{payload}" target="_blank">'
        f'{escape(title)}</a>'
    )
    return HTML(link)
# Render a download link for the submission frame (default title/filename).
create_download_link(df=sample)