## **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from keras import layers
from keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

## Import Data From Kaggle

In [None]:
# Upload files from the local machine into the Colab session; the next cell
# expects a kaggle.json to be present in the working directory afterwards.
from google.colab import files
files.upload()

In [None]:
# Put the uploaded Kaggle API token under ~/.kaggle with owner-only
# permissions, then download the competition files.
# -p keeps the cell re-runnable when ~/.kaggle already exists.
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c spooky-author-identification

## **Constant Variables**

In [None]:
# Sequence length used both as `maxlen` when padding the token sequences and
# as `input_length` of every Embedding layer below.
MAXLEN = 192
# Column names handed to the tokenization and label helpers.
TEXTCOL = "text"
TARGETCOL = "author"

# Despite the "_dir" suffix these are file paths; get_data passes them
# straight to pd.read_csv.
train_dir = "train.zip"
test_dir = "test.zip"

## **Get Train and Test Data**

In [None]:
def get_data(train_dict, test_dict):
  """Load the train and test tables and pull out the test ids.

  Args:
    train_dict: Path or buffer handed to ``pd.read_csv`` for the train data.
    test_dict: Path or buffer handed to ``pd.read_csv`` for the test data;
      the resulting frame must expose an ``id`` column.

  Returns:
    Tuple ``(train, test, testdex)`` where ``testdex`` is ``test.id``.
  """
  # Read in the same order as the arguments: train first, then test.
  train_frame, test_frame = [
      pd.read_csv(source) for source in (train_dict, test_dict)
  ]
  return train_frame, test_frame, test_frame.id

In [None]:
# Load both tables; testdex keeps the test ids for the submission files.
train, test, testdex = get_data(train_dir, test_dir)

In [None]:
# Show the first 10 training rows as a quick sanity check.
train.head(10)

## Prepare Text Data

### Tokenization

In [None]:
def tokenize_data(train, test, TEXTCOL):
  """Turn the text column of both frames into padded integer sequences.

  The tokenizer is fitted on the training text only and then applied to
  both frames. Every sequence is padded with ``padding='post'`` to the
  module-level ``MAXLEN``.

  Args:
    train: Frame whose ``TEXTCOL`` column is used to fit the tokenizer.
    test: Frame whose ``TEXTCOL`` column is only transformed.
    TEXTCOL: Name of the text column in both frames.

  Returns:
    Tuple ``(X_train, X_test, vocab_size)``; ``vocab_size`` is the number
    of entries in the fitted ``word_index`` plus one.
  """
  pad = tf.keras.preprocessing.sequence.pad_sequences

  text_tokenizer = Tokenizer(num_words=26000)
  text_tokenizer.fit_on_texts(train[TEXTCOL])

  # NOTE(review): the +1 presumably leaves room for the padding index 0;
  # confirm against the Tokenizer's index numbering.
  vocab_size = len(text_tokenizer.word_index) + 1

  X_train, X_test = [
      pad(text_tokenizer.texts_to_sequences(frame[TEXTCOL]),
          padding='post',
          maxlen=MAXLEN)
      for frame in (train, test)
  ]

  return X_train, X_test, vocab_size

In [None]:
# Padded token sequences for both splits, plus the vocabulary size that the
# Embedding layers below use as input_dim.
X_train, X_test, vocab_size = tokenize_data(train, test, TEXTCOL)

### Prepare Labels

In [None]:
def get_labels(TARGETCOL, train):
  """One-hot encode the target column.

  Args:
    TARGETCOL: Name of the label column in ``train``.
    train: Frame holding the label column.

  Returns:
    Tuple ``(train_labels, label_mapper)``. ``label_mapper`` maps each
    distinct label to its class index, and ``train_labels`` is the result
    of ``to_categorical`` on those indices.
  """
  targets = train[TARGETCOL].values

  # Sort the class names: plain set iteration order for strings can change
  # between interpreter sessions, which would silently reshuffle the class
  # indices from one run to the next.
  label_mapper = {name: i for i, name in enumerate(sorted(set(targets)))}

  num_label = np.array([label_mapper[name] for name in targets], dtype=int)
  train_labels = to_categorical(num_label)

  return train_labels, label_mapper

In [None]:
# One-hot training targets and the label -> class-index mapping.
train_labels, label_mapper = get_labels(TARGETCOL, train)

In [None]:
# Display the label -> class-index mapping.
label_mapper

### Submission Function

In [None]:
def prepare_submission(testdex, test_results, label_mapper, model_name):
  """Write a submission CSV from a model's per-class predictions.

  The column layout is copied from ``sample_submission.zip``, read from the
  current working directory, and the result is written to
  ``<model_name>_results.csv``.

  Args:
    testdex: Ids of the test rows, in the same order as ``test_results``.
    test_results: 2-D array-like with one row per test sample and one
      column per class.
    label_mapper: Dict mapping each class name to the index of its column
      in ``test_results``.
    model_name: Prefix of the output file name.

  Returns:
    None. The submission is written to disk.
  """
  sub_cols = pd.read_csv("sample_submission.zip").columns

  # Name the prediction columns by class index rather than dict order, so
  # column i always receives the label that was encoded as i.
  class_names = sorted(label_mapper, key=label_mapper.get)
  submission = pd.DataFrame(test_results, columns=class_names)

  # Assign the ids by position; a Series would be aligned on its index.
  submission['id'] = list(testdex)

  # Reorder to the layout of the sample submission before saving.
  submission[sub_cols].to_csv(model_name + '_results.csv', index=False)

## Model With Embedding and Dense Layer

In [None]:
def build_model(vocab_size):
  """Build and compile the baseline embedding + dense classifier.

  Args:
    vocab_size: Used as ``input_dim`` of the Embedding layer.

  Returns:
    A compiled ``Sequential`` model with a 3-unit softmax output, the adam
    optimizer, categorical cross-entropy loss and an accuracy metric.
  """
  network = Sequential([
      layers.Embedding(input_dim=vocab_size,
                       output_dim=500,
                       input_length=MAXLEN),
      # Collapse the per-token embeddings into one vector per sample.
      layers.Flatten(),
      layers.Dense(1024, activation='relu'),
      layers.Dropout(0.2),
      layers.Dense(3, activation='softmax'),
  ])

  network.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
  return network

In [None]:
# Instantiate the baseline model and print its layer summary.
model = build_model(vocab_size)
model.summary()

In [None]:
# Train the baseline for 5 epochs in batches of 32, with validation_split=0.2
# reserving part of the training data for validation.
history = model.fit(X_train, train_labels,
                    epochs=5,
                    validation_split=0.2,
                    batch_size=32)

In [None]:
# Run the trained baseline on the padded test sequences.
model_test_result = model.predict(X_test)

In [None]:
# Write the baseline predictions to basic_model_results.csv.
prepare_submission(testdex, model_test_result, label_mapper, "basic_model")

## GRU Model

In [None]:
def build_GRU(vocab_size):
  """Build and compile the two-layer GRU classifier.

  Args:
    vocab_size: Used as ``input_dim`` of the Embedding layer.

  Returns:
    A compiled ``Sequential`` model: Embedding, two GRU layers that both
    return full sequences, then the Flatten/Dense/Dropout head with a
    3-unit softmax output.
  """
  stack = [
      layers.Embedding(input_dim=vocab_size,
                       output_dim=500,
                       input_length=MAXLEN),
      layers.GRU(128, return_sequences=True, activation="tanh"),
      layers.GRU(256, return_sequences=True, activation="tanh"),
      layers.Flatten(),
      layers.Dense(1024, activation='relu'),
      layers.Dropout(0.2),
      layers.Dense(3, activation='softmax'),
  ]

  gru_net = Sequential()
  for layer in stack:
    gru_net.add(layer)

  gru_net.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

  return gru_net

In [None]:
# Instantiate the GRU model and print its layer summary.
gru_model = build_GRU(vocab_size)
gru_model.summary()

In [None]:
# Same training setup as the baseline; note that `history` is overwritten.
history = gru_model.fit(X_train, train_labels,
                    epochs=5,
                    validation_split=0.2,
                    batch_size=32)

In [None]:
# Run the trained GRU model on the padded test sequences.
gru_test_result = gru_model.predict(X_test)

In [None]:
# Write the GRU predictions to gru_results.csv.
prepare_submission(testdex, gru_test_result, label_mapper, "gru")

## LSTM Model

In [None]:
def build_LSTM(vocab_size):
  """Build and compile the two-layer LSTM classifier.

  Args:
    vocab_size: Used as ``input_dim`` of the Embedding layer.

  Returns:
    A compiled ``Sequential`` model: Embedding, LSTM(128) and LSTM(256)
    that both return full sequences, then the Flatten/Dense/Dropout head
    with a 3-unit softmax output.
  """
  lstm_net = Sequential()

  lstm_net.add(layers.Embedding(input_dim=vocab_size,
                                output_dim=500,
                                input_length=MAXLEN))

  # Two stacked recurrent layers, narrower first.
  for units in (128, 256):
    lstm_net.add(layers.LSTM(units, return_sequences=True))

  # Classification head.
  lstm_net.add(layers.Flatten())
  lstm_net.add(layers.Dense(1024, activation="relu"))
  lstm_net.add(layers.Dropout(0.2))
  lstm_net.add(layers.Dense(3, activation='softmax'))

  lstm_net.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

  return lstm_net

In [None]:
# Instantiate the LSTM model and print its layer summary.
lstm_model = build_LSTM(vocab_size)
lstm_model.summary()

In [None]:
# Same training setup as the other models; `history` is overwritten again.
history = lstm_model.fit(X_train, 
                         train_labels, 
                         epochs=5, 
                         validation_split=0.2, 
                         batch_size=32)

In [None]:
# Run the trained LSTM model on the padded test sequences.
lstm_test_result = lstm_model.predict(X_test)

In [None]:
# Write the LSTM predictions to lstm_results.csv.
prepare_submission(testdex, lstm_test_result, label_mapper, "lstm")

## Simple RNN Model

In [None]:
def build_simple_RNN(vocab_size):
  """Build and compile the two-layer SimpleRNN classifier.

  Args:
    vocab_size: Used as ``input_dim`` of the Embedding layer.

  Returns:
    A compiled ``Sequential`` model: Embedding, SimpleRNN(128) and
    SimpleRNN(256) with ``dropout=0.2`` that both return full sequences,
    then the Flatten/Dense/Dropout head with a 3-unit softmax output.
  """
  embedding = layers.Embedding(input_dim=vocab_size,
                               output_dim=500,
                               input_length=MAXLEN)
  recurrent = [
      layers.SimpleRNN(units, dropout=0.2, return_sequences=True)
      for units in (128, 256)
  ]
  head = [
      layers.Flatten(),
      layers.Dense(1024, activation="relu"),
      layers.Dropout(0.2),
      layers.Dense(3, activation='softmax'),
  ]

  rnn_net = Sequential()
  for layer in [embedding, *recurrent, *head]:
    rnn_net.add(layer)

  rnn_net.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

  return rnn_net

In [None]:
# Instantiate the SimpleRNN model and print its layer summary.
rnn_model = build_simple_RNN(vocab_size)
rnn_model.summary()

In [None]:
# Same training setup as the other models; `history` is overwritten again.
history = rnn_model.fit(X_train, 
                        train_labels, 
                        epochs=5, 
                        validation_split=0.2, 
                        batch_size=32)

In [None]:
# Run the trained SimpleRNN model on the padded test sequences.
rnn_test_result = rnn_model.predict(X_test)

In [None]:
# Write the SimpleRNN predictions to rnn_results.csv.
prepare_submission(testdex, rnn_test_result, label_mapper, "rnn")