<a href="https://colab.research.google.com/github/SheethalVelutharambath/citation_intent/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer



from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import re

# Record the TensorFlow version in the cell output so a run can be tied to
# the runtime that produced it.
print("Tensorflow Version",tf.__version__)

In [None]:
# Every split is a tab-separated file read with the same four column names;
# because `names=` is supplied, the first row of each file is treated as data.
tsv_columns = ["id", "explicit", "text", "label"]

train = pd.read_csv("../input/cite-dataset/tsv/train.tsv", sep='\t', names=tsv_columns)
dev = pd.read_csv("../input/cite-dataset/tsv/dev.tsv", sep='\t', names=tsv_columns)
test = pd.read_csv("../input/cite-dataset/tsv/test.tsv", sep='\t', names=tsv_columns)

In [None]:
# Keep only the raw text and its label for modelling.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' makes this cell safe to execute more than once: a second run
# no longer raises KeyError because the columns have already been removed.
# NOTE(review): `dev` is not trimmed here; only `train` and `test` are.
train = train.drop(columns=['id', 'explicit'], errors='ignore')
test = test.drop(columns=['id', 'explicit'], errors='ignore')

In [None]:
# Integer class id for each label string; ids follow the order listed here.
lab_to_sentiment = {
    name: class_id
    for class_id, name in enumerate(("background", "method", "result"))
}

In [None]:
def label_decoder(label):
    """Return the integer class id that `lab_to_sentiment` assigns to `label`.

    Raises:
        KeyError: if `label` is not a key of `lab_to_sentiment`.
    """
    class_id = lab_to_sentiment[label]
    return class_id


# Store the integer-encoded training labels in a new column, then preview.
train["label1"] = train["label"].apply(label_decoder)
train.head()

In [None]:
# Same integer encoding as the training split, applied to the test labels.
test["label1"] = test["label"].apply(label_decoder)
test.head()

In [None]:
# A set gives constant-time membership tests; `preprocess` checks every token
# against this collection.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Raw string so the `\S` escapes reach the regex engine unchanged instead of
# being treated as (invalid) Python string escapes, which emits a warning on
# recent Python versions.
# Alternatives, in order: @mentions, http/https URLs, a short "http:"/"htt:"
# prefix followed by one character, and any run of non-alphanumeric characters.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [None]:
def preprocess(text, stem=False):
  """Lower-case, clean and stop-word-filter a piece of text.

  The input is converted with `str()`, lower-cased, every match of
  `text_cleaning_re` is replaced by a single space, and the result is split
  on whitespace. Tokens found in `stop_words` are dropped.

  Args:
    text: value to clean; non-string values are stringified first.
    stem: when true, each surviving token is passed through `stemmer.stem`.

  Returns:
    The surviving tokens joined by single spaces.
  """
  cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  kept = (word for word in cleaned.split() if word not in stop_words)
  if stem:
    kept = (stemmer.stem(word) for word in kept)
  return " ".join(kept)

In [None]:
# Cleaned training text goes into a new column; stemming stays at its default (off).
train["text1"] = train["text"].apply(preprocess)

In [None]:
# Cleaned test text goes into a new column; stemming stays at its default (off).
test["text1"] = test["text"].apply(preprocess)

In [None]:
# Length (in tokens) that every sequence is padded/truncated to; also used as
# the model's input length.
MAX_SEQUENCE_LENGTH = 100

In [None]:
from keras.preprocessing.text import Tokenizer

# The vocabulary is learned from the cleaned training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train.text1)

word_index = tokenizer.word_index
# One extra row beyond the number of known words, so the largest word index
# is still a valid row of a (vocab_size, dim) embedding table.
vocab_size = len(word_index) + 1
print("Vocabulary Size :", vocab_size)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Convert cleaned text to integer id sequences with the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train.text1)
test_sequences = tokenizer.texts_to_sequences(test.text1)

# Bring every sequence to the same fixed length.
x_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print("Training X Shape:",x_train.shape)
print("Testing X Shape:",x_test.shape)

In [None]:
# Integer-encoded labels for each split.
y_train, y_test = train['label1'], test['label1']

In [None]:
# Inspect the integer-encoded training labels.
y_train


In [None]:
# Container for the one-hot encoded training labels.
y_train_list = []

In [None]:
# Start from an empty list inside this cell so that re-running it rebuilds the
# labels from scratch instead of appending a second copy (or failing once
# `y_train_list` has been converted to an array).
y_train_list = []
for class_id in y_train:
    one_hot = np.zeros(3)  # one slot per class
    one_hot[class_id] = 1
    y_train_list.append(list(one_hot))

In [None]:
# One-hot encode the test labels: row `class_id` of the 3x3 identity matrix is
# a length-3 vector with a single 1 at position `class_id`.
y_test_list = [list(np.eye(3)[class_id]) for class_id in y_test]

In [None]:
# Convert the list of one-hot rows into a NumPy array (the name is kept even
# though the value is no longer a list).
y_train_list = np.array(y_train_list)

In [None]:
# Convert the list of one-hot rows into a NumPy array (the name is kept even
# though the value is no longer a list).
y_test_list = np.array(y_test_list)

In [None]:
#encoder = LabelEncoder()
#encoder.fit(y_train_list)

#y_train = encoder.transform(y_train_list)
#y_test = encoder.transform(y_test_list)

# y_train = y_train.reshape(-1,1)
# y_test = y_test.reshape(-1,1)

#print("y_train shape:", y_train.shape)
#print("y_test shape:", y_test.shape)

In [None]:
# Fetch and unpack the GloVe 6B vectors into the current working directory.
# -nc: skip the download when glove.6B.zip is already present.
# -n : never overwrite files that were already extracted, which also avoids
#      unzip's interactive overwrite prompt when this cell is re-run.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip
!unzip -n glove.6B.zip

In [None]:
# Path to the 300-dimensional GloVe text file.
GLOVE_EMB = '/kaggle/working/glove.6B.300d.txt'
# Must equal the vector size stored in GLOVE_EMB.
EMBEDDING_DIM = 300
# Initial learning rate passed to the optimizer.
LR = 1e-3
BATCH_SIZE = 1024
# NOTE(review): the fit calls in this notebook pass literal epoch counts and
# do not read this constant.
EPOCHS = 10
# Checkpoint location. The previous value began with '...', which is a literal
# directory name rather than a parent-directory reference; it now points into
# the same directory as GLOVE_EMB.
# NOTE(review): confirm this is the intended output location.
MODEL_PATH = '/kaggle/working/best_model.hdf5'

In [None]:
# Maps each word in the GloVe file to its float32 vector.
embeddings_index = {}

# Each line holds a word followed by its vector components, separated by
# whitespace. The encoding is given explicitly so parsing does not depend on
# the platform default, and the context manager closes the file even if a
# line fails to parse.
with open(GLOVE_EMB, encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found %s word vectors.' %len(embeddings_index))

In [None]:
# Row r holds the pretrained vector of the word whose tokenizer index is r.
# Words that have no pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix[row] = vector

In [None]:
# Embedding lookup initialised from the pretrained matrix and kept frozen
# (trainable=False) during training.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Integer token ids in, three-way softmax out.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_sequences = embedding_layer(sequence_input)

# Dropout applied to the embedded sequence before the recurrent layer.
dropped = SpatialDropout1D(0.2)(embedding_sequences)

# Bidirectional LSTM with 100 units per direction.
# NOTE(review): a non-zero recurrent_dropout may prevent the fused GPU LSTM
# kernel from being used - check the Keras LSTM docs if training is slow.
bilstm = Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
encoded = bilstm(dropped)

outputs = Dense(3, activation='softmax')(encoded)
model = tf.keras.Model(inputs=sequence_input, outputs=outputs)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Multi-class setup: one-hot targets with categorical cross-entropy.
model.compile(optimizer=Adam(learning_rate=LR), loss='categorical_crossentropy',
              metrics=['accuracy'])

# Multiply the learning rate by `factor` when val_loss stops improving.
# min_lr is the floor for the reduced rate and has to be below the starting
# rate (LR = 1e-3): the previous floor of 0.01 was above it, so the callback
# could never lower the rate.
# NOTE: this assignment rebinds the imported class name to a callback
# instance; the name is kept because the training cell passes
# `ReduceLROnPlateau` in its callbacks list.
ReduceLROnPlateau = ReduceLROnPlateau(factor=0.1,
                                     min_lr=1e-5,
                                     monitor='val_loss',
                                     verbose=1)

In [None]:
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the documented replacement. An empty list is falsy, so this selects the
# CPU message when no GPU is visible to TensorFlow.
print("Training on GPU..." if tf.config.list_physical_devices('GPU') else "Training on CPU...")

In [None]:
# Sanity check of the one-hot training label array's shape before fitting.
y_train_list.shape

In [None]:
# First training run, using the learning-rate callback.
# NOTE(review): validation uses the test split, the same data the final
# evaluate call scores, while the loaded `dev` split is not used in the
# visible cells. The epoch count is a literal here, not the EPOCHS constant.
history = model.fit(
    x=x_train,
    y=y_train_list,
    batch_size=BATCH_SIZE,
    epochs=15,
    validation_data=(x_test, y_test_list),
    callbacks=[ReduceLROnPlateau],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop once val_loss has gone 10 consecutive epochs without improving.
earlystopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    verbose=1,
)

# This calls fit on the same `model` object as the earlier training cell, so
# when cells run in order it continues from those weights.
# NOTE(review): validation again uses the test split.
History = model.fit(
    x=x_train,
    y=y_train_list,
    batch_size=BATCH_SIZE,
    epochs=50,
    verbose=2,
    validation_data=(x_test, y_test_list),
    callbacks=[earlystopping],
)

In [None]:
# Score the trained model on the test split without progress output.
# NOTE(review): the model is compiled with a loss and an accuracy metric, so
# despite the name this presumably holds [loss, accuracy] - confirm.
test_acc = model.evaluate(x=x_test, y=y_test_list, verbose=0)

In [None]:
# Display the value returned by model.evaluate for the test split.
test_acc