In [6]:
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the training CSV and preview its first rows; the columns used later in this
# notebook are 'discourse_text' (input text) and 'discourse_effectiveness' (label).
dataset = pd.read_csv('../Datasets/train.csv')
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [7]:
# English stop-word list from NLTK, consulted by clean_data() to drop filler words.
# NOTE(review): needs the NLTK 'stopwords' corpus to be available locally
# (nltk.download('stopwords')) — confirm for fresh environments.
sw = stopwords.words('english')

In [17]:
def clean_data(sentence):
  """Return `sentence` as a space-joined string of its non-stop-word tokens.

  The text is tokenised with gensim's `simple_preprocess`, every token that
  appears in the module-level stop-word list `sw` is discarded, and the
  remaining tokens are glued back together with single spaces.

  Args:
    sentence: Raw text of one discourse element.

  Returns:
    A single string containing the kept tokens separated by one space
    (an empty string when nothing survives the filtering).
  """
  return ' '.join(word for word in simple_preprocess(sentence) if word not in sw)

In [18]:
# Store the cleaned text of every discourse in a new 'Tokens' column.
# Despite the name, each cell holds one space-joined string (see clean_data), not a list.
dataset['Tokens'] = dataset['discourse_text'].apply(clean_data)
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Tokens
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...


In [32]:
# Longest cleaned text, measured in space-separated tokens; reused later as the
# common sequence length when padding.
max_count = max(map(lambda text: len(text.split(' ')), dataset['Tokens']))
max_count

373

# Encoding

In [115]:
import numpy as np
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Embedding

In [116]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataset['Tokens'])
#sequences = tokenizer.texts_to_sequences(dataset['Tokens'])

In [117]:
def encode(sentence):
  """Convert one cleaned text into a fixed-length vector of word indices.

  The text is mapped to integer ids with the module-level `tokenizer` and then
  padded (or truncated from the front) to exactly `max_count` entries.

  Args:
    sentence: A space-joined token string, as produced by clean_data.

  Returns:
    A 1-D integer array of length `max_count`.
  """
  # texts_to_sequences already yields a one-element batch, which is exactly
  # the list-of-sequences shape pad_sequences expects.
  padded = pad_sequences(
      tokenizer.texts_to_sequences([sentence]),
      maxlen=max_count,
      truncating='pre',
  )
  return padded[0]

In [118]:
# Attach the fixed-length integer vector of every text as an 'Encoded' column.
# NOTE(review): the model cells visible further down train on `embedded_docs`,
# not on this column — confirm whether 'Encoded' is still needed.
dataset['Encoded'] = dataset['Tokens'].apply(encode)
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Tokens,Encoded
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# LSTM

In [126]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import tensorflow

In [127]:
# Re-inspect the frame before building the model inputs.
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Tokens,Encoded
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [128]:
# Size of the hashed vocabulary: used as the hashing range for word ids and as
# the input dimension of the Embedding layer.
voc_size = 16000
# Length of the vector the Embedding layer learns for each word id.
embedding_vector_features = 500

In [129]:
# One hot representation of words
onehot_repr=[one_hot(sentence,voc_size) for sentence in dataset['Tokens']] 
#onehot_repr

In [130]:
# Embedding the sentences
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=max_count)
print(embedded_docs)

[[    0     0     0 ...   111 11291  4772]
 [    0     0     0 ... 10718 11291  4772]
 [    0     0     0 ...   111 13269 15726]
 ...
 [    0     0     0 ... 11530   562  5412]
 [    0     0     0 ...  7387 15647 15702]
 [    0     0     0 ...  5381 13440  9103]]


In [145]:
# Number of target classes, read from the label column so the output layer
# always matches the data.
num_classes = int(dataset['discourse_effectiveness'].nunique())

# Stacked bidirectional LSTM classifier over padded word-id sequences.
model = Sequential()
# Learn a dense vector per hashed word id.
model.add(Embedding(voc_size, embedding_vector_features, input_length=embedded_docs.shape[1]))
model.add(Dropout(0.3))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(200, kernel_initializer='he_uniform', return_sequences=True)))
model.add(Bidirectional(LSTM(100, kernel_initializer='he_uniform')))

# One softmax unit per class. A single softmax unit always outputs 1.0, so the
# previous Dense(1) head could not discriminate between classes.
model.add(Dense(num_classes, activation='softmax'))
# The targets are integer class ids (not one-hot vectors), which is what the
# sparse variant of categorical cross-entropy expects.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [146]:
# Callbacks handed to model.fit for the LSTM run.
my_callbacks_lstm = [
    # Stop training once the monitored quantity (Keras default: val_loss) has
    # failed to improve for 2 consecutive epochs.
    EarlyStopping(patience=2),
    # Save a checkpoint whose file name embeds the epoch number and validation
    # loss, so validation data must be supplied to fit().
    ModelCheckpoint(filepath='../LSTM_WordEmbeddings/Callbacks/model.{epoch:02d}-{val_loss:.2f}.h5'),
    # Write training logs for inspection in TensorBoard.
    TensorBoard(log_dir='../LSTM_WordEmbeddings/logs'),
]

In [147]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [148]:
# Convert the string effectiveness labels into integer class ids in a new column.
# LabelEncoder numbers the classes in sorted label order; the mapping back to
# the original strings is available as le.classes_.
le = LabelEncoder()
dataset['le_discourse_effectiveness'] = le.fit_transform(dataset['discourse_effectiveness'])
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Tokens,Encoded,le_discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [149]:
# Model inputs: an independent copy of the padded word-id matrix.
X_LSTM = embedded_docs.copy()
# Targets: integer class ids produced by the LabelEncoder (1-D, one entry per row).
y_LSTM = np.array(dataset['le_discourse_effectiveness'])

In [150]:
# Sanity check: features should be (n_samples, max_count) and labels (n_samples,).
X_LSTM.shape, y_LSTM.shape

((36765, 373), (36765,))

In [151]:
# Hold out 33% of the rows; the fixed random_state makes the shuffled split repeatable.
# The held-out part is later passed to fit() as validation data.
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_LSTM, y_LSTM, test_size=.33, random_state=42, shuffle=True)

In [152]:
# Train for at most 100 epochs in batches of 1024, validating on the held-out split
# after each epoch; the callbacks may end the run earlier and write checkpoints/logs.
model.fit(X_train_lstm, y_train_lstm, validation_data=(X_test_lstm, y_test_lstm), epochs=100, batch_size=1024, callbacks=my_callbacks_lstm)

Epoch 1/100


KeyboardInterrupt: 