In [1]:
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/Predicting Effective Arguments/Datasets/train.csv')
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [4]:
nltk.download('stopwords')
sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def clean_data(sentence):
  tokens = simple_preprocess(sentence)
  tokens = [token for token in tokens if not token in sw]
  return ' '.join(tokens)
  #return tokens

In [6]:
dataset['Tokens'] = dataset['discourse_text'].apply(clean_data)
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Tokens
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...


In [7]:
dataset.drop(['discourse_id', 'essay_id'], axis=1, inplace=True)

In [8]:
dataset.head()

Unnamed: 0,discourse_text,discourse_type,discourse_effectiveness,Tokens
0,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...
1,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...
2,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...
3,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...
4,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...


In [9]:
max_count = max(len(sentence.split(' ')) for sentence in dataset['Tokens'])
max_count

373

# Encoding

In [10]:
# import numpy as np
# from tensorflow.keras.utils import to_categorical, plot_model
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.layers import LSTM, Bidirectional
# from tensorflow.keras.layers import Embedding

In [11]:
# # integer encode sequences of words
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(dataset['Tokens'])
# #sequences = tokenizer.texts_to_sequences(dataset['Tokens'])

In [12]:
# def encode(sentence):
#   seq = tokenizer.texts_to_sequences([sentence])[0]
#   encoded = pad_sequences([seq], maxlen=max_count, truncating='pre')
#   return encoded[0]

In [13]:
# dataset['Encoded'] = dataset['Tokens'].apply(encode)
# dataset.head()

In [14]:
ohe = OneHotEncoder()

In [15]:
discourse_type_ohe = pd.DataFrame(ohe.fit_transform(dataset[['discourse_type']]).toarray())
discourse_type_ohe.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
dataset = pd.concat([dataset['Tokens'], discourse_type_ohe, dataset['discourse_effectiveness']], axis=1)
dataset.head()

Unnamed: 0,Tokens,0,1,2,3,4,5,6,discourse_effectiveness
0,hi isaac going writing face mars natural landf...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Adequate
1,perspective think face natural landform dont t...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Adequate
2,think face natural landform life mars descover...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Adequate
3,life mars would know reason think natural land...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Adequate
4,people thought face formed alieans thought lif...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Adequate


# LSTM

In [17]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import tensorflow

In [18]:
dataset.head()

Unnamed: 0,Tokens,0,1,2,3,4,5,6,discourse_effectiveness
0,hi isaac going writing face mars natural landf...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Adequate
1,perspective think face natural landform dont t...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Adequate
2,think face natural landform life mars descover...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Adequate
3,life mars would know reason think natural land...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Adequate
4,people thought face formed alieans thought lif...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Adequate


In [19]:
voc_size = 16000
embedding_vector_features = 500

In [20]:
# One hot representation of words
onehot_repr=[one_hot(sentence,voc_size) for sentence in dataset['Tokens']] 
#onehot_repr

In [21]:
# Embedding the sentences
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=max_count)
print(embedded_docs)

[[    0     0     0 ...  1762 10715   752]
 [    0     0     0 ...  2067 10715   752]
 [    0     0     0 ...  1762  4670 10857]
 ...
 [    0     0     0 ... 14752 15858  2803]
 [    0     0     0 ...  7996 14851 11283]
 [    0     0     0 ...  2949  1345  3502]]


In [22]:
discourse_type_ohe = np.array(discourse_type_ohe)
discourse_type_ohe

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [23]:
embedded_docs.shape

(36765, 373)

In [24]:
embedded_docs = np.hstack((embedded_docs, discourse_type_ohe))
embedded_docs

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
embedded_docs.shape

(36765, 380)

In [26]:
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=embedded_docs.shape[1]))
model.add(Dropout(0.3))

model.add(LSTM(512, activation='relu', return_sequences=True))
model.add(Dropout(0.3))

model.add(LSTM(256, activation='relu', return_sequences=True))
model.add(Dropout(0.3))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(128,activation='softmax'))
model.add(Dense(3,activation='softmax'))
model.compile(loss= 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [27]:
my_callbacks_lstm = [
    EarlyStopping(patience=2),
    ModelCheckpoint(filepath='/content/drive/MyDrive/Predicting Effective Arguments/LSTM_WordEmbeddings/Callbacks/model.{epoch:02d}-{val_loss:.2f}.h5'),
    TensorBoard(log_dir='/content/drive/MyDrive/Predicting Effective Arguments/LSTM_WordEmbeddings/logs'),
]

In [28]:
le = LabelEncoder()
dataset['le_discourse_effectiveness'] = le.fit_transform(dataset['discourse_effectiveness'])
dataset.head()

Unnamed: 0,Tokens,0,1,2,3,4,5,6,discourse_effectiveness,le_discourse_effectiveness
0,hi isaac going writing face mars natural landf...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Adequate,0
1,perspective think face natural landform dont t...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Adequate,0
2,think face natural landform life mars descover...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Adequate,0
3,life mars would know reason think natural land...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Adequate,0
4,people thought face formed alieans thought lif...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Adequate,0


In [29]:
ohe_discourse_effectiveness = pd.get_dummies(dataset['discourse_effectiveness'])
ohe_discourse_effectiveness

Unnamed: 0,Adequate,Effective,Ineffective
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
36760,1,0,0
36761,1,0,0
36762,1,0,0
36763,0,0,1


In [30]:
X_LSTM = embedded_docs.copy()
y_LSTM = ohe_discourse_effectiveness.copy()
#y_LSTM = dataset['le_discourse_effectiveness']

In [31]:
X_LSTM.shape, y_LSTM.shape

((36765, 380), (36765, 3))

In [32]:
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_LSTM, y_LSTM, test_size=.33, random_state=42, shuffle=True)

In [33]:
model.fit(X_train_lstm, y_train_lstm, validation_data=(X_test_lstm, y_test_lstm), epochs=100, batch_size=512, callbacks=my_callbacks_lstm)

Epoch 1/100
Epoch 2/100


<keras.callbacks.History at 0x7f54c033b750>