In [1]:
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

import tensorflow

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/Predicting Effective Arguments/Datasets/train.csv')
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [4]:
nltk.download('stopwords')
sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def clean_data(sentence):
  tokens = simple_preprocess(sentence)
  tokens = [token for token in tokens if not token in sw]
  return ' '.join(tokens)
  #return tokens

In [6]:
dataset['Tokens'] = dataset['discourse_text'].apply(clean_data)
dataset.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,Tokens
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...


In [7]:
dataset.drop(['discourse_id', 'essay_id'], axis=1, inplace=True)

In [8]:
dataset.head()

Unnamed: 0,discourse_text,discourse_type,discourse_effectiveness,Tokens
0,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,hi isaac going writing face mars natural landf...
1,"On my perspective, I think that the face is a ...",Position,Adequate,perspective think face natural landform dont t...
2,I think that the face is a natural landform be...,Claim,Adequate,think face natural landform life mars descover...
3,"If life was on Mars, we would know by now. The...",Evidence,Adequate,life mars would know reason think natural land...
4,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought face formed alieans thought lif...


In [9]:
max_count = max(len(sentence.split(' ')) for sentence in dataset['Tokens'])
max_count

373

# Encoding

In [10]:
# import numpy as np
# from tensorflow.keras.utils import to_categorical, plot_model
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.layers import LSTM, Bidirectional
# from tensorflow.keras.layers import Embedding

In [11]:
# # integer encode sequences of words
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(dataset['Tokens'])
# #sequences = tokenizer.texts_to_sequences(dataset['Tokens'])

In [12]:
# def encode(sentence):
#   seq = tokenizer.texts_to_sequences([sentence])[0]
#   encoded = pad_sequences([seq], maxlen=max_count, truncating='pre')
#   return encoded[0]

In [13]:
# dataset['Encoded'] = dataset['Tokens'].apply(encode)
# dataset.head()

In [14]:
ohe = OneHotEncoder()

In [15]:
discourse_type_ohe = pd.DataFrame(ohe.fit_transform(dataset[['discourse_type']]).toarray())
discourse_type_ohe.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
dataset = pd.concat([dataset['Tokens'], discourse_type_ohe, dataset['discourse_effectiveness']], axis=1)
dataset.head()

Unnamed: 0,Tokens,0,1,2,3,4,5,6,discourse_effectiveness
0,hi isaac going writing face mars natural landf...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Adequate
1,perspective think face natural landform dont t...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Adequate
2,think face natural landform life mars descover...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Adequate
3,life mars would know reason think natural land...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Adequate
4,people thought face formed alieans thought lif...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Adequate


# LSTM

In [17]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import tensorflow

In [18]:
dataset.head()

Unnamed: 0,Tokens,0,1,2,3,4,5,6,discourse_effectiveness
0,hi isaac going writing face mars natural landf...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Adequate
1,perspective think face natural landform dont t...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Adequate
2,think face natural landform life mars descover...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Adequate
3,life mars would know reason think natural land...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Adequate
4,people thought face formed alieans thought lif...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Adequate


In [19]:
voc_size = 16000
embedding_vector_features = 500

In [20]:
# One hot representation of words
onehot_repr=[one_hot(sentence,voc_size) for sentence in dataset['Tokens']] 
#onehot_repr

In [21]:
onehot_repr

[[1256,
  8484,
  10960,
  919,
  3170,
  12489,
  9459,
  13415,
  2729,
  12489,
  5699,
  3642,
  10326,
  14007,
  13726,
  12489,
  3170,
  11026,
  13747,
  10326,
  3637,
  13415,
  7377,
  2729,
  12489,
  9459,
  13415],
 [9573,
  13628,
  3170,
  9459,
  13415,
  8823,
  13628,
  2729,
  12489,
  9265,
  4260,
  8697,
  13628,
  9459,
  13415],
 [13628, 3170, 9459, 13415, 2729, 12489, 9771, 12279],
 [2729,
  12489,
  6872,
  3637,
  10330,
  13628,
  9459,
  13415,
  15890,
  5342,
  12489,
  15910,
  7654,
  6093,
  307,
  10415,
  3250,
  4478,
  10600,
  6484,
  3250,
  3637,
  9459,
  13415,
  3727,
  6484,
  9522,
  9459,
  13415],
 [3773, 8434, 3170, 9415, 6830, 8434, 2729, 12489],
 [2265, 8421, 2729, 12489, 4056, 13628, 2729, 12489],
 [307,
  10415,
  2516,
  12489,
  10857,
  9565,
  12115,
  10600,
  10931,
  3237,
  10225,
  15243,
  14007,
  13726,
  12489,
  14276,
  5602,
  3170,
  9459,
  13415],
 [1867,
  8434,
  5699,
  6830,
  2685,
  2265,
  8457,
  13628,
 

In [22]:
one_hot_dict = dict()

In [23]:
for encodingList, tokensList in zip(onehot_repr, dataset['Tokens']):
  tokensList = tokensList.split(' ')
  for index, word in zip(encodingList, tokensList):
    one_hot_dict[word] = index

In [24]:
# Embedding the sentences
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=max_count)
print(embedded_docs)

[[    0     0     0 ... 12489  9459 13415]
 [    0     0     0 ... 13628  9459 13415]
 [    0     0     0 ... 12489  9771 12279]
 ...
 [    0     0     0 ...  2888  2461  8421]
 [    0     0     0 ...  8151  7340 14139]
 [    0     0     0 ...  4092  8768 14975]]


In [25]:
discourse_type_ohe = np.array(discourse_type_ohe)
discourse_type_ohe

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [26]:
embedded_docs.shape

(36765, 373)

In [27]:
embedded_docs = np.hstack((embedded_docs, discourse_type_ohe))
embedded_docs

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
embedded_docs.shape

(36765, 380)

In [29]:
sum(pd.DataFrame(embedded_docs).isnull().sum())

0

In [30]:
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=embedded_docs.shape[1]))
model.add(Dropout(0.3))

model.add(LSTM(512, activation='relu', return_sequences=True))
model.add(Dropout(0.3))

model.add(LSTM(256, activation='relu', return_sequences=True))
model.add(Dropout(0.3))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(128,activation='relu'))
model.add(Dense(3,activation='softmax'))
optimizer = tensorflow.keras.optimizers.Adadelta(learning_rate=.01)
model.compile(loss= 'categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [31]:
my_callbacks_lstm = [
    EarlyStopping(patience=2),
    ModelCheckpoint(filepath='/content/drive/MyDrive/Predicting Effective Arguments/LSTM_WordEmbeddings/Callbacks/model.{epoch:02d}-{val_loss:.2f}.h5'),
    TensorBoard(log_dir='/content/drive/MyDrive/Predicting Effective Arguments/LSTM_WordEmbeddings/logs'),
]

In [32]:
le = LabelEncoder()
dataset['le_discourse_effectiveness'] = le.fit_transform(dataset['discourse_effectiveness'])
dataset.head()

Unnamed: 0,Tokens,0,1,2,3,4,5,6,discourse_effectiveness,le_discourse_effectiveness
0,hi isaac going writing face mars natural landf...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Adequate,0
1,perspective think face natural landform dont t...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Adequate,0
2,think face natural landform life mars descover...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Adequate,0
3,life mars would know reason think natural land...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Adequate,0
4,people thought face formed alieans thought lif...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Adequate,0


In [33]:
ohe_discourse_effectiveness = pd.get_dummies(dataset['discourse_effectiveness'])
ohe_discourse_effectiveness

Unnamed: 0,Adequate,Effective,Ineffective
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
36760,1,0,0
36761,1,0,0
36762,1,0,0
36763,0,0,1


In [34]:
X_LSTM = embedded_docs.copy()
y_LSTM = ohe_discourse_effectiveness.copy()
#y_LSTM = dataset['le_discourse_effectiveness']

In [35]:
X_LSTM.shape, y_LSTM.shape

((36765, 380), (36765, 3))

In [36]:
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_LSTM, y_LSTM, test_size=.33, random_state=42, shuffle=True)

In [37]:
#model.fit(X_train_lstm, y_train_lstm, validation_data=(X_test_lstm, y_test_lstm), epochs=10, batch_size=512, callbacks=my_callbacks_lstm)

In [38]:
# Saving the model
#tensorflow.keras.models.save_model(model, '/content/drive/MyDrive/Predicting Effective Arguments/Models/model.h5')

# Check model's performance

In [58]:
test = pd.read_csv('/content/drive/MyDrive/Predicting Effective Arguments/Datasets/test.csv')
test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [59]:
test['Tokens'] = test['discourse_text'].apply(clean_data)
test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,Tokens
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,making choices life difficult people often ask...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,seeking multiple opinions help person make bet...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,decrease stress levels
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,great chance learn something new
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,helpful beneficial


In [60]:
discourse_id = test['discourse_id']

In [41]:
test.drop(['discourse_id', 'essay_id'], axis=1, inplace=True)

In [42]:
discourse_type_ohe_test = pd.DataFrame(ohe.transform(test[['discourse_type']]).toarray())
discourse_type_ohe_test.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
def encode_test_tokens(sentence):
  encoded_tokens = [one_hot_dict[token] for token in sentence.split(' ')]
  return encoded_tokens

In [44]:
# One hot representation of words
one_hot_repr_test = [encode_test_tokens(sentence) for sentence in test['Tokens']]
#one_hot_repr_test


# Embedding the sentences
embedded_docs_test = pad_sequences(one_hot_repr_test, padding='pre', maxlen=max_count)
#print(embedded_docs_test)

discourse_type_ohe_test = np.array(discourse_type_ohe_test)
#discourse_type_ohe

embedded_docs_test = np.hstack((embedded_docs_test, discourse_type_ohe_test))
embedded_docs_test

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
sum(pd.DataFrame(embedded_docs_test).isnull().sum())

0

In [46]:
# test = pd.concat([test['Tokens'], discourse_type_ohe_test], axis=1)
# test.head()

In [47]:
# Loading the model
model = tensorflow.keras.models.load_model('/content/drive/MyDrive/Predicting Effective Arguments/Models/model.h5')

In [48]:
embedded_docs.shape, embedded_docs_test.shape

((36765, 380), (10, 380))

In [61]:
predictions = model.predict(embedded_docs_test)

In [62]:
predictions = pd.DataFrame(predictions, columns=['Ineffective', 'Adequate', 'Effective'])

In [63]:
predictions = pd.concat([discourse_id, predictions], axis=1)
predictions

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.448232,0.308554,0.243214
1,5a88900e7dc1,0.451857,0.307562,0.240582
2,9790d835736b,0.454682,0.306972,0.238347
3,75ce6d68b67b,0.452392,0.307575,0.240033
4,93578d946723,0.455629,0.306688,0.237683
5,2e214524dbe3,0.447446,0.30887,0.243684
6,84812fc2ab9f,0.447445,0.308805,0.243749
7,c668ff840720,0.452324,0.30735,0.240326
8,739a6d00f44a,0.447479,0.308809,0.243712
9,bcfae2c9a244,0.446501,0.308875,0.244625


In [64]:
predictions.to_csv('/content/drive/MyDrive/Predicting Effective Arguments/Datasets/Predictions.csv', index=False)