---
---
*<h1 align='center'> Causal Sentence Detection </h1>*
*<h2 align='center'> Models: RNN, LSTM, Bi-LSTM,</h2>*
---
---
Name: Logesh.V <br>
Email: vlogesh2001@gmail.com <br>
DATASETS: 
- Causaly_small: https://archive.org/details/CausalySmall <br>
About Dataset: The dataset contains 2000 manually annotated sentences derived from
pubmed articles. 1113 out of 2000 sentences are annotated as Causal (Annotated_Causal = 1) and the rest (887) are annotated as non-Causal (Annotated_Causal = 0).

In [2]:
# importing the necessary package
import pandas as pd

In [3]:
# reading the csv file using pandas package
df = pd.read_csv("Causaly_small.csv")

In [4]:
# printing first 5 Tweet with its corresponding class(normal,abusive or other)
df.head()

Unnamed: 0,Sentence,Annotated_Causal
0,"Results indicated that boys with ADHD , relati...",1
1,Ticagrelor appears to provide higher value for...,0
2,"Whatever the mechanism , active smoking is an ...",1
3,"In this study , we examined whether use of an ...",0
4,"Using causal inference testing , we searched c...",0


In [5]:
# showing nullable data
df.isnull().sum()

Sentence            0
Annotated_Causal    0
dtype: int64

In [6]:
# Describing the data
df.describe()

Unnamed: 0,Annotated_Causal
count,2000.0
mean,0.5565
std,0.496922
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [7]:
# Getting the distinct values of classes of tweets
df['Sentence'].unique()

array(['Results indicated that boys with ADHD , relative to typically developing boys , exhibited greater motor activity across tasks , and both groups activity was greater during EF tasks relative to control tasks .',
       'Ticagrelor appears to provide higher value for patients in several recognized high-risk subgroups .',
       'Whatever the mechanism , active smoking is an important modifiable factor that seems to be associated with a poor response to MTX .',
       ...,
       'These results in part explain the impairment of host-defense mechanisms seen in the perioperative period .',
       'Modern research makes frequent use of animal models , that is , organisms raised and bred experimentally in order to help the understanding of biological and chemical processes affecting organisms or whole environments .',
       'Vaccination rates for influenza , pneumococcus , and zoster in patients with rheumatoid arthritis have remained low .'],
      dtype=object)

In [8]:
# Getting the number of values
df.shape

(2000, 2)

In [10]:
# split text by space
def split_white_space(text):
    text = text.lower().split()
    return text

In [11]:
# remove pnctuation
import string

def remove_punctuation(text):
    result = string.punctuation
    listText=[]
    for words in text:
        String =""
        for word in words:
            if word not in result:
                String+=word
            else:
                break
        if (String!="") :
            listText.append(String)    
    return listText

In [14]:
def clean_data(text):
    #text = split_white_space(text)
    #text = remove_punctuation(text)
    return text

df['Sentence'] = df['Sentence'].apply(clean_data)

df.head()

Unnamed: 0,Sentence,Annotated_Causal
0,"Results indicated that boys with ADHD , relati...",1
1,Ticagrelor appears to provide higher value for...,0
2,"Whatever the mechanism , active smoking is an ...",1
3,"In this study , we examined whether use of an ...",0
4,"Using causal inference testing , we searched c...",0


In [17]:
# Vectorizing the words using hot encoding
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split( df['Sentence'], df['Annotated_Causal'], test_size=0.2)

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

print('Number of Unique Tokens',len(tokenizer.word_index))

vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
print( X_train.shape )
print(X_train,y_train)

Number of Unique Tokens 7257
(1600, 200)
[[  11 1591 2150 ...    0    0    0]
 [   7 1264  229 ...    0    0    0]
 [3392  109  110 ...    0    0    0]
 ...
 [1061   64    3 ...    0    0    0]
 [3052  645  516 ...    0    0    0]
 [1225  222  448 ...    0    0    0]] 1352    0
616     1
337     0
114     1
1592    1
       ..
736     1
531     0
824     0
1787    1
1688    1
Name: Annotated_Causal, Length: 1600, dtype: int64


# RNN Architecture

In [21]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
vocab_size = 10000
embedding_dim = 1000

#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.config.experimental_connect_to_cluster(tpu)
#tf.tpu.experimental.initialize_tpu_system(tpu)


# instantiate a distribution strategy
model = tf.keras.Sequential([
    #embedding layer(input)
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.SimpleRNN(50),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(145, activation='relu'),
    #output layer
    tf.keras.layers.Dense(4, activation="softmax")
    ])
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy" , metrics=["accuracy"])
# train model normally
model.fit(X_train, y_train)



<keras.callbacks.History at 0x7fb51e27cad0>

In [22]:
# Printing Test Accuracy Value
test_lost , test_acc_rnn = model.evaluate(X_test, y_test)


print("The accuracy of the RNN model is:",(test_acc_rnn*100))

The accuracy of the RNN model is: 44.749999046325684


# Long Short Term Memory Architecture (LSTM)

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
vocab_size = 10000
embedding_dim = 1000

#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.config.experimental_connect_to_cluster(tpu)
#tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
#tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU
model = tf.keras.Sequential([
        #Word embdading layer (Input layer)
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(44),
        tf.keras.layers.Dense(embedding_dim, activation='softmax'),
        tf.keras.layers.Dense(140, activation='relu'),
        tf.keras.layers.Dense(150, activation='softmax'),
        #Output layer(We use softmax activation function in multiple classification)
        tf.keras.layers.Dense(4, activation="softmax")
    ])

model.compile(optimizer="adam",loss="sparse_categorical_crossentropy" , metrics=["accuracy"])
# train model normally
model.fit(X_train, y_train,epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fb51dff0cd0>

In [24]:
# Printing Test Accuracy Value
test_lost , test_acc_rnn = model.evaluate(X_test, y_test)


print("The accuracy of the LSTM model is:",(test_acc_rnn*100))

The accuracy of the LSTM model is: 55.250000953674316


In [25]:
model.save(r'./LSTM.h5')

# Bidirectional Long Short Term Memory Architecture (Bi-LSTM)

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
vocab_size = 10000
embedding_dim = 1000

#tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#tf.config.experimental_connect_to_cluster(tpu)
#tf.tpu.experimental.initialize_tpu_system(tpu)
from tensorflow.keras import layers
# instantiate a distribution strategy
#tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU
model = tf.keras.Sequential([
        #Word embdading layer (Input layer)
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(10)),
        tf.keras.layers.Dense(embedding_dim, activation='relu'),
        tf.keras.layers.Dense(140, activation='relu'),
        tf.keras.layers.Dense(150, activation='relu'),
        #Output layer(We use softmax activation function in multiple classification)
        tf.keras.layers.Dense(4, activation="softmax")
    ])

model.compile(optimizer="adam",loss="sparse_categorical_crossentropy" , metrics=["accuracy"])
# train model normally
model.fit(X_train, y_train)



<keras.callbacks.History at 0x7fb51d972bd0>

In [27]:
# Printing Test Accuracy Value
test_lost , test_acc_rnn = model.evaluate(X_test, y_test)


print("The accuracy of the Bi-LSTM model is:",(test_acc_rnn*100))

The accuracy of the Bi-LSTM model is: 66.50000214576721


---
#### *<div style="text-align: center"> - - - - - Thank You - - - - - <div>*
---