In [1]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from pandas import Series
import torch
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional


# Training split: only the text column and its label are read.
train_data = pd.read_csv(
    "../input/hopedata/Hope_ENG_train.csv",
    usecols=["Text", "Label"],
)

# Dev split (used as the test set below): every column is read, then the columns
# are renamed to match the training frame. The rename requires the file to hold
# exactly two columns.
test_data = pd.read_csv("../input/hopedata/Hope_ENG_dev.csv")
test_data.columns = ["Text", "Label"]

# Helper functions
def get_maxLen(data):
    """Return the length, in characters, of the longest entry in ``data['Text']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'Text'`` column whose values support ``len()``.

    Returns
    -------
    int
        The largest ``len()`` found in the column, or 0 when there are no rows.
    """
    # Walk the values directly instead of looking them up as data['Text'][i]:
    # that form is a label lookup and raises KeyError whenever the index is not
    # exactly 0..n-1 (for example after rows have been filtered out).
    return max((len(text) for text in data['Text']), default=0)

def pre(data, tokenizer=None, maxlen=None):
    """Encode texts as integer sequences, post-padded to a fixed length.

    Parameters
    ----------
    data : iterable of str
        Texts to encode.
    tokenizer : Tokenizer, optional
        An already-fitted tokenizer to reuse. Pass the tokenizer fitted on the
        training texts when encoding any other split, so that a given integer
        refers to the same word everywhere. When omitted, a new tokenizer
        (100-word limit, lower-casing, "<OOV>" token) is fitted on ``data``.
    maxlen : int, optional
        Length every sequence is padded to. When omitted, the notebook-level
        variable ``l`` is used, which must already be defined.

    Returns
    -------
    The padded sequences as returned by ``pad_sequences``.
    """
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=100, lower=True, oov_token="<OOV>")
        tokenizer.fit_on_texts(data)
    if maxlen is None:
        # Fallback kept for existing callers that rely on the global setting.
        maxlen = l
    sequences = tokenizer.texts_to_sequences(data)
    return pad_sequences(sequences, padding='post', maxlen=maxlen)

def sub(data):
    """Encode labels as integers: 1 for 'Hope_speech', 0 for anything else.

    Returns a list with one entry per element of ``data``, in the same order.
    """
    return [int(label == 'Hope_speech') for label in data]

# Preprocessing Functions

In [2]:
# Converting text to Lower Case
def to_lower(data):
    """Lower-case every entry of ``data['Text']``; ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``'Text'`` column.

    Returns
    -------
    None
    """
    # Replace the whole column in one assignment. Writing element by element
    # through data['Text'][i] is chained indexing: it triggers
    # SettingWithCopyWarning, may not reach the frame at all under pandas
    # copy-on-write, and fails when the index is not 0..n-1.
    data['Text'] = data['Text'].str.lower()

# Removing HTML if present in 'Text'
def rm_html(data):
    """Strip ``<...>`` tags from every entry of ``data['Text']``, in place.

    A tag is the shortest run of characters from ``<`` to the next ``>`` on the
    same line; the text between an opening and a closing tag is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``'Text'`` column.

    Returns
    -------
    None
    """
    html_pattern = re.compile('<.*?>')
    # Whole-column assignment instead of per-element data['Text'][i] writes,
    # which are chained indexing and depend on a 0..n-1 index.
    data['Text'] = data['Text'].str.replace(html_pattern, '', regex=True)
        
# Removing URLs present in the 'Text'
def rm_url(data):
    """Delete URLs from every entry of ``data['Text']``, in place.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string ``'Text'`` column.

    Returns
    -------
    None
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # Whole-column assignment instead of per-element data['Text'][i] writes,
    # which are chained indexing and depend on a 0..n-1 index.
    data['Text'] = data['Text'].str.replace(url_pattern, '', regex=True)
        
def pre_process(data):
    """Apply the text-cleaning steps to ``data`` in a fixed order.

    The steps are lower-casing, HTML-tag removal, then URL removal. Each step
    is called for its effect on ``data``; nothing is returned.
    """
    for clean_step in (to_lower, rm_html, rm_url):
        clean_step(data)

In [3]:
# Clean the text of both splits with the same pipeline.
for _split in (train_data, test_data):
    pre_process(_split)

In [4]:
# Spot-check a handful of cleaned rows (positions 101 to 106).
train_data.iloc[101:107]

In [3]:
print("Train data")
print(f"Total Number of sentences: {len(train_data['Text'])}")
print(f"Length of longest sentence: {get_maxLen(train_data)}")
# Class balance of the training split.
train_data['Label'].value_counts()

In [4]:
print("Test data")
print(f"Total Number of sentences: {len(test_data['Text'])}")
print(f"Length of longest sentence: {get_maxLen(test_data)}")
# Class balance of the test split.
test_data['Label'].value_counts()

## Binary Classification using Bi-LSTM 

In [5]:
#training data
x_train = train_data.Text
y_train = train_data.Label

#testing data
x_test = test_data.Text
y_test = test_data.Label

In [6]:
# Hyper-parameters for the Bi-LSTM pipeline.
l = 200  # length every sequence is padded to; read as a global by pre()
n_unique_words = 200  # vocabulary size handed to the Embedding layer (pre() itself caps its tokenizer at num_words=100)
maxlen = 200  # sequence length declared to the Embedding layer; keep equal to l
batch_size = 128  # mini-batch size passed to model.fit

In [7]:
# Fit ONE tokenizer, on the training texts only, and reuse it for the test texts.
# Fitting a separate tokenizer per split builds two unrelated vocabularies, so the
# same integer would stand for different words at training and at test time.
tokenizer = Tokenizer(num_words=100, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['Text'])

# Encode straight from the source frames rather than from x_train / x_test, so
# that re-running this cell never tries to tokenize already-padded arrays.
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data['Text']), padding='post', maxlen=l)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data['Text']), padding='post', maxlen=l)

# Targets: 1.0 for 'Hope_speech', 0.0 for every other label (see sub()).
y_train = tf.convert_to_tensor(np.asarray(sub(train_data['Label'])).astype('float32'))
y_test = tf.convert_to_tensor(np.asarray(sub(test_data['Label'])).astype('float32'))

In [8]:
# Bi-LSTM binary classifier: embedding -> bidirectional LSTM -> dropout -> one sigmoid unit.
model = Sequential()
model.add(Embedding(n_unique_words, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
model.summary()

In [9]:
print('Train...')
# Keep the returned History object: the plotting cell reads the per-epoch
# metrics from it.
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=10)

In [10]:
from matplotlib import pyplot

# Draw the per-epoch training loss and accuracy recorded by model.fit on one axis.
for curve in ['loss', 'accuracy']:
    pyplot.plot(history.history[curve])

pyplot.title('model loss vs accuracy')
pyplot.xlabel('epoch')
pyplot.legend(['loss', 'accuracy'], loc='upper right')
pyplot.show()

In [11]:
print('Test...')
# Score the padded test sequences. The network ends in Dense(1, activation='sigmoid'),
# so each prediction is a value between 0 and 1; the evaluation cells round it
# to obtain a 0/1 class.
y_pred = model.predict(x_test)

# Model Accuracy

In [12]:
from sklearn import metrics

# Fraction of test sentences whose rounded prediction equals the true 0/1 label.
test_accuracy = metrics.accuracy_score(y_test, y_pred.round())
print(test_accuracy)

# Classification Report

In [14]:
from sklearn.metrics import classification_report

# sub() encodes 'Hope_speech' as 1 and every other label as 0, and
# classification_report pairs target_names with the labels in order. The names
# must therefore be given as [name for 0, name for 1]; listing them the other
# way round prints each class's scores under the wrong name.
class_name = ['Non_hope_speech', 'Hope_speech']

# labels=[0, 1] pins that order and keeps both rows even if one class is absent
# from both the true labels and the predictions.
print(classification_report(y_test, y_pred.round(), labels=[0, 1], target_names=class_name, zero_division=0, digits=4))