In [1]:
import pandas as pd
import os
from keras.models import Sequential
from keras import regularizers
from keras.layers.core import Dense, Dropout, Flatten
from keras import layers
from sklearn.metrics import accuracy_score, f1_score
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import numpy as np
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt 
import keras
from time import time

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
train_df = pd.read_csv("train_normalised.csv")
dev_df = pd.read_csv("dev_normalised.csv")
test_df = pd.read_csv("test_normalised.csv")

In [3]:
train_df

Unnamed: 0.1,Unnamed: 0,text,label,id
0,0,4 . Can eating garlic help prevent infection w...,0,1250219300389974016
1,1,French police chief killed himself after #Char...,1,554886875303780352
2,2,Coronavirus disease ( COVID - 19 ) advice for ...,0,1237901309011021825
3,3,Ottawa police confirm that there were multiple...,0,524958128392376320
4,4,if the primary focus of a government is n't to...,0,1239295488677085185
...,...,...,...,...
1802,1802,Desperate Ted Cruz Claims Planned Parenthood S...,1,671181758692507648
1803,1803,""" Thoughts and prayers are not enough . "" Pres...",1,672513234419638273
1804,1804,Police have surrounded this building where the...,0,553508098825261056
1805,1805,@USER @USER @USER @USER It 's an insult to God...,0,1249479605582327808


In [5]:
train_df.text = train_df.text.map(lambda x:str(x))
train_df.label = train_df.label.astype("category")
train_df.label = train_df.label.cat.codes

dev_df.text = dev_df.text.map(lambda x:str(x))
dev_df.label = dev_df.label.astype("category")
dev_df.label = dev_df.label.cat.codes

test_df.text = test_df.text.map(lambda x:str(x))

In [6]:
x_train = train_df.text.values
x_dev = dev_df.text.values
x_test = test_df.text.values

In [7]:
x_train

array(['4 . Can eating garlic help prevent infection with the new coronavirus ? #COVID19Malaysia HTTPURL 5 . Can regularly rinsing your nose with saline help prevent infection with the new coronavirus ? HTTPURL 6 . Do vaccines against pneumonia protect you against the new coronavirus ? HTTPURL 7 . Can spraying alcohol or chlorine all over your body kill the new coronavirus ? #Chamber HTTPURL 8 . How effective are thermal scanners in detecting people infected with the new coronavirus ? HTTPURL 9 . Can an ultraviolet disinfection lamp kill the new coronavirus ? HTTPURL 10 . Are hand dryers effective in killing the new coronavirus ? HTTPURL 11 . The new coronavirus CANNOT be transmitted through mosquito bites . HTTPURL 12 . Taking a hot bath does not prevent the new coronavirus disease HTTPURL 13 . Cold weather and snow CANNOT kill the new coronavirus . HTTPURL 14 . COVID - 19 virus can be transmitted in areas with hot and humid climates HTTPURL 15 . Drinking alcohol does not protect you 

In [8]:
y_train = train_df.label.values
y_train = to_categorical(y_train,2)
y_dev = dev_df.label.values
y_dev = to_categorical(y_dev,2)

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

train_tokenizer = Tokenizer(num_words=5000)
train_tokenizer.fit_on_texts(train_df.text.values)

dev_tokenizer = Tokenizer(num_words=5000)
dev_tokenizer.fit_on_texts(dev_df.text.values)

test_tokenizer = Tokenizer(num_words=5000)
test_tokenizer.fit_on_texts(test_df.text.values)

X_train = train_tokenizer.texts_to_sequences(x_train)
X_dev = dev_tokenizer.texts_to_sequences(x_dev)
X_test = test_tokenizer.texts_to_sequences(x_test)

vocab_size = len(train_tokenizer.word_index) + len(dev_tokenizer.word_index) + len(test_tokenizer.word_index) + 1

X_train = pad_sequences(X_train, padding='pre', maxlen=500)
X_dev = pad_sequences(X_dev, padding='pre', maxlen=500)
X_test = pad_sequences(X_test, padding='pre', maxlen=500)

In [14]:
activation = ["relu", "sigmoid", "tanh"]
optimizer = ["adam", "SGD", "Adadelta", "RMSprop"]

In [15]:
#using keras embedding

def rnn_model(activation,optimizer,epochs,batchsize):

  model = Sequential()
  model.add(layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=500))
  model.add(layers.SimpleRNN(64))
  model.add(Dense(2,activation=activation))
  model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
  history = model.fit(X_train, y_train,
                    epochs=epochs,
                    verbose=0,
                    validation_data=(X_dev, y_dev),
                    batch_size=batchsize)
  
  return history, model

In [21]:
#For best activation function
sel_activation = {}
for i in activation:
  history, model = rnn_model(i,"adam",15,128)
  temp = {i:model.evaluate(X_dev,y_dev)[1]}
  sel_activation.update(temp)
  keras.backend.clear_session()

best_activation = max(sel_activation, key=sel_activation.get)
print("best activation function is ",best_activation)

best activation function is  sigmoid


In [23]:
#For best optimizzer
sel_optimizer = {}
for i in optimizer:
  history, model = rnn_model(best_activation,i,15,128)
  temp = {i:model.evaluate(X_dev,y_dev)[1]}
  sel_optimizer.update(temp)
  keras.backend.clear_session()

best_optimizer = max(sel_optimizer, key=sel_optimizer.get)
print("best optimizer is ",best_optimizer)

best optimizer is  SGD


In [17]:
y_dev2 = dev_df.label.values

In [26]:
history, model = rnn_model(best_activation,best_optimizer,15,128)
pred = np.argmax(model.predict(X_dev), axis=-1)
print("dev accuracy score = ",accuracy_score(y_pred=pred, y_true=y_dev2))

dev accuracy score =  0.7697478991596639


In [None]:
import sklearn.metrics as metrics
print(metrics.accuracy_score(y_dev2, pred))
print(metrics.precision_score(y_dev2, pred))
print(metrics.recall_score(y_dev2, pred))
print(metrics.f1_score(y_dev2, pred, average='macro'))
print(metrics.roc_auc_score(y_dev2, pred))

In [None]:
#For Kaggle
history, model = rnn_model(best_activation,best_optimizer,15,128)
pred_test = np.argmax(model.predict(X_test), axis=-1)

In [None]:
print("test accuracy score = ",accuracy_score(y_pred=pred, y_true=y_dev2))

test accuracy score =  0.7697478991596639


In [None]:
result = np.round(pred_test).astype(int)

In [None]:
with open('test.predictions_rnn.txt', 'w') as output:
  output.write('Id,Predicted\n')
  counter = 0
  for elem in list(pred_test):
    output.write(str(counter) + ',' + str(elem)+'\n')
    counter+=1

In [None]:

with open('test.predictions.txt', 'w') as output:
  output.write('Id,Predicted\n')
  counter = 0
  for elem in list(result):
    for elem2 in list(elem):
      output.write(str(counter) + ',' + str(elem2)+'\n')
      counter+=1