# Models on Textual Features
## Data Preparation

In [1]:
from Classifier import get_KNN_Model, get_accuracy_matric, get_lin_SVM_Model, get_NaiveBayes_Model
from Hawkes_Process import get_topic_vector
from Topic_Modelling import LDA_main_driver
from filters import train_test_splitter, get_users_dataframe
import numpy as np
import warnings

warnings.simplefilter(action='ignore')

In [2]:
df, dict_genuine, dict_fake, lda_genuine, lda_fake = LDA_main_driver()
num_topics = 10

-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* LDA Training Started -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* LDA Training Ended -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*




In [3]:
user_topic_vectors, labels = get_topic_vector(df, dict_genuine, dict_fake, lda_genuine, lda_fake, 2)

-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* Hawkes Process Started -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* Hawkes Process Ended -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*



In [4]:
total_len = len(user_topic_vectors)
X = np.array(user_topic_vectors)
Y = np.array(labels)

In [5]:
topic_probs = X[:,[range(0,20)]].reshape(X.shape[0],20)

In [6]:
topic_probs.shape

(1527, 20)

In [7]:
x_train, x_test, y_train, y_test = train_test_splitter(topic_probs, Y)

In [8]:
def confusionMatrixPrint(P,Y,dataType):
    TF = 0
    TT = 0
    FF = 0
    FT = 0
    for p,y in zip(P,Y):
        if (p,y) == (0,0):
            FF += 1
        elif (p,y) == (1,1):
            TT += 1
        elif (p,y) == (0,1):
            TF += 1
        else:
            FT += 1

    print('------------------------------------------------------------------------------------\n')
    if dataType == 0:
        print("  Confusion Matrix for Train Data : ")
    else:
        print("  Confusion Matrix for Validation Data \n")
    print("         True Positive = ",TT,"           True Negetive = ",TF)
    print("        False Positive = ",FT,"          False Negetive = ",FF)
    print('\n------------------------------------------------------------------------------------\n')
    total = TT+FF+TF+FT
    print(f"  Total Cases : {total}\n")
    print("Accuracy  : ",(TT+FF)/total)
    try:
        prec = (TT)/(TT+FT)
        recall = (TT)/(TT+TF)
        f = (2*recall*prec)/(recall+prec)
        print("Precision : ",prec)
        print("Recall    : ",recall)
        print("F1 Score  : ",f)
    except:
        pass
    
    
    print('\n------------------------------------------------------------------------------------')
    
    
    
    

# 1. KNN Classifier on Topic Vectors Generated by LDA

In [9]:
knn = get_KNN_Model(x_train, y_train)

In [10]:
pred = knn.predict(x_train)
confusionMatrixPrint(pred,y_train,0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 
         True Positive =  114            True Negetive =  81
        False Positive =  31           False Negetive =  995

------------------------------------------------------------------------------------

  Total Cases : 1221

Accuracy  :  0.9082719082719083
Precision :  0.7862068965517242
Recall    :  0.5846153846153846
F1 Score  :  0.6705882352941177

------------------------------------------------------------------------------------


In [11]:
pred = knn.predict(x_test)
confusionMatrixPrint(pred,y_test,1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  17            True Negetive =  23
        False Positive =  11           False Negetive =  255

------------------------------------------------------------------------------------

  Total Cases : 306

Accuracy  :  0.8888888888888888
Precision :  0.6071428571428571
Recall    :  0.425
F1 Score  :  0.5

------------------------------------------------------------------------------------


# 2. SVM Classifier on Topic Vectors Generated by LDA

In [25]:
svm = get_lin_SVM_Model(x_train, y_train)

In [27]:
pred = svm.predict(x_train)
confusionMatrixPrint(pred,y_train,0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 
         True Positive =  27            True Negetive =  168
        False Positive =  10           False Negetive =  1016

------------------------------------------------------------------------------------

  Total Cases : 1221

Accuracy  :  0.8542178542178542
Precision :  0.7297297297297297
Recall    :  0.13846153846153847
F1 Score  :  0.23275862068965517

------------------------------------------------------------------------------------


In [29]:
pred = svm.predict(x_test)
confusionMatrixPrint(pred,y_test,1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  5            True Negetive =  35
        False Positive =  1           False Negetive =  265

------------------------------------------------------------------------------------

  Total Cases : 306

Accuracy  :  0.8823529411764706
Precision :  0.8333333333333334
Recall    :  0.125
F1 Score  :  0.21739130434782608

------------------------------------------------------------------------------------


# 3. Naive Bayes Classifier on Topic Vectors Generated by LDA

In [30]:
nb = get_NaiveBayes_Model(x_train, y_train)

In [32]:
pred = nb.predict(x_train)
confusionMatrixPrint(pred,y_train,0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 
         True Positive =  87            True Negetive =  108
        False Positive =  142           False Negetive =  884

------------------------------------------------------------------------------------

  Total Cases : 1221

Accuracy  :  0.7952497952497952
Precision :  0.3799126637554585
Recall    :  0.4461538461538462
F1 Score  :  0.410377358490566

------------------------------------------------------------------------------------


In [34]:
pred = nb.predict(x_test)
confusionMatrixPrint(pred,y_test,1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  19            True Negetive =  21
        False Positive =  30           False Negetive =  236

------------------------------------------------------------------------------------

  Total Cases : 306

Accuracy  :  0.8333333333333334
Precision :  0.3877551020408163
Recall    :  0.475
F1 Score  :  0.42696629213483145

------------------------------------------------------------------------------------


# 4. Nueral Network Based Classifier on Topic Vectors Generated by LDA

## Fully Connected Network of :
     - Input Layer : 20, 64
     - Hidden Layer 1 : 64 , 128
     - Hidden Layer 2 : 128 , 256
     - Hidden Layer 3 : 256 , 512
     - Hidden Layer 4 : 512 , 256
     - Hidden Layer 5 : 256 , 128
     - Hidden Layer 6 : 128 , 64
     - Hidden Layer 7 : 64 , 32
     - Hidden Layer 8 : 32 , 16
     - Output Layer : 16 , 2

In [122]:
import torch.nn as nn
import warnings
import torch

In [123]:
def get_Device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [124]:
device = get_Device()
print(device)

cuda


In [125]:
x_train = torch.Tensor(x_train).to(device)
y_train = torch.Tensor(y_train).to(device)
x_test = torch.Tensor(x_test).to(device)
y_test = torch.Tensor(y_test).to(device)
y_train = y_train.to(torch.long)

In [126]:
class NeuralNetworkClassifierModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(NeuralNetworkClassifierModel, self).__init__()
        self.layer1 = nn.Linear(input_size, 64)
        self.layer2 = nn.Linear(64, 128)
        self.layer3 = nn.Linear(128, 256)
        self.layer4 = nn.Linear(256, 512)
        self.layer5 = nn.Linear(512, 256)
        
        self.layer6 = nn.Linear(256, 128)
        self.layer7 = nn.Linear(128, 64)
        self.layer8 = nn.Linear(64, 32)
        self.layer9 = nn.Linear(32, 16)
        self.layer10 = nn.Linear(16, output_size)
        self.relu = nn.ReLU()
        

    def forward(self, inputs):
        out = self.layer1(inputs)
        out = self.relu(out)
        
        out = self.layer2(out)
        out = self.relu(out)
        
        out = self.layer3(out)
        out = self.relu(out)
        
        out = self.layer4(out)
        out = self.relu(out)
        
        out = self.layer5(out)
        out = self.relu(out)
        
        out = self.layer6(out)
        out = self.relu(out)
        
        out = self.layer7(out)
        out = self.relu(out)
        
        out = self.layer8(out)
        out = self.relu(out)
        
        out = self.layer9(out)
        out = self.relu(out)
        
        out = self.layer10(out)
        
        return out

In [127]:
input_size = 20
output_size = 2
learning_rate = 0.0001
n_epochs = 500

In [128]:
model = NeuralNetworkClassifierModel(input_size=input_size,
                                     output_size=output_size)

In [129]:
model.to(device)

NeuralNetworkClassifierModel(
  (layer1): Linear(in_features=20, out_features=64, bias=True)
  (layer2): Linear(in_features=64, out_features=128, bias=True)
  (layer3): Linear(in_features=128, out_features=256, bias=True)
  (layer4): Linear(in_features=256, out_features=512, bias=True)
  (layer5): Linear(in_features=512, out_features=256, bias=True)
  (layer6): Linear(in_features=256, out_features=128, bias=True)
  (layer7): Linear(in_features=128, out_features=64, bias=True)
  (layer8): Linear(in_features=64, out_features=32, bias=True)
  (layer9): Linear(in_features=32, out_features=16, bias=True)
  (layer10): Linear(in_features=16, out_features=2, bias=True)
  (relu): ReLU()
)

In [130]:
lossfn = torch.nn.CrossEntropyLoss()
lossfn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [131]:
def Validator(x_test, y_test):
    predicted = model(x_test).to(device)
    pred = torch.max(predicted.data,1)[1]
    total_test = len(y_test)
    correct_pred = 0

    for i in range(total_test):
        if y_test[i] == pred[i]:
            correct_pred += 1

    return correct_pred/total_test

In [132]:
val_acc_list = []
training_acc_list = []
model_list = []
for epoch in range(n_epochs):
    
    predicted = model(x_train).to(device)    

    loss = lossfn(predicted,y_train)
        
    optimizer.zero_grad()
    loss.backward()
    
    optimizer.step()
    
    val_acc = Validator(x_test, y_test.to(torch.int))
    
    print(f'Epoch [ {epoch+1} / {n_epochs} ] Training-Loss = {loss.item():.4f} Training-Accuracy = {1- loss.item():.4f} Validation-Accuracy = {val_acc:.4f}')
    
    training_acc_list.append(1-loss.item())
    val_acc_list.append(val_acc)
    model_list.append(model)

Epoch [ 1 / 500 ] Training-Loss = 0.7790 Training-Accuracy = 0.2210078239440918 Validation-Accuracy = 0.13071895424836602
Epoch [ 2 / 500 ] Training-Loss = 0.7785 Training-Accuracy = 0.22152233123779297 Validation-Accuracy = 0.13071895424836602
Epoch [ 3 / 500 ] Training-Loss = 0.7780 Training-Accuracy = 0.222029447555542 Validation-Accuracy = 0.13071895424836602
Epoch [ 4 / 500 ] Training-Loss = 0.7775 Training-Accuracy = 0.22252368927001953 Validation-Accuracy = 0.13071895424836602
Epoch [ 5 / 500 ] Training-Loss = 0.7770 Training-Accuracy = 0.22301268577575684 Validation-Accuracy = 0.13071895424836602
Epoch [ 6 / 500 ] Training-Loss = 0.7765 Training-Accuracy = 0.22350436449050903 Validation-Accuracy = 0.13071895424836602
Epoch [ 7 / 500 ] Training-Loss = 0.7760 Training-Accuracy = 0.22399765253067017 Validation-Accuracy = 0.13071895424836602
Epoch [ 8 / 500 ] Training-Loss = 0.7755 Training-Accuracy = 0.22449231147766113 Validation-Accuracy = 0.13071895424836602
Epoch [ 9 / 500 ] T

Epoch [ 71 / 500 ] Training-Loss = 0.7378 Training-Accuracy = 0.2622489929199219 Validation-Accuracy = 0.13071895424836602
Epoch [ 72 / 500 ] Training-Loss = 0.7366 Training-Accuracy = 0.26343291997909546 Validation-Accuracy = 0.13071895424836602
Epoch [ 73 / 500 ] Training-Loss = 0.7353 Training-Accuracy = 0.26469284296035767 Validation-Accuracy = 0.13071895424836602
Epoch [ 74 / 500 ] Training-Loss = 0.7339 Training-Accuracy = 0.2660548686981201 Validation-Accuracy = 0.13071895424836602
Epoch [ 75 / 500 ] Training-Loss = 0.7325 Training-Accuracy = 0.267520010471344 Validation-Accuracy = 0.13071895424836602
Epoch [ 76 / 500 ] Training-Loss = 0.7309 Training-Accuracy = 0.2690735459327698 Validation-Accuracy = 0.13071895424836602
Epoch [ 77 / 500 ] Training-Loss = 0.7293 Training-Accuracy = 0.27070915699005127 Validation-Accuracy = 0.13071895424836602
Epoch [ 78 / 500 ] Training-Loss = 0.7276 Training-Accuracy = 0.2724449038505554 Validation-Accuracy = 0.13071895424836602
Epoch [ 79 / 5

Epoch [ 138 / 500 ] Training-Loss = 0.4267 Training-Accuracy = 0.573297917842865 Validation-Accuracy = 0.869281045751634
Epoch [ 139 / 500 ] Training-Loss = 0.4267 Training-Accuracy = 0.5733185112476349 Validation-Accuracy = 0.869281045751634
Epoch [ 140 / 500 ] Training-Loss = 0.4267 Training-Accuracy = 0.5732945799827576 Validation-Accuracy = 0.869281045751634
Epoch [ 141 / 500 ] Training-Loss = 0.4267 Training-Accuracy = 0.5732890367507935 Validation-Accuracy = 0.869281045751634
Epoch [ 142 / 500 ] Training-Loss = 0.4267 Training-Accuracy = 0.5733481645584106 Validation-Accuracy = 0.869281045751634
Epoch [ 143 / 500 ] Training-Loss = 0.4265 Training-Accuracy = 0.5735004246234894 Validation-Accuracy = 0.869281045751634
Epoch [ 144 / 500 ] Training-Loss = 0.4262 Training-Accuracy = 0.5737593173980713 Validation-Accuracy = 0.869281045751634
Epoch [ 145 / 500 ] Training-Loss = 0.4259 Training-Accuracy = 0.574124276638031 Validation-Accuracy = 0.869281045751634
Epoch [ 146 / 500 ] Traini

Epoch [ 206 / 500 ] Training-Loss = 0.4001 Training-Accuracy = 0.5999224185943604 Validation-Accuracy = 0.869281045751634
Epoch [ 207 / 500 ] Training-Loss = 0.3996 Training-Accuracy = 0.6003841459751129 Validation-Accuracy = 0.869281045751634
Epoch [ 208 / 500 ] Training-Loss = 0.3992 Training-Accuracy = 0.6008488833904266 Validation-Accuracy = 0.869281045751634
Epoch [ 209 / 500 ] Training-Loss = 0.3987 Training-Accuracy = 0.6013143956661224 Validation-Accuracy = 0.869281045751634
Epoch [ 210 / 500 ] Training-Loss = 0.3982 Training-Accuracy = 0.6017818748950958 Validation-Accuracy = 0.869281045751634
Epoch [ 211 / 500 ] Training-Loss = 0.3977 Training-Accuracy = 0.6022524237632751 Validation-Accuracy = 0.869281045751634
Epoch [ 212 / 500 ] Training-Loss = 0.3973 Training-Accuracy = 0.6027269065380096 Validation-Accuracy = 0.869281045751634
Epoch [ 213 / 500 ] Training-Loss = 0.3968 Training-Accuracy = 0.6032041907310486 Validation-Accuracy = 0.869281045751634
Epoch [ 214 / 500 ] Trai

Epoch [ 275 / 500 ] Training-Loss = 0.3605 Training-Accuracy = 0.6394682824611664 Validation-Accuracy = 0.869281045751634
Epoch [ 276 / 500 ] Training-Loss = 0.3598 Training-Accuracy = 0.6401938796043396 Validation-Accuracy = 0.8856209150326797
Epoch [ 277 / 500 ] Training-Loss = 0.3591 Training-Accuracy = 0.6409088373184204 Validation-Accuracy = 0.8856209150326797
Epoch [ 278 / 500 ] Training-Loss = 0.3584 Training-Accuracy = 0.6416147947311401 Validation-Accuracy = 0.8856209150326797
Epoch [ 279 / 500 ] Training-Loss = 0.3577 Training-Accuracy = 0.6423209011554718 Validation-Accuracy = 0.8856209150326797
Epoch [ 280 / 500 ] Training-Loss = 0.3570 Training-Accuracy = 0.6430259048938751 Validation-Accuracy = 0.8856209150326797
Epoch [ 281 / 500 ] Training-Loss = 0.3563 Training-Accuracy = 0.6437257826328278 Validation-Accuracy = 0.8856209150326797
Epoch [ 282 / 500 ] Training-Loss = 0.3556 Training-Accuracy = 0.6444241404533386 Validation-Accuracy = 0.8856209150326797
Epoch [ 283 / 500

Epoch [ 343 / 500 ] Training-Loss = 0.3240 Training-Accuracy = 0.6759584844112396 Validation-Accuracy = 0.8954248366013072
Epoch [ 344 / 500 ] Training-Loss = 0.3236 Training-Accuracy = 0.676418274641037 Validation-Accuracy = 0.8954248366013072
Epoch [ 345 / 500 ] Training-Loss = 0.3231 Training-Accuracy = 0.6768836379051208 Validation-Accuracy = 0.8921568627450981
Epoch [ 346 / 500 ] Training-Loss = 0.3226 Training-Accuracy = 0.6773508787155151 Validation-Accuracy = 0.8954248366013072
Epoch [ 347 / 500 ] Training-Loss = 0.3222 Training-Accuracy = 0.6778181195259094 Validation-Accuracy = 0.8986928104575164
Epoch [ 348 / 500 ] Training-Loss = 0.3217 Training-Accuracy = 0.6782884895801544 Validation-Accuracy = 0.8986928104575164
Epoch [ 349 / 500 ] Training-Loss = 0.3212 Training-Accuracy = 0.6787594556808472 Validation-Accuracy = 0.8986928104575164
Epoch [ 350 / 500 ] Training-Loss = 0.3208 Training-Accuracy = 0.6792353689670563 Validation-Accuracy = 0.8921568627450981
Epoch [ 351 / 500

Epoch [ 412 / 500 ] Training-Loss = 0.2864 Training-Accuracy = 0.7136164605617523 Validation-Accuracy = 0.9150326797385621
Epoch [ 413 / 500 ] Training-Loss = 0.2858 Training-Accuracy = 0.7142221629619598 Validation-Accuracy = 0.9150326797385621
Epoch [ 414 / 500 ] Training-Loss = 0.2852 Training-Accuracy = 0.7147887051105499 Validation-Accuracy = 0.9150326797385621
Epoch [ 415 / 500 ] Training-Loss = 0.2846 Training-Accuracy = 0.7153941988945007 Validation-Accuracy = 0.9150326797385621
Epoch [ 416 / 500 ] Training-Loss = 0.2840 Training-Accuracy = 0.715965986251831 Validation-Accuracy = 0.9150326797385621
Epoch [ 417 / 500 ] Training-Loss = 0.2834 Training-Accuracy = 0.7165780067443848 Validation-Accuracy = 0.9150326797385621
Epoch [ 418 / 500 ] Training-Loss = 0.2828 Training-Accuracy = 0.7171552777290344 Validation-Accuracy = 0.9150326797385621
Epoch [ 419 / 500 ] Training-Loss = 0.2822 Training-Accuracy = 0.7177522778511047 Validation-Accuracy = 0.9150326797385621
Epoch [ 420 / 500

Epoch [ 484 / 500 ] Training-Loss = 0.2511 Training-Accuracy = 0.7489468455314636 Validation-Accuracy = 0.9183006535947712
Epoch [ 485 / 500 ] Training-Loss = 0.2507 Training-Accuracy = 0.74929079413414 Validation-Accuracy = 0.9183006535947712
Epoch [ 486 / 500 ] Training-Loss = 0.2503 Training-Accuracy = 0.749735951423645 Validation-Accuracy = 0.9183006535947712
Epoch [ 487 / 500 ] Training-Loss = 0.2499 Training-Accuracy = 0.750051885843277 Validation-Accuracy = 0.9183006535947712
Epoch [ 488 / 500 ] Training-Loss = 0.2498 Training-Accuracy = 0.7502288669347763 Validation-Accuracy = 0.9183006535947712
Epoch [ 489 / 500 ] Training-Loss = 0.2496 Training-Accuracy = 0.7504225224256516 Validation-Accuracy = 0.9215686274509803
Epoch [ 490 / 500 ] Training-Loss = 0.2492 Training-Accuracy = 0.7507832050323486 Validation-Accuracy = 0.9183006535947712
Epoch [ 491 / 500 ] Training-Loss = 0.2488 Training-Accuracy = 0.7511656433343887 Validation-Accuracy = 0.9183006535947712
Epoch [ 492 / 500 ] 

In [136]:
print("Maximum Training Accuracy = ", max(training_acc_list))
print("Maximum Validation Accuracy = ",max(val_acc_list) )
model = model_list[val_acc_list.index(max(val_acc_list))]

Maximum Training Accuracy =  0.7534352838993073
Maximum Validation Accuracy =  0.9248366013071896


In [138]:
predicted = model(x_train).to(device)
predicted = torch.max(predicted.data,1)[1]
confusionMatrixPrint(predicted.to('cpu').numpy(),y_train.to('cpu').numpy(),0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 

         True Positive =  129            True Negetive =  66
        False Positive =  32           False Negetive =  994

------------------------------------------------------------------------------------

  Total Cases : 1221

Accuracy  :  0.9197379197379197
Precision :  0.8012422360248447
Recall    :  0.6615384615384615
F1 Score  :  0.7247191011235955

------------------------------------------------------------------------------------


In [139]:
predicted = model(x_test).to(device)
predicted = torch.max(predicted.data,1)[1]
confusionMatrixPrint(predicted.to('cpu').numpy(),y_test.to('cpu').numpy(),1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  25            True Negetive =  15
        False Positive =  10           False Negetive =  256

------------------------------------------------------------------------------------

  Total Cases : 306

Accuracy  :  0.9183006535947712
Precision :  0.7142857142857143
Recall    :  0.625
F1 Score  :  0.6666666666666666

------------------------------------------------------------------------------------


In [332]:
val_acc_list.clear()
training_acc_list.clear()
model_list.clear()

# 5. Nueral Network Based Classifier on Bag Words Technique

## LSTM Based Network of :
     - Input Layer : 18, 256
     - LSTM Layer 1 : 256 , 256
     - LSTM Layer 2 : 256 , 256
     - LSTM Layer 3 : 256 , 256
     - LSTM Layer 4 : 256 , 256
     - LSTM Layer 5 : 256 , 256
     - LSTM Layer 6 : 256 , 256
     - LSTM Layer 7 : 256 , 256
     - LSTM Layer 8 : 256 , 256
     - LSTM Layer 9 : 256 , 256
     - LSTM Layer 10 : 256 , 256
     - LSTM Layer 11 : 256 , 256
     - LSTM Layer 12 : 256 , 256
     - LSTM Layer 13 : 256 , 256
     - LSTM Layer 14 : 256 , 256
     - LSTM Layer 15 : 256 , 256
     - LSTM Layer 16 : 256 , 256
     - Output Layer  : 256 , 2

In [17]:
import torch.nn as nn
import warnings
import torch

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from filters import train_test_splitter, get_users_dataframe

In [19]:
def get_Device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = get_Device()
print(device)

cuda


In [20]:
df = get_users_dataframe()

In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.tweet_text)
print(f"{len(tokenizer.word_index)} words are used\n")

counts = tokenizer.word_counts
print(len(counts))

6533 words are used

6533


In [22]:
word_size=7000
vocab_size = word_size
tokenizer = Tokenizer(num_words=word_size)

tokenizer.fit_on_texts(df.tweet_text)
tokenized = tokenizer.texts_to_sequences(df.tweet_text)

In [23]:
lengths = [len(s) for s in tokenized]
print(f"Average words length of all tweets>> {np.mean(lengths)}")
print(f"Maximum words length of a tweet_text >> {np.max(lengths)}")

sequence_size = 18
print(f"Pad all sequences into size of {sequence_size}")

padded = pad_sequences(tokenized,maxlen=sequence_size,padding='post',truncating='post')


Average words length of all tweets>> 13.114906832298137
Maximum words length of a tweet_text >> 40
Pad all sequences into size of 18


In [24]:
x_train, x_test, y_train, y_test = train_test_split(padded, df.Annotation.values, test_size=0.20)

In [25]:
x_train.shape, x_test.shape

((1288, 18), (322, 18))

In [26]:
y_train.shape, y_test.shape

((1288,), (322,))

In [27]:
x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])
x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1])

In [30]:
x_train[0]

array([[74, 75, 76, 77, 78, 65, 79, 80, 60, 81, 16, 72,  9, 10, 57,  0,
         0,  0]])

In [344]:
x_train = torch.Tensor(x_train).to(device)
y_train = torch.Tensor(y_train).to(device)
x_test = torch.Tensor(x_test).to(device)
y_test = torch.Tensor(y_test).to(device)
y_train = y_train.to(torch.long)

In [345]:
input_size = 18
output_size = 2
hidden_size = 32
num_layers = 4
learning_rate = 0.0001
n_epochs = 512

In [346]:
class NeuralNetworkClassifierModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(NeuralNetworkClassifierModel, self).__init__()
        
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        
        self.fc = nn.Linear(hidden_size, output_size)
        

    def forward(self, inputs):
        h0 = torch.zeros(self.num_layers, inputs.size(1), self.hidden_size).to(device) 
        c0 =  torch.zeros(self.num_layers, inputs.size(1), self.hidden_size).to(device) 
        
        out, _ = self.lstm(inputs, (h0, c0))  
        out = out[:, -1, :]
        
        out = self.fc(out)
        
        return out

In [347]:
model = NeuralNetworkClassifierModel(input_size = input_size,
                                     hidden_size = hidden_size,
                                     num_layers = num_layers,
                                     output_size = output_size)

In [348]:
model.to(device)

NeuralNetworkClassifierModel(
  (lstm): LSTM(18, 32, num_layers=4)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

In [349]:
lossfn = torch.nn.CrossEntropyLoss()
lossfn.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [350]:
def Validate(x_test, y_test):
    predicted = model(x_test).to(device)
    pred = torch.max(predicted.data,1)[1]
    total_test = len(y_test)
    correct_pred = 0
    for i in range(total_test):
        if y_test[i] == pred[i]:
            correct_pred += 1
    return correct_pred/total_test

In [351]:
valid_acc_list = []
vaild_acc_list = []
model_list = []
for epoch in range(n_epochs):

    predicted = model(x_train).to(device)
    

    loss = lossfn(predicted,y_train)
        
    optimizer.zero_grad()
    loss.backward()
    
    optimizer.step()
    val_acc = Validate(x_test,y_test.to(torch.int))
    
    print(f'Epoch [{epoch+1}/{n_epochs}] Training-Loss = {loss.item():.4f} Train-Accuracy = {1-loss.item():.4f} Valid-Accuracy = {val_acc:.4f}')
    val_acc_list.append(val_acc)
    training_acc_list.append(1-loss.item())
    model_list.append(model)

Epoch [1/512] Training-Loss = 0.8335 Train-Accuracy = 0.1665 Valid-Accuracy = 0.1491
Epoch [2/512] Training-Loss = 0.8325 Train-Accuracy = 0.1675 Valid-Accuracy = 0.1491
Epoch [3/512] Training-Loss = 0.8315 Train-Accuracy = 0.1685 Valid-Accuracy = 0.1491
Epoch [4/512] Training-Loss = 0.8306 Train-Accuracy = 0.1694 Valid-Accuracy = 0.1491
Epoch [5/512] Training-Loss = 0.8296 Train-Accuracy = 0.1704 Valid-Accuracy = 0.1491
Epoch [6/512] Training-Loss = 0.8287 Train-Accuracy = 0.1713 Valid-Accuracy = 0.1491
Epoch [7/512] Training-Loss = 0.8277 Train-Accuracy = 0.1723 Valid-Accuracy = 0.1491
Epoch [8/512] Training-Loss = 0.8268 Train-Accuracy = 0.1732 Valid-Accuracy = 0.1491
Epoch [9/512] Training-Loss = 0.8259 Train-Accuracy = 0.1741 Valid-Accuracy = 0.1491
Epoch [10/512] Training-Loss = 0.8249 Train-Accuracy = 0.1751 Valid-Accuracy = 0.1491
Epoch [11/512] Training-Loss = 0.8240 Train-Accuracy = 0.1760 Valid-Accuracy = 0.1491
Epoch [12/512] Training-Loss = 0.8231 Train-Accuracy = 0.1769 V

Epoch [97/512] Training-Loss = 0.7181 Train-Accuracy = 0.2819 Valid-Accuracy = 0.1491
Epoch [98/512] Training-Loss = 0.7155 Train-Accuracy = 0.2845 Valid-Accuracy = 0.1491
Epoch [99/512] Training-Loss = 0.7128 Train-Accuracy = 0.2872 Valid-Accuracy = 0.1491
Epoch [100/512] Training-Loss = 0.7100 Train-Accuracy = 0.2900 Valid-Accuracy = 0.1491
Epoch [101/512] Training-Loss = 0.7072 Train-Accuracy = 0.2928 Valid-Accuracy = 0.1491
Epoch [102/512] Training-Loss = 0.7042 Train-Accuracy = 0.2958 Valid-Accuracy = 0.1739
Epoch [103/512] Training-Loss = 0.7012 Train-Accuracy = 0.2988 Valid-Accuracy = 0.2888
Epoch [104/512] Training-Loss = 0.6981 Train-Accuracy = 0.3019 Valid-Accuracy = 0.3789
Epoch [105/512] Training-Loss = 0.6948 Train-Accuracy = 0.3052 Valid-Accuracy = 0.4907
Epoch [106/512] Training-Loss = 0.6914 Train-Accuracy = 0.3086 Valid-Accuracy = 0.5714
Epoch [107/512] Training-Loss = 0.6879 Train-Accuracy = 0.3121 Valid-Accuracy = 0.6677
Epoch [108/512] Training-Loss = 0.6843 Train-A

Epoch [192/512] Training-Loss = 0.4296 Train-Accuracy = 0.5704 Valid-Accuracy = 0.8509
Epoch [193/512] Training-Loss = 0.4293 Train-Accuracy = 0.5707 Valid-Accuracy = 0.8509
Epoch [194/512] Training-Loss = 0.4291 Train-Accuracy = 0.5709 Valid-Accuracy = 0.8509
Epoch [195/512] Training-Loss = 0.4289 Train-Accuracy = 0.5711 Valid-Accuracy = 0.8509
Epoch [196/512] Training-Loss = 0.4287 Train-Accuracy = 0.5713 Valid-Accuracy = 0.8509
Epoch [197/512] Training-Loss = 0.4286 Train-Accuracy = 0.5714 Valid-Accuracy = 0.8509
Epoch [198/512] Training-Loss = 0.4284 Train-Accuracy = 0.5716 Valid-Accuracy = 0.8509
Epoch [199/512] Training-Loss = 0.4283 Train-Accuracy = 0.5717 Valid-Accuracy = 0.8509
Epoch [200/512] Training-Loss = 0.4281 Train-Accuracy = 0.5719 Valid-Accuracy = 0.8509
Epoch [201/512] Training-Loss = 0.4280 Train-Accuracy = 0.5720 Valid-Accuracy = 0.8509
Epoch [202/512] Training-Loss = 0.4279 Train-Accuracy = 0.5721 Valid-Accuracy = 0.8509
Epoch [203/512] Training-Loss = 0.4279 Trai

Epoch [288/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [289/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [290/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [291/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [292/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [293/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [294/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [295/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [296/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [297/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [298/512] Training-Loss = 0.4271 Train-Accuracy = 0.5729 Valid-Accuracy = 0.8509
Epoch [299/512] Training-Loss = 0.4271 Trai

Epoch [383/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [384/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [385/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [386/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [387/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [388/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [389/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [390/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [391/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [392/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [393/512] Training-Loss = 0.4269 Train-Accuracy = 0.5731 Valid-Accuracy = 0.8509
Epoch [394/512] Training-Loss = 0.4269 Trai

Epoch [479/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [480/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [481/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [482/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [483/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [484/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [485/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [486/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [487/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [488/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [489/512] Training-Loss = 0.4266 Train-Accuracy = 0.5734 Valid-Accuracy = 0.8509
Epoch [490/512] Training-Loss = 0.4266 Trai

In [352]:
print("Maximum Training Accuracy = ", max(training_acc_list))
print("Maximum Validation Accuracy = ",max(val_acc_list) )
model = model_list[training_acc_list.index(max(training_acc_list))]

Maximum Training Accuracy =  0.5734947621822357
Maximum Validation Accuracy =  0.8509316770186336


In [353]:
predicted = model(x_train).to(device)
predicted = torch.max(predicted.data,1)[1]
confusionMatrixPrint(predicted.to('cpu').numpy(),y_train.to('cpu').numpy(),0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 
         True Positive =  0            True Negetive =  195
        False Positive =  2           False Negetive =  1091

------------------------------------------------------------------------------------

  Total Cases : 1288

Accuracy  :  0.8470496894409938


ZeroDivisionError: float division by zero

In [354]:
predicted = model(x_test).to(device)
predicted = torch.max(predicted.data,1)[1]
confusionMatrixPrint(predicted.to('cpu').numpy(),y_test.to('cpu').numpy(),1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  1            True Negetive =  47
        False Positive =  1           False Negetive =  273

------------------------------------------------------------------------------------

  Total Cases : 322

Accuracy  :  0.8509316770186336
Precision :  0.5
Recall    :  0.020833333333333332
F1 Score  :  0.039999999999999994

------------------------------------------------------------------------------------


In [331]:
valid_acc_list.clear()
training_acc_list.clear()
model_list.clear()

# 4. Nueral Network Based Classifier on Bag of Words

## Nueral Network of :
     - Input Layer
     - Embedding Layer
     - Bidirectional LSTM
     - BatchNormalization
     - Bidirectional LSTM
     - BatchNormalization
     - Bidirectional LSTM
     - Flatten Layer
     - Dense Layer
     - Dense Layer
     - Dropout Layer
     - Dense Layer
     - Output Layer

In [267]:
from keras.layers import Input,Embedding,Bidirectional,LSTM,BatchNormalization,Dense,GlobalMaxPool1D,Dropout,Masking,Flatten
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from filters import train_test_splitter, get_users_dataframe

In [268]:
df = get_users_dataframe()

In [269]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.tweet_text)
print(f"{len(tokenizer.word_index)} words are used\n")

counts = tokenizer.word_counts
print(len(counts))

6533 words are used

6533


In [270]:
word_size=7000
vocab_size = word_size
tokenizer = Tokenizer(num_words=word_size)

tokenizer.fit_on_texts(df.tweet_text)
tokenized = tokenizer.texts_to_sequences(df.tweet_text)

In [271]:
lengths = [len(s) for s in tokenized]
print(f"Average words length of all tweets>> {np.mean(lengths)}")
print(f"Maximum words length of a tweet_text >> {np.max(lengths)}")

sequence_size = 18
print(f"Pad all sequences into size of {sequence_size}")

padded = pad_sequences(tokenized,maxlen=sequence_size,padding='post',truncating='post')

Average words length of all tweets>> 13.114906832298137
Maximum words length of a tweet_text >> 40
Pad all sequences into size of 18


In [272]:
x_train, x_test, y_train, y_test = train_test_split(padded, df.Annotation.values, test_size=0.20)

In [273]:
x_train.shape, x_test.shape,y_train.shape, y_test.shape

((1288, 18), (322, 18), (1288,), (322,))

In [274]:
y_train = to_categorical(y_train,num_classes=2)

In [275]:
y_train.shape

(1288, 2)

In [276]:
word_vec_size=20
hidden_size=128

In [277]:
def get_bilstm_model():
    model = Sequential()
    model.add(Input(shape=[sequence_size]))
    model.add(Embedding(vocab_size,word_vec_size,input_length=sequence_size))
    
    
    model.add(Bidirectional(LSTM(hidden_size,return_sequences=True)))
    model.add(BatchNormalization())
    model.add(Bidirectional(LSTM(int(hidden_size/2),return_sequences=True)))
    model.add(BatchNormalization())
    model.add(Bidirectional(LSTM(int(hidden_size/2),return_sequences=True)))
    
    
    model.add(Flatten())
    model.output_shape
    model.add(Dense(256,activation='relu'))
    model.add(Dense(128,activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(2,activation='softmax'))
    
    # model = keras.models.Model(X,Y)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [278]:
es = EarlyStopping(monitor='val_accuracy',mode='min',patience=4,verbose=1)

In [279]:
model = get_bilstm_model()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 18, 20)            140000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 18, 256)           152576    
_________________________________________________________________
batch_normalization_2 (Batch (None, 18, 256)           1024      
_________________________________________________________________
bidirectional_4 (Bidirection (None, 18, 128)           164352    
_________________________________________________________________
batch_normalization_3 (Batch (None, 18, 128)           512       
_________________________________________________________________
bidirectional_5 (Bidirection (None, 18, 128)           98816     
_________________________________________________________________
flatten_1 (Flatten)          (None, 2304)             

In [285]:
hist = model.fit(x_train,y_train,epochs=100,batch_size = 256, validation_split=0.2, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping


In [286]:
ev = model.evaluate(x_test,to_categorical(y_test,num_classes=2))
ev



[0.6090861558914185, 0.8850931525230408]

In [287]:
max(hist.history['val_accuracy'])

0.8488371968269348

In [288]:
pred = model(x_test)
pred = np.argmax(pred, axis=-1)   

In [289]:
total_test = len(y_test)
correct_pred = 0
wrong_pred = 0

for i in range(total_test):
    if y_test[i] == pred[i]:
        correct_pred += 1
    else:
        wrong_pred += 1

print(f"Total {correct_pred} correct predictions out of {total_test}")
print(f"Total {wrong_pred} wrong predictions out of {total_test}")

print("Accuracy = ", (correct_pred/total_test)*100, " %")

Total 285 correct predictions out of 322
Total 37 wrong predictions out of 322
Accuracy =  88.50931677018633  %


In [290]:
confusionMatrixPrint(np.argmax(model(x_train), axis=-1),np.argmax(y_train,axis = -1),0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 
         True Positive =  127            True Negetive =  71
        False Positive =  27           False Negetive =  1063

------------------------------------------------------------------------------------

  Total Cases : 1288

Accuracy  :  0.9239130434782609
Precision :  0.8246753246753247
Recall    :  0.6414141414141414
F1 Score  :  0.7215909090909091

------------------------------------------------------------------------------------


In [291]:
confusionMatrixPrint(np.argmax(model(x_test), axis=-1),y_test,1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  14            True Negetive =  31
        False Positive =  6           False Negetive =  271

------------------------------------------------------------------------------------

  Total Cases : 322

Accuracy  :  0.8850931677018633
Precision :  0.7
Recall    :  0.3111111111111111
F1 Score  :  0.43076923076923074

------------------------------------------------------------------------------------


# Word Embeddings

In [1]:
import gensim
from keras.layers import Input,Embedding,Bidirectional,LSTM,BatchNormalization,Dense,GlobalMaxPool1D,Dropout,Masking,Flatten
from keras.callbacks import EarlyStopping
from keras.utils import plot_model
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from filters import train_test_splitter, get_users_dataframe

In [2]:
df = get_users_dataframe()
tweet_text_list = list(df.tweet_text)
tweet_text_list = [x.split(' ') for x in tweet_text_list]
max_len = max([len(x) for x in tweet_text_list])

In [3]:
embedding_size = 64
hidden_size=128

In [4]:
word_embeddings = gensim.models.Word2Vec(sentences = tweet_text_list, vector_size = embedding_size,workers = 4, min_count = 1)
total_words = len(list(word_embeddings.wv.index_to_key)) + 1

In [5]:
word_embeddings.wv.most_similar('covid', topn=10) 

[('new', 0.9470752477645874),
 ('us', 0.9453030228614807),
 ('vaccine', 0.9266708493232727),
 ('coronavirus', 0.9239075779914856),
 ('day', 0.922370195388794),
 ('people', 0.9219273328781128),
 ('one', 0.9213962554931641),
 ('time', 0.9146817922592163),
 ('get', 0.9102112650871277),
 ('positive', 0.9063314199447632)]

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.tweet_text)
print(f"{len(tokenizer.word_index)} words are used\n")

counts = tokenizer.word_counts
print(len(counts))
word_size=7000
vocab_size = word_size
tokenizer = Tokenizer(num_words=word_size)

tokenizer.fit_on_texts(df.tweet_text)
tokenized = tokenizer.texts_to_sequences(df.tweet_text)
lengths = [len(s) for s in tokenized]
print(f"Average words length of all tweets>> {np.mean(lengths)}")
print(f"Maximum words length of a tweet_text >> {np.max(lengths)}")

sequence_size = 18
print(f"Pad all sequences into size of {sequence_size}")

padded = pad_sequences(tokenized,maxlen=sequence_size,padding='pre',truncating='pre')

6533 words are used

6533
Average words length of all tweets>> 13.114906832298137
Maximum words length of a tweet_text >> 40
Pad all sequences into size of 18


In [12]:
tokenized[-1]

[74, 75, 76, 77, 78, 65, 79, 80, 60, 81, 16, 72, 9, 10, 57]

In [14]:
padded[-1]

array([ 0,  0,  0, 74, 75, 76, 77, 78, 65, 79, 80, 60, 81, 16, 72,  9, 10,
       57])

In [9]:
embedding_matrix = np.zeros((total_words,embedding_size))

In [10]:
for idx,word in enumerate(list(word_embeddings.wv.index_to_key)):
    emb_vec = word_embeddings.wv[word]
    embedding_matrix[idx+1] = emb_vec
        
print("Embedding Matrix Shape : ",embedding_matrix.shape)

Embedding Matrix Shape :  (6534, 64)


In [80]:
model = Sequential()
model.add(Input(shape=[sequence_size]))
model.add(Embedding(total_words, embedding_size, weights = [embedding_matrix], input_length = sequence_size, trainable = False))
model.add(Bidirectional(LSTM(hidden_size,return_sequences=True)))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(int(hidden_size/2),return_sequences=True)))
model.add(BatchNormalization())
model.add(Bidirectional(LSTM(int(hidden_size/2),return_sequences=True)))

model.add(Flatten())
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(2,activation='softmax'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [81]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 18, 64)            418176    
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 18, 256)           197632    
_________________________________________________________________
batch_normalization_10 (Batc (None, 18, 256)           1024      
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 18, 128)           164352    
_________________________________________________________________
batch_normalization_11 (Batc (None, 18, 128)           512       
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 18, 128)           98816     
_________________________________________________________________
flatten_5 (Flatten)          (None, 2304)             

In [85]:
x_train, x_test, y_train, y_test = train_test_split(padded, df.Annotation.values, test_size=0.20)
x_train.shape, x_test.shape,y_train.shape, y_test.shape

((1288, 18), (322, 18), (1288,), (322,))

In [87]:
y_train = to_categorical(df.Annotation.values,num_classes=2)

In [88]:
es = EarlyStopping(monitor='val_accuracy',mode='min',patience=7,verbose=1)
hist = model.fit(x_train,y_train,epochs=100,batch_size = 256, validation_split=0.2, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping


In [91]:
confusionMatrixPrint(np.argmax(model(x_train), axis=-1),np.argmax(y_train,axis = -1),0)

------------------------------------------------------------------------------------

  Confusion Matrix for Train Data : 
         True Positive =  0            True Negetive =  198
        False Positive =  0           False Negetive =  1090

------------------------------------------------------------------------------------

  Total Cases : 1288

Accuracy  :  0.8462732919254659

------------------------------------------------------------------------------------


In [90]:
confusionMatrixPrint(np.argmax(model(x_test), axis=-1),y_test,1)

------------------------------------------------------------------------------------

  Confusion Matrix for Validation Data 

         True Positive =  0            True Negetive =  55
        False Positive =  0           False Negetive =  267

------------------------------------------------------------------------------------

  Total Cases : 322

Accuracy  :  0.8291925465838509

------------------------------------------------------------------------------------
