# Setup

Get the Data & Install Requirements

In [None]:
# Download Datasets
!wget https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/train/lcp_single_train.tsv
!wget https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/test-labels/lcp_single_test.tsv
!wget https://raw.githubusercontent.com/MMU-TDMLab/CompLex/master/trial/lcp_single_trial.tsv

In [None]:
# Install Dependencies
!pip install sentence-transformers
!pip install transformers
!pip install textstat

In [None]:
# Download & Unzip GloVe_42B Uncased Embeddings
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip

Import Libraries


In [5]:
import os
import sys
import csv
import copy
from tqdm import tqdm, trange

import math
import numpy as np
import pandas as pd

import transformers
from transformers import BertTokenizer, BertModel

from sentence_transformers import SentenceTransformer

import string
from collections import Counter
import re

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import spacy 
import textstat
from textstat.textstat import textstatistics,legacy_round
import textstat.textstat as textstat

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler

from scipy import stats

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Load Data

In [6]:
traindf = pd.read_csv("lcp_single_train.tsv", encoding='utf-8', delimiter='\t', quotechar='\t', keep_default_na=False)
testdf = pd.read_csv("lcp_single_test.tsv", encoding='utf-8', delimiter='\t', quotechar='\t', keep_default_na=False)
trialdf = pd.read_csv("lcp_single_trial.tsv", encoding='utf-8', delimiter='\t', quotechar='\t', keep_default_na=False)
print(traindf.shape)
print(trialdf.shape)
print(testdf.shape)

(7662, 5)
(421, 5)
(917, 5)


In [18]:
# To send inputs (Mostly Tensors) to GPU as well
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

cuda


# Preprocessing

Load Pretrained GloVe Embeddings

In [7]:
glove_embeddings_dict = {}
with open("glove.42B.300d.txt") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        glove_embeddings_dict[word] = vector

# unk_300d is a representation for Unknown tokens
unk_300d = "0.22418134 -0.28881392 0.13854356 0.00365387 -0.12870757 0.10243822 0.061626635 0.07318011 -0.061350107 -1.3477012 0.42037755 -0.063593924 -0.09683349 0.18086134 0.23704372 0.014126852 0.170096 -1.1491593 0.31497982 0.06622181 0.024687296 0.076693475 0.13851812 0.021302193 -0.06640582 -0.010336159 0.13523154 -0.042144544 -0.11938788 0.006948221 0.13333307 -0.18276379 0.052385733 0.008943111 -0.23957317 0.08500333 -0.006894406 0.0015864656 0.063391194 0.19177166 -0.13113557 -0.11295479 -0.14276934 0.03413971 -0.034278486 -0.051366422 0.18891625 -0.16673574 -0.057783455 0.036823478 0.08078679 0.022949161 0.033298038 0.011784158 0.05643189 -0.042776518 0.011959623 0.011552498 -0.0007971594 0.11300405 -0.031369694 -0.0061559738 -0.009043574 -0.415336 -0.18870236 0.13708843 0.005911723 -0.113035575 -0.030096142 -0.23908928 -0.05354085 -0.044904727 -0.20228513 0.0065645403 -0.09578946 -0.07391877 -0.06487607 0.111740574 -0.048649278 -0.16565254 -0.052037314 -0.078968436 0.13684988 0.0757494 -0.006275573 0.28693774 0.52017444 -0.0877165 -0.33010918 -0.1359622 0.114895485 -0.09744406 0.06269521 0.12118575 -0.08026362 0.35256687 -0.060017522 -0.04889904 -0.06828978 0.088740796 0.003964443 -0.0766291 0.1263925 0.07809314 -0.023164088 -0.5680669 -0.037892066 -0.1350967 -0.11351585 -0.111434504 -0.0905027 0.25174105 -0.14841858 0.034635577 -0.07334565 0.06320108 -0.038343467 -0.05413284 0.042197507 -0.090380974 -0.070528865 -0.009174437 0.009069661 0.1405178 0.02958134 -0.036431845 -0.08625681 0.042951006 0.08230793 0.0903314 -0.12279937 -0.013899368 0.048119213 0.08678239 -0.14450377 -0.04424887 0.018319942 0.015026873 -0.100526 0.06021201 0.74059093 -0.0016333034 -0.24960588 -0.023739101 0.016396184 0.11928964 0.13950661 -0.031624354 -0.01645025 0.14079992 -0.0002824564 -0.08052984 -0.0021310581 -0.025350995 0.086938225 0.14308536 0.17146006 -0.13943303 0.048792403 0.09274929 -0.053167373 0.031103406 0.012354865 0.21057427 0.32618305 0.18015954 -0.15881181 0.15322933 -0.22558987 -0.04200665 0.0084689725 0.038156632 0.15188617 0.13274793 0.113756925 -0.095273495 -0.049490947 -0.10265804 -0.27064866 -0.034567792 -0.018810693 -0.0010360252 0.10340131 0.13883452 0.21131058 -0.01981019 0.1833468 -0.10751636 -0.03128868 0.02518242 0.23232952 0.042052146 0.11731903 -0.15506615 0.0063580726 -0.15429358 0.1511722 0.12745973 0.2576985 -0.25486213 -0.0709463 0.17983761 0.054027 -0.09884228 -0.24595179 -0.093028545 -0.028203879 0.094398156 0.09233813 0.029291354 0.13110267 0.15682974 -0.016919162 0.23927948 -0.1343307 -0.22422817 0.14634751 -0.064993896 0.4703685 -0.027190214 0.06224946 -0.091360025 0.21490277 -0.19562101 -0.10032754 -0.09056772 -0.06203493 -0.18876675 -0.10963594 -0.27734384 0.12616494 -0.02217992 -0.16058226 -0.080475815 0.026953284 0.110732645 0.014894041 0.09416802 0.14299914 -0.1594008 -0.066080004 -0.007995227 -0.11668856 -0.13081996 -0.09237365 0.14741232 0.09180138 0.081735 0.3211204 -0.0036552632 -0.047030564 -0.02311798 0.048961394 0.08669574 -0.06766279 -0.50028914 -0.048515294 0.14144728 -0.032994404 -0.11954345 -0.14929578 -0.2388355 -0.019883996 -0.15917352 -0.052084364 0.2801028 -0.0029121689 -0.054581646 -0.47385484 0.17112483 -0.12066923 -0.042173345 0.1395337 0.26115036 0.012869649 0.009291686 -0.0026459037 -0.075331464 0.017840583 -0.26869613 -0.21820338 -0.17084768 -0.1022808 -0.055290595 0.13513643 0.12362477 -0.10980586 0.13980341 -0.20233242 0.08813751 0.3849736 -0.10653763 -0.06199595 0.028849555 0.03230154 0.023856193 0.069950655 0.19310954 -0.077677034 -0.144811"
unk_300d = np.array([float(num) for num in unk_300d.split()])

Save a set of Stopwords

In [8]:
stop_words = set(stopwords.words('english'))

Preprocessing for getting GloVe Embeddings

In [9]:
def preprocessor_glove(s):
  s = s.lower()

  exclude = set(string.punctuation)
  s = ''.join(ch if ch not in exclude else ' ' for ch in s)

  wlist = s.split()

  return wlist

Set up Tokenization (BERT)

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Function to find Target token positions in sentence token-list

In [11]:
def tok2pos(sent_toklist, tw):
  currparse_pos = 0
  curr_subset = False

  poslist = []

  for i,tok in enumerate(sent_toklist):  
    if not curr_subset and tw.startswith(tok):
      if tw == tok and ( i+1 == len(sent_toklist) or not (sent_toklist[i+1]).startswith('##')):
        poslist.append(i)
        break
      else:
        poslist.append(i)
        currparse_pos = len(tok)
        curr_subset = True
    
    elif curr_subset and not tok.startswith('##'):
      currparse_pos = 0
      curr_subset = False
      poslist = []
      if tw.startswith(tok):
        if tw == tok and ( i+1 == len(sent_toklist) or not (sent_toklist[i+1]).startswith('##')):
          poslist.append(i)
          break
        else:
          poslist.append(i)
          currparse_pos = len(tok)
          curr_subset = True

    elif curr_subset and tok.startswith('##') and (tw[currparse_pos:]).startswith(tok[2:]):
      if tw[currparse_pos:] == tok[2:] and ( i+1 == len(sent_toklist) or not (sent_toklist[i+1]).startswith('##')):
        poslist.append(i)
        break
      else:
        poslist.append(i)
        currparse_pos += len(tok)-2
    
  return poslist

In [14]:
def find_twpos(df):
  sentlist = list(df['sentence'])
  twordlist = list(df['token'])
  poslist_list = []
  maxlen = 0
  i = 0
  for tw, sent in zip(twordlist, sentlist):
    sent_toklist = tokenizer.tokenize(sent)
    
    poslist = tok2pos(sent_toklist, tw.lower())
    
    if poslist==[]:
      print(i,tw,sent)
      # Take central position of sentence if target word not found 
      poslist_list.append([len(sent_toklist)//2])
    
    else:
      poslist_list.append(poslist)
    
    maxlen = max(maxlen, len(sent_toklist))
    i += 1
  
  print("Max:", maxlen)
  df['target_tokpos'] = poslist_list

In [15]:
find_twpos(traindf)
find_twpos(trialdf)
find_twpos(testdf)

5079 N Superficial and deep anterior cortical staining was grossly different between alphaA/BKO (Figure 3A,3D,3F and 3I) and wild type lenses (Figure 3K,3N,3P and 3S).
Max: 302
Max: 170
522 P Superficial and deep anterior cortical staining was grossly different between alphaA/BKO (Figure 3A,3D,3F and 3I) and wild type lenses (Figure 3K,3N,3P and 3S).
Max: 313


# Feature Extraction

## Extract Various Word Embeddings

#### Extract Contextual BERT embeddings

In [None]:
bertmodel = BertModel.from_pretrained('bert-base-uncased')
bertmodel.eval()
bertmodel.cuda()

In [19]:
def getwordfeat_bert(df):
  sentlist = list(df['sentence'])
  twordlist = list(df['token'])
  twposlist_list = list(df['target_tokpos'])

  featvec_list = []
  
  for tw, sent, twposlist in zip(twordlist, sentlist, twposlist_list):
    with torch.no_grad():
      inp_embeds = tokenizer(sent, return_tensors="pt").to(DEVICE)
      out_states = bertmodel(**inp_embeds, output_hidden_states=True)
      hlayers = out_states.hidden_states

      # Based on assumption, last hidden layer is often geared towards a specific classification task the BERT was trained on.... 
      # Choosing the 2nd last layer instead to generalize better
      hlayer_minus2 = hlayers[-2]

      # All layers of BERT vectors are of size 768
      embed_t = torch.zeros(1,768, device=DEVICE)
      word_embed = torch.zeros(768, device=DEVICE)

      # Word embedding = Avg of its constituent subword embeddings
      for pos in twposlist:
        word_embed += hlayer_minus2[0][pos+1]
      word_embed = word_embed / len(twposlist)

      embed_t[0] += word_embed

    featvec_list.append(embed_t)
  
  featmat_t = torch.cat(featvec_list)
  featmat = np.array(featmat_t.detach().cpu())

  return featmat

In [21]:
bertword_train = getwordfeat_bert(traindf)
bertword_trial = getwordfeat_bert(trialdf)
bertword_test = getwordfeat_bert(testdf)

In [22]:
print(list(set(list(np.where(np.isnan(bertword_train))[0]))))
print(list(set(list(np.where(np.isnan(bertword_trial))[0]))))
print(list(set(list(np.where(np.isnan(bertword_test))[0]))))

[]
[]
[]


#### Extract GloVe Word Embeddings

In [28]:
def getwordfeat_glove(df):
  sentlist = list(df['sentence'])
  twordlist = list(df['token'])

  featvec_list = []
  
  for tw, sent in zip(twordlist, sentlist):
    tw = tw.strip().lower()
    toklist = preprocessor_glove(sent)
    
    embed_t = torch.zeros(1,300, device=DEVICE)

    word_glove = torch.zeros(300, device=DEVICE)
    valid_wocc = 0
    
    for k,token in enumerate(toklist):
      if token == tw:
        valid_wocc += 1
        if token in glove_embeddings_dict:
          word_glove += torch.Tensor(glove_embeddings_dict[token]).to(DEVICE)
        else:
          word_glove += torch.Tensor(unk_300d).to(DEVICE)

    if valid_wocc != 0:
      word_glove = word_glove / valid_wocc

    else:
      word_glove += torch.Tensor(unk_300d).to(DEVICE)
    
    embed_t[0] += word_glove
    featvec_list.append(embed_t)
  
  featmat_t = torch.cat(featvec_list)
  featmat = np.array(featmat_t.detach().cpu())

  return featmat

In [29]:
gloveword_train = getwordfeat_glove(traindf)
gloveword_trial = getwordfeat_glove(trialdf)
gloveword_test = getwordfeat_glove(testdf)

In [30]:
print(list(set(list(np.where(np.isnan(gloveword_train))[0]))))
print(list(set(list(np.where(np.isnan(gloveword_trial))[0]))))
print(list(set(list(np.where(np.isnan(gloveword_test))[0]))))

[]
[]
[]


## Extract Various Sentence Embeddings

#### Pooled Output (Averaging 2 methods)

In [31]:
def getsentfeat_bert(df):
  sentlist = list(df['sentence'])
  twordlist = list(df['token'])

  featvec_list = []
  
  for tw, sent in zip(twordlist, sentlist):
    tw = tw.strip().lower()
    
    with torch.no_grad():
      inp_embeds = tokenizer(sent, return_tensors="pt").to(DEVICE)
      out_states = bertmodel(**inp_embeds, output_hidden_states=True)
      hlayers = out_states.hidden_states

      # Based on assumption, last hidden layer is often geared towards a specific classification task the BERT was trained on.... 
      # Choosing the 2nd last layer instead to generalize better
      hlayer_minus2 = hlayers[-2]

      embed_t = torch.zeros(1,768, device=DEVICE)
      
      pooled_embed = torch.zeros(768, device=DEVICE)

      for w_e in hlayer_minus2[0]:
         pooled_embed += w_e
      pooled_embed = pooled_embed/((hlayer_minus2[0].shape)[0])

      # Also taking the Standard BERT-Pooler output, ie, last 4-layers CLS combination
      pooled_embed += out_states.pooler_output[0]
      pooled_embed = pooled_embed / 2

      embed_t[0] += pooled_embed

    featvec_list.append(embed_t)
  
  featmat_t = torch.cat(featvec_list)
  featmat = np.array(featmat_t.detach().cpu())

  return featmat

In [32]:
bertsent_train = getsentfeat_bert(traindf)
bertsent_trial = getsentfeat_bert(trialdf)
bertsent_test = getsentfeat_bert(testdf)

In [33]:
print(list(set(list(np.where(np.isnan(bertsent_train))[0]))))
print(list(set(list(np.where(np.isnan(bertsent_trial))[0]))))
print(list(set(list(np.where(np.isnan(bertsent_test))[0]))))

[]
[]
[]


#### Average GloVe

In [34]:
def getsentfeat_glove(df):
  sentlist = list(df['sentence'])
  twordlist = list(df['token'])

  featvec_list = []
  missing = 0
  total = 0
  
  for tw, sent in zip(twordlist, sentlist):
    embed_t = torch.zeros(1,300, device=DEVICE)
    
    toklist = preprocessor_glove(sent)
      
    avg_glove = torch.zeros(300, device=DEVICE)
      
    valid_tcount = 0
    
    for k,token in enumerate(toklist):
      total += 1
      if token in glove_embeddings_dict:
        avg_glove += torch.Tensor(glove_embeddings_dict[token]).to(DEVICE)
        valid_tcount += 1
      else:
        missing += 1

    if valid_tcount == 0:
      print("what")
    
    else:
      avg_glove = avg_glove/valid_tcount
      
    embed_t[0] += avg_glove
    
    featvec_list.append(embed_t)
  

  print("Missing:", missing, "out of", total)
  
  featmat_t = torch.cat(featvec_list)
  featmat = np.array(featmat_t.detach().cpu())

  return featmat

In [36]:
glovesent_train = getsentfeat_glove(traindf)
glovesent_trial = getsentfeat_glove(trialdf)
glovesent_test = getsentfeat_glove(testdf)

Missing: 1667 out of 212261
Missing: 91 out of 11922
Missing: 187 out of 25622


In [37]:
print(list(set(list(np.where(np.isnan(glovesent_train))[0]))))
print(list(set(list(np.where(np.isnan(glovesent_trial))[0]))))
print(list(set(list(np.where(np.isnan(glovesent_test))[0]))))

[]
[]
[]


#### Paraphrase-DistilRoberta Features

In [None]:
roberta_sentmodel = SentenceTransformer('paraphrase-distilroberta-base-v1')
roberta_sentmodel.eval()
roberta_sentmodel.cuda()

In [39]:
def getsentfeat_dtrob(df):
  sentlist = list(df['sentence'])
  featvec_list = []
  
  for sent in sentlist:
    with torch.no_grad():
      embed_t = torch.zeros(1,768, device=DEVICE)
      sent_embed = roberta_sentmodel.encode(sent, device=DEVICE, convert_to_tensor=True)
      embed_t[0] += sent_embed

    featvec_list.append(embed_t)
  
  featmat_t = torch.cat(featvec_list)
  featmat = np.array(featmat_t.detach().cpu())

  return featmat

In [40]:
dtrobsent_train = getsentfeat_dtrob(traindf)
dtrobsent_trial = getsentfeat_dtrob(trialdf)
dtrobsent_test = getsentfeat_dtrob(testdf)

In [41]:
print(list(set(list(np.where(np.isnan(dtrobsent_train))[0]))))
print(list(set(list(np.where(np.isnan(dtrobsent_trial))[0]))))
print(list(set(list(np.where(np.isnan(dtrobsent_test))[0]))))

[]
[]
[]


## Extract Manual Features

In [42]:
trainmf = traindf.copy()
trialmf = trialdf.copy()
testmf = testdf.copy()

In [43]:
# Add Corpus type information (one-hot)
one_hot_corp_train = pd.get_dummies(trainmf['corpus'])
one_hot_corp_trial = pd.get_dummies(trialmf['subcorpus'])
one_hot_corp_test = pd.get_dummies(testmf['corpus'])

trainmf = trainmf.join(one_hot_corp_train)
trialmf = trialmf.join(one_hot_corp_trial)
testmf = testmf.join(one_hot_corp_test)

In [44]:
# Given a sentence, returns number of vowels
def num_vowels(sentence):
  count=0
  for c in sentence:
    if (c=='a' or c=='e' or c=='i' or c=='o' or c=='u'):
      count+=1
  return count

In [45]:
def remove_punc(sen):
  x = re.compile('['+string.punctuation+']'+'–')
  new_sen = re.sub(x, ' ', sen)
  return new_sen

#### POS Tag Extraction

In [None]:
# Get all the pos_tags possible from the training corpus
pos_tag_dic={}
pos_result=set()
pos_result.add('None')

In [46]:
def pos_tagger_old(txt):
  txt = remove_punc(txt)
  tokenized = sent_tokenize(txt)
  
  for i in tokenized:
    # Word tokenizers is used to find the words
    # and punctuation in a string
    wordsList = nltk.word_tokenize(i)

    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop_words]

    # Using a Tagger. Which is part-of-speech
    # tagger or POS-tagger.
    tagged = nltk.pos_tag(wordsList)

    count = Counter([j for i,j in tagged])
    for i in count:
      if i in pos_tag_dic.keys():
        pos_tag_dic[i]+=count[i]
      else:
        pos_tag_dic[i]=count[i]

In [47]:
for i in range(len(trainmf['sentence'])):
  pos_tagger_old(trainmf['sentence'][i])

#add a feature for each pos_tag
for i in pos_tag_dic.keys():
  trainmf["c_"+i]=[0]*len(trainmf)

In [48]:
trainmf["tw_postag"]=["None"]*len(trainmf)

In [None]:
def pos_tagger_train(row,txt,tw,df):
  txt = remove_punc(txt)
  tokenized = sent_tokenize(txt)
  
  for i in tokenized:  
    # Word tokenizers is used to find the words
    # and punctuation in a string
    wordsList = nltk.word_tokenize(i)

    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop_words]

    # Using a Tagger. Which is part-of-speech
    # tagger or POS-tagger.
    tagged = nltk.pos_tag(wordsList)

    for i,j in tagged:
      if i.lower() == tw.lower():
        df['tw_postag'][row]=j
        pos_result.add(j)
    count = Counter([j for i,j in tagged])

    new_pos_tag_dic={}
    for k in pos_tag_dic.keys():
      new_pos_tag_dic[k]=0
    for i in count:
      if i in new_pos_tag_dic.keys():
        new_pos_tag_dic[i]+=count[i]
    
    for i in new_pos_tag_dic.keys():
      df["c_"+i][row]=new_pos_tag_dic[i]

In [None]:
for i in range(len(trainmf)):
  pos_tagger_train(i,trainmf['sentence'][i],trainmf['token'][i],trainmf)

In [50]:
#getting the columns which has more than 90% data as zeros
x = list(zip(trainmf.columns,list(map(lambda i : (0.1*len(trainmf)>i),trainmf.astype(bool).sum(axis=0)))))
for name,val in x:
  if val:
    trainmf.drop(name,axis=1,inplace=True)
    del pos_tag_dic[name[2:]]

In [51]:
for k in pos_tag_dic.keys():
  trialmf["c_"+k]=[0]*len(trialmf)
  testmf["c_"+k]=[0]*len(testmf)

for i in pos_result:
  testmf[i]=[0]*len(testmf)
  trialmf[i]=[0]*len(trialmf)

In [52]:
one_hot_pos = pd.get_dummies(trainmf['tw_postag'])
# Drop column as it is now encoded
trainmf = trainmf.drop('tw_postag',axis = 1)
# Join the encoded df
trainmf = trainmf.join(one_hot_pos)

In [53]:
def pos_tagger_test(row,txt,tw,df):
  txt = remove_punc(txt)
  tokenized = sent_tokenize(txt)
  for i in tokenized:
    
    # Word tokenizers is used to find the words
    # and punctuation in a string
    wordsList = nltk.word_tokenize(i)

    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop_words]

    # Using a Tagger. Which is part-of-speech
    # tagger or POS-tagger.
    tagged = nltk.pos_tag(wordsList)

    for i,j in tagged:
      if i.lower() == tw.lower() and j in pos_result:
        if j not in df.columns:
          df[j]=[0]*len(df)
        df[j][row]=1
    count = Counter([j for i,j in tagged])

    new_pos_tag_dic={}
    for k in pos_tag_dic.keys():
        new_pos_tag_dic[k]=0
    for i in count:
      if i in new_pos_tag_dic.keys():
        new_pos_tag_dic[i]+=count[i]
    
    for i in new_pos_tag_dic.keys():
      df["c_"+i][row]=new_pos_tag_dic[i]

In [None]:
for i in range(len(testmf)):
  pos_tagger_test(i,testmf['sentence'][i],testmf['token'][i],testmf)

for i in range(len(trialmf)):
  pos_tagger_test(i,trialmf['sentence'][i],trialmf['token'][i],trialmf)

#### Readability Features Extraction

In [56]:
# Load Easy word set from Textstat
easy_word_set = textstat._textstatistics__get_lang_easy_words()

# Load pretrained SpaCy model
nlp = spacy.load('en')

In [57]:
# Splits the text into sentences, using
# Spacy's sentence segmentation
def break_sentences(text):
	doc = nlp(text)
	return doc.sents

# Returns Number of Words in the text
def word_count(text):
	sentences = break_sentences(text)
	words = 0
	for sentence in sentences:
		words += len([token for token in sentence])
	return words

# Returns the number of sentences in the text
def sentence_count(text):
  sentences = break_sentences(text)
  return len(list(sentences))


# Returns average sentence length
def avg_sentence_length(text):
	words = word_count(text)
	sentences = sentence_count(text)
	average_sentence_length = float(words / sentences)
	return average_sentence_length

# Using Textstat to calculate statistics from text to 
# determine readability, complexity and grade level of a particular corpus.
def syllables_count(word):
	return textstatistics().syllable_count(str(word))

# Returns the average number of syllables per
# word in the text
def avg_syllables_per_word(text):
	syllable = syllables_count(text)
	words = word_count(text)
	ASPW = float(syllable) / float(words)
	return legacy_round(ASPW, 1)

# Return total Difficult Words in a text
def difficult_words(text):

	# Find all words in the text
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [str(token) for token in sentence]

	# Difficult words are those with syllables >= 2
	# easy_word_set is provide by Textstat as
	# a list of common words
	diff_words_set = set()
	
	for word in words:
		syllable_count = syllables_count(word)
		if word not in easy_word_set and syllable_count >= 2:
			diff_words_set.add(word)

	return len(diff_words_set)

# A word is polysyllablic if it has more than 3 syllables
# this functions returns the number of all such words
# present in the text
def poly_syllable_count(text):
	count = 0
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [token for token in sentence]
	

	for word in words:
		syllable_count = syllables_count(word)
		if syllable_count >= 3:
			count += 1
	return count


def flesch_reading_ease(text):
	"""
		Implements Flesch Formula:
		Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)
		Here,
		ASL = average sentence length (number of words
				divided by number of sentences)
		ASW = average word length in syllables (number of syllables
				divided by number of words)
	"""
	FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
		float(84.6 * avg_syllables_per_word(text))
	return legacy_round(FRE, 2)


def gunning_fog(text):
	per_diff_words = (difficult_words(text) / word_count(text) * 100) + 5
	grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
	return grade


def smog_index(text):
	"""
		Implements SMOG Formula / Grading
		SMOG grading = 3 + ?polysyllable count.
		Here,
		polysyllable count = number of words of more
		than two syllables in a sample of 30 sentences.
	"""

	if sentence_count(text) >= 3:
		poly_syllab = poly_syllable_count(text)
		SMOG = (1.043 * (30*(poly_syllab / sentence_count(text)))**0.5) \
				+ 3.1291
		return legacy_round(SMOG, 1)
	else:
		return 0


def dale_chall_readability_score(text):
	"""
		Implements Dale Challe Formula:
		Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365
		Here,
			PDW = Percentage of difficult words.
			ASL = Average sentence length
	"""
	words = word_count(text)
	# Number of words not termed as difficult words
	count = words - difficult_words(text)
	if words > 0:

		# Percentage of words not on difficult word list

		per = float(count) / float(words) * 100
	
	# diff_words stores percentage of difficult words
	diff_words = 100 - per

	raw_score = (0.1579 * diff_words) + \
				(0.0496 * avg_sentence_length(text))
	
	# If Percentage of Difficult Words is greater than 5 %, then;
	# Adjusted Score = Raw Score + 3.6365,
	# otherwise Adjusted Score = Raw Score

	if diff_words > 5:	

		raw_score += 3.6365
		
	return legacy_round(raw_score, 2)

In [59]:
# Function add the different features
def add_feature(name,feature,df,for_sen,for_tw):
  if for_sen:
    ls = []
    for i in range(len(df['sentence'])):
      ls.append(feature(df['sentence'][i]))
    df[name+'_sent']=ls
  if for_tw:
    ls = []
    for i in range(len(df['token'])):
      ls.append(feature(df['token'][i]))
    df[name+'_tw']=ls

In [60]:
add_feature("vowel_c",num_vowels,trainmf,1,1)
add_feature("dale_chall_readability_score",dale_chall_readability_score,trainmf,1,1)
add_feature("smog_index",smog_index,trainmf,1,0)
add_feature("gunning_fog",gunning_fog,trainmf,1,0)
add_feature("flesch_reading_ease",flesch_reading_ease,trainmf,1,0)

add_feature("vowel_c",num_vowels,trialmf,1,1)
add_feature("dale_chall_readability_score",dale_chall_readability_score,trialmf,1,1)
add_feature("smog_index",smog_index,trialmf,1,0)
add_feature("gunning_fog",gunning_fog,trialmf,1,0)
add_feature("flesch_reading_ease",flesch_reading_ease,trialmf,1,0)

add_feature("vowel_c",num_vowels,testmf,1,1)
add_feature("dale_chall_readability_score",dale_chall_readability_score,testmf,1,1)
add_feature("smog_index",smog_index,testmf,1,0)
add_feature("gunning_fog",gunning_fog,testmf,1,0)
add_feature("flesch_reading_ease",flesch_reading_ease,testmf,1,0)

#### Drop Other columns, Save Features as Normalized Numpy Array

In [61]:
trainmf.drop(['id', 'corpus', 'sentence', 'token', 'complexity', 'target_tokpos'],axis=1,inplace=True)
trialmf.drop(['id', 'subcorpus', 'sentence', 'token', 'complexity', 'target_tokpos'],axis=1,inplace=True)
testmf.drop(['id', 'corpus', 'sentence', 'token', 'complexity', 'target_tokpos'],axis=1,inplace=True)

In [62]:
trainmf = trainmf.to_numpy()
trialmf = trialmf.to_numpy()
testmf = testmf.to_numpy()

# Feature Scaling Normalization for Manual Features
norm = MinMaxScaler().fit(trainmf)

trainmf = norm.transform(trainmf)
trialmf = norm.transform(trialmf)
testmf = norm.transform(testmf)

## Check Dimensions of all Extracted Features

In [64]:
print('Dataset Size (Extra Column added for Token-Position) :')
print(traindf.shape)
print(trialdf.shape)
print(testdf.shape)
print('------------------------------------------------------')
print('Size of Extracted Features :')
print('---------------')
print('Manual Featues :')
print(trainmf.shape)
print(trialmf.shape)
print(testmf.shape)
print('---------------')
print('BERT Contextual Word-Embeddings :')
print(bertword_train.shape)
print(bertword_trial.shape)
print(bertword_test.shape)
print('---------------')
print('GloVe Word-Embeddings :')
print(gloveword_train.shape)
print(gloveword_trial.shape)
print(gloveword_test.shape)
print('---------------')
print('BERT Special Pooled Sentence-Embeddings :')
print(bertsent_train.shape)
print(bertsent_trial.shape)
print(bertsent_test.shape)
print('---------------')
print('Paraphrase-DistilRoberta Sentence-Embeddings :')
print(dtrobsent_train.shape)
print(dtrobsent_trial.shape)
print(dtrobsent_test.shape)
print('---------------')
print('GloVe-Averaged Sentence-Embeddings :')
print(glovesent_train.shape)
print(glovesent_trial.shape)
print(glovesent_test.shape)

Dataset Size (Extra Column added for Token-Position) :
(7662, 6)
(421, 6)
(917, 6)
------------------------------------------------------
Size of Extracted Features :
---------------
Manual Featues :
(7662, 55)
(421, 55)
(917, 55)
---------------
BERT Contextual Word-Embeddings :
(7662, 768)
(421, 768)
(917, 768)
---------------
GloVe Word-Embeddings :
(7662, 300)
(421, 300)
(917, 300)
---------------
BERT Special Pooled Sentence-Embeddings :
(7662, 768)
(421, 768)
(917, 768)
---------------
Paraphrase-DistilRoberta Sentence-Embeddings :
(7662, 768)
(421, 768)
(917, 768)
---------------
GloVe-Averaged Sentence-Embeddings :
(7662, 300)
(421, 300)
(917, 300)


# Base Models

## Fully-Connected Neural Networks

### Dataloader, Datasets, Evaluation, & Training Function for Torch NN

In [65]:
def mae(y_pred,probability):
  mae_v = 0
  for i,row in enumerate(y_pred):
    mae_v += abs(row[0]-probability[i][0])
  mae_v = mae_v/len(y_pred)
  return mae_v

In [66]:
def pearson(y_pred, label):
  if y_pred.device.type == 'cuda':
    y_pred_np = np.array(y_pred.detach().cpu())  
  else:
    y_pred_np = np.array(y_pred)

  if label.device.type == 'cuda':
    label_np = np.array(label.cpu())
  else:
    label_np = np.array(label)

  label_np = label_np.reshape(-1)
  y_pred_np = y_pred_np.reshape(-1)

  pcoff, _ = stats.pearsonr(y_pred_np, label_np)

  return pcoff

In [67]:
class dataset(Dataset):
  def __init__(self,data,labels):
    self.data = data
    self.labels = labels

  def __getitem__(self, index):
    return self.data[index], self.labels[index]
  
  def __len__(self):
    return len(self.labels)

In [68]:
def my_collate_fn(batch):
  return [x[0] for x in batch],[x[1] for x in batch]

Custom Evaluation (Testing) Function:

In [69]:
def testeval(model, data_loader, DEVICE):
  testiter = iter(data_loader)
  testd_ = testiter.next()
  data_tst, labs_tst = testd_

  labs_tst = torch.Tensor(labs_tst).to(DEVICE)
  labs_tst = torch.unsqueeze(labs_tst, 1)

  ypreds_tst = model(data_tst)

  #MAE
  mae_v = mae(ypreds_tst, labs_tst).detach().cpu()
  #MSE
  loss = nn.MSELoss()
  mse_v = loss(ypreds_tst, labs_tst).detach().cpu()
  #Pearson Coefficient
  pcoff_v = pearson(ypreds_tst, labs_tst)
  
  print("[ MAE value: ", mae_v, " ]\t[ MSE value: ", mse_v, " ]\t[ Pearson Coefficient: ", pcoff_v, " ]")

  return pcoff_v

Custom Training function for Torch NNs:

In [70]:
def train(model, tr_data, tr_lab, val_data, val_lab, tst_data, tst_lab, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE):
  iters_per_epoch = math.ceil(len(tr_lab)/BATCH_SIZE)
  VALBATCH_SIZE = len(val_lab)
  TESTBATCH_SIZE = len(tst_lab)

  train_data = dataset(tr_data, tr_lab)
  valid_data = dataset(val_data, val_lab)
  test_data = dataset(tst_data, tst_lab)

  train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=my_collate_fn)

  train_loader_full = DataLoader(dataset=train_data, batch_size=len(tr_lab), shuffle=False, collate_fn=my_collate_fn)
  valid_loader = DataLoader(dataset=valid_data, batch_size=VALBATCH_SIZE, shuffle=False, collate_fn=my_collate_fn)
  test_loader = DataLoader(dataset=test_data, batch_size=TESTBATCH_SIZE, shuffle=False, collate_fn=my_collate_fn)

  criterion = (nn.MSELoss()).to(DEVICE)
  optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

  for i in range(1,MAX_EPOCHS+1):
    print("[Epoch: ",str(i),"/",str(MAX_EPOCHS),"]")
    
    model.train()
    
    for j,(data_batch, label_batch) in enumerate(tqdm(train_loader, total=iters_per_epoch)):
      label_batch = torch.Tensor(label_batch).to(DEVICE)
      label_batch = torch.unsqueeze(label_batch, 1)

      optimizer.zero_grad()

      y_pred_batch = model(data_batch)
      
      loss = criterion(y_pred_batch, label_batch)
      
      loss.backward()
      optimizer.step()
    
    model.eval()
    print('Validation scores: ', end='')
    curr_pscore = testeval(model, valid_loader, DEVICE)
    print('---------------------')
    print('Test scores: ', end='')
    curr_pscore = testeval(model, test_loader, DEVICE)
    print('---------------------')
    print('Train scores: ', end='')
    testeval(model, train_loader_full, DEVICE)
    print('------------------------------------------------------------------------------------------------------')

### FCNN v1 Framework (3 hidden; relu 1st 2 ; sigmoid last) (Default Dropouts 0.5, 0.25, 0.125)

In [71]:
class Framework_v1(nn.Module):
    def __init__(self, in_size, h1_size, h2_size, h3_size, DEVICE, drop_1=0.5, drop_2=0.25, drop_3=0.125):
        super().__init__()
        self.device = DEVICE

        self.layer_1 = (nn.Linear(in_size,h1_size)).to(DEVICE)
        self.layer_2 = (nn.Linear(h1_size,h2_size)).to(DEVICE)
        self.layer_3 = (nn.Linear(h2_size,h3_size)).to(DEVICE)
        self.layer_out = (nn.Linear(h3_size,1)).to(DEVICE)
        self.layer_drop1 = (nn.Dropout(drop_1)).to(DEVICE)
        self.layer_drop2 = (nn.Dropout(drop_2)).to(DEVICE)
        self.layer_drop3 = (nn.Dropout(drop_3)).to(DEVICE)
        
        # Define activations
        self.sigmoid = nn.Sigmoid().to(DEVICE)
        self.relu = nn.ReLU().to(DEVICE)
    
    
    def lasthidden(self, x):
        # Pass the input tensor through each of our operations (except dropouts & output layer)
        x = torch.Tensor(x).to(self.device)

        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.layer_3(out)
        out = self.sigmoid(out)

        return out
        

    def forward(self, x):
        # Pass the input tensor through each of our operations
        x = torch.Tensor(x).to(self.device)

        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_drop1(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.layer_drop2(out)
        out = self.layer_3(out)
        out = self.sigmoid(out)
        out = self.layer_drop3(out)
        out = self.layer_out(out)
        out = self.sigmoid(out)

        return out

### Training & Testing FCNNs

##### Case-1 - 0.75535 - (bertword_train,dtrobsent_train,gloveword_train,glovesent_train)

In [118]:
# Define Model
fcnnf1_1 = Framework_v1(2136,1024,512,256,DEVICE,0.4,0.4,0.4)

# Train Model with selected hyperparams
MAX_EPOCHS = 20
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

featmat_train = np.concatenate((bertword_train,dtrobsent_train,gloveword_train,glovesent_train),axis=1)
featmat_val = np.concatenate((bertword_trial,dtrobsent_trial,gloveword_trial,glovesent_trial),axis=1)
featmat_test = np.concatenate((bertword_test,dtrobsent_test,gloveword_test,glovesent_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_val = np.array(trialdf['complexity'])
labels_test = np.array(testdf['complexity'])

train(fcnnf1_1, featmat_train, labels_train, featmat_val, labels_val, featmat_test, labels_test, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE)

  3%|▎         | 16/479 [00:00<00:02, 155.14it/s]

[Epoch:  1 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.69it/s]


Validation scores: [ MAE value:  tensor(0.0908)  ]	[ MSE value:  tensor(0.0148)  ]	[ Pearson Coefficient:  0.5850055126792502  ]
---------------------
Test scores: [ MAE value:  tensor(0.0883)  ]	[ MSE value:  tensor(0.0131)  ]	[ Pearson Coefficient:  0.5108875596559543  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 154.53it/s]

[ MAE value:  tensor(0.0909)  ]	[ MSE value:  tensor(0.0138)  ]	[ Pearson Coefficient:  0.5828428929297597  ]
------------------------------------------------------------------------------------------------------
[Epoch:  2 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.59it/s]


Validation scores: [ MAE value:  tensor(0.0804)  ]	[ MSE value:  tensor(0.0115)  ]	[ Pearson Coefficient:  0.6799523022793179  ]
---------------------
Test scores: [ MAE value:  tensor(0.0783)  ]	[ MSE value:  tensor(0.0104)  ]	[ Pearson Coefficient:  0.6255024090265945  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 159.28it/s]

[ MAE value:  tensor(0.0773)  ]	[ MSE value:  tensor(0.0101)  ]	[ Pearson Coefficient:  0.7064770693468966  ]
------------------------------------------------------------------------------------------------------
[Epoch:  3 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 151.16it/s]


Validation scores: [ MAE value:  tensor(0.0737)  ]	[ MSE value:  tensor(0.0096)  ]	[ Pearson Coefficient:  0.7168204980771878  ]
---------------------
Test scores: [ MAE value:  tensor(0.0731)  ]	[ MSE value:  tensor(0.0090)  ]	[ Pearson Coefficient:  0.6716447846629077  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.22it/s]

[ MAE value:  tensor(0.0690)  ]	[ MSE value:  tensor(0.0080)  ]	[ Pearson Coefficient:  0.7539446108128957  ]
------------------------------------------------------------------------------------------------------
[Epoch:  4 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 150.86it/s]


Validation scores: [ MAE value:  tensor(0.0711)  ]	[ MSE value:  tensor(0.0088)  ]	[ Pearson Coefficient:  0.7358934916178705  ]
---------------------
Test scores: [ MAE value:  tensor(0.0707)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.6963350424848668  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 151.32it/s]

[ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7774523198086267  ]
------------------------------------------------------------------------------------------------------
[Epoch:  5 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.81it/s]


Validation scores: [ MAE value:  tensor(0.0697)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.749572958322306  ]
---------------------
Test scores: [ MAE value:  tensor(0.0694)  ]	[ MSE value:  tensor(0.0081)  ]	[ Pearson Coefficient:  0.7109433927729101  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.70it/s]

[ MAE value:  tensor(0.0631)  ]	[ MSE value:  tensor(0.0066)  ]	[ Pearson Coefficient:  0.7945990252676479  ]
------------------------------------------------------------------------------------------------------
[Epoch:  6 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 151.99it/s]


Validation scores: [ MAE value:  tensor(0.0685)  ]	[ MSE value:  tensor(0.0081)  ]	[ Pearson Coefficient:  0.760443903575254  ]
---------------------
Test scores: [ MAE value:  tensor(0.0685)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7214176903566595  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.12it/s]

[ MAE value:  tensor(0.0615)  ]	[ MSE value:  tensor(0.0063)  ]	[ Pearson Coefficient:  0.8061318617559957  ]
------------------------------------------------------------------------------------------------------
[Epoch:  7 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.76it/s]


Validation scores: [ MAE value:  tensor(0.0682)  ]	[ MSE value:  tensor(0.0081)  ]	[ Pearson Coefficient:  0.7658288216719844  ]
---------------------
Test scores: [ MAE value:  tensor(0.0683)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7296959927745399  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 158.88it/s]

[ MAE value:  tensor(0.0610)  ]	[ MSE value:  tensor(0.0061)  ]	[ Pearson Coefficient:  0.8154008240162147  ]
------------------------------------------------------------------------------------------------------
[Epoch:  8 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 154.38it/s]


Validation scores: [ MAE value:  tensor(0.0674)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7700173765284013  ]
---------------------
Test scores: [ MAE value:  tensor(0.0672)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7362111378785  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.78it/s]

[ MAE value:  tensor(0.0593)  ]	[ MSE value:  tensor(0.0058)  ]	[ Pearson Coefficient:  0.8220522632469033  ]
------------------------------------------------------------------------------------------------------
[Epoch:  9 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 154.32it/s]


Validation scores: [ MAE value:  tensor(0.0668)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7752486606522352  ]
---------------------
Test scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7401869719997821  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.33it/s]

[ MAE value:  tensor(0.0586)  ]	[ MSE value:  tensor(0.0056)  ]	[ Pearson Coefficient:  0.8281829536539547  ]
------------------------------------------------------------------------------------------------------
[Epoch:  10 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 153.21it/s]


Validation scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7769969497467695  ]
---------------------
Test scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7434496313202272  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 158.08it/s]

[ MAE value:  tensor(0.0583)  ]	[ MSE value:  tensor(0.0055)  ]	[ Pearson Coefficient:  0.8332635261259546  ]
------------------------------------------------------------------------------------------------------
[Epoch:  11 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 154.04it/s]


Validation scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7791245269382636  ]
---------------------
Test scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7465616805560201  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.80it/s]

[ MAE value:  tensor(0.0569)  ]	[ MSE value:  tensor(0.0053)  ]	[ Pearson Coefficient:  0.8387107923158115  ]
------------------------------------------------------------------------------------------------------
[Epoch:  12 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.96it/s]


Validation scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7816636854266042  ]
---------------------
Test scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7481212658989923  ]
---------------------
Train scores: 

  3%|▎         | 14/479 [00:00<00:03, 139.79it/s]

[ MAE value:  tensor(0.0562)  ]	[ MSE value:  tensor(0.0051)  ]	[ Pearson Coefficient:  0.8427581656432298  ]
------------------------------------------------------------------------------------------------------
[Epoch:  13 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 150.98it/s]


Validation scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7816966824462389  ]
---------------------
Test scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7494853324846313  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.39it/s]

[ MAE value:  tensor(0.0557)  ]	[ MSE value:  tensor(0.0050)  ]	[ Pearson Coefficient:  0.847562280215521  ]
------------------------------------------------------------------------------------------------------
[Epoch:  14 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.99it/s]


Validation scores: [ MAE value:  tensor(0.0661)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7801921094703855  ]
---------------------
Test scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7513070783056226  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.88it/s]

[ MAE value:  tensor(0.0546)  ]	[ MSE value:  tensor(0.0049)  ]	[ Pearson Coefficient:  0.8519802408465887  ]
------------------------------------------------------------------------------------------------------
[Epoch:  15 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 154.01it/s]


Validation scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7823088441275722  ]
---------------------
Test scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7517663864329007  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 144.67it/s]

[ MAE value:  tensor(0.0546)  ]	[ MSE value:  tensor(0.0049)  ]	[ Pearson Coefficient:  0.8561164294801312  ]
------------------------------------------------------------------------------------------------------
[Epoch:  16 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 151.03it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7849529702000564  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7517625089929629  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 149.25it/s]

[ MAE value:  tensor(0.0534)  ]	[ MSE value:  tensor(0.0046)  ]	[ Pearson Coefficient:  0.8599123987511658  ]
------------------------------------------------------------------------------------------------------
[Epoch:  17 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.95it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7864468718292722  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7535178403556984  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.80it/s]

[ MAE value:  tensor(0.0528)  ]	[ MSE value:  tensor(0.0045)  ]	[ Pearson Coefficient:  0.8642361083964794  ]
------------------------------------------------------------------------------------------------------
[Epoch:  18 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 150.70it/s]


Validation scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7869940100813733  ]
---------------------
Test scores: [ MAE value:  tensor(0.0650)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7548547898593907  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 143.06it/s]

[ MAE value:  tensor(0.0518)  ]	[ MSE value:  tensor(0.0044)  ]	[ Pearson Coefficient:  0.8679345389482315  ]
------------------------------------------------------------------------------------------------------
[Epoch:  19 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 152.27it/s]


Validation scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7848963734673352  ]
---------------------
Test scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7542339264170366  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 156.99it/s]

[ MAE value:  tensor(0.0510)  ]	[ MSE value:  tensor(0.0042)  ]	[ Pearson Coefficient:  0.8724796767886899  ]
------------------------------------------------------------------------------------------------------
[Epoch:  20 / 20 ]


100%|██████████| 479/479 [00:03<00:00, 153.96it/s]


Validation scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7826258486981806  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7553539325815444  ]
---------------------
Train scores: [ MAE value:  tensor(0.0503)  ]	[ MSE value:  tensor(0.0041)  ]	[ Pearson Coefficient:  0.8767810562868135  ]
------------------------------------------------------------------------------------------------------


In [121]:
predf1_train = np.array(fcnnf1_1.forward(np.concatenate((bertword_train,dtrobsent_train,gloveword_train,glovesent_train),axis=1)).detach().cpu()).reshape(-1)
predf1 = np.array(fcnnf1_1.forward(np.concatenate((bertword_test,dtrobsent_test,gloveword_test,glovesent_test),axis=1)).detach().cpu()).reshape(-1)
predf1_trial = np.array(fcnnf1_1.forward(np.concatenate((bertword_trial,dtrobsent_trial,gloveword_trial,glovesent_trial),axis=1)).detach().cpu()).reshape(-1)

In [122]:
stats.pearsonr(predf1, testdf['complexity'])

(0.7553539343795164, 3.941816568322594e-170)

##### Case-2 - 0.7574 - (bertword_train,bertsent_train,gloveword_train,glovesent_train)

In [None]:
# Define Model
fcnnf2 = Framework_v1(2136,1024,512,256,DEVICE,0.4,0.4,0.4)

# Train Model with selected hyperparams
MAX_EPOCHS = 20
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

featmat_train = np.concatenate((bertword_train,bertsent_train,gloveword_train,glovesent_train),axis=1)
featmat_val = np.concatenate((bertword_trial,bertsent_trial,gloveword_trial,glovesent_trial),axis=1)
featmat_test = np.concatenate((bertword_test,bertsent_test,gloveword_test,glovesent_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_val = np.array(trialdf['complexity'])
labels_test = np.array(testdf['complexity'])

train(fcnnf2, featmat_train, labels_train, featmat_val, labels_val, featmat_test, labels_test, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE)

In [92]:
# Define Model
fcnnf2_2 = Framework_v1(2136,1024,512,256,DEVICE,0.4,0.4,0.4)

# Train Model with selected hyperparams
MAX_EPOCHS = 25
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

featmat_train = np.concatenate((bertword_train,bertsent_train,gloveword_train,glovesent_train),axis=1)
featmat_val = np.concatenate((bertword_trial,bertsent_trial,gloveword_trial,glovesent_trial),axis=1)
featmat_test = np.concatenate((bertword_test,bertsent_test,gloveword_test,glovesent_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_val = np.array(trialdf['complexity'])
labels_test = np.array(testdf['complexity'])

train(fcnnf2_2, featmat_train, labels_train, featmat_val, labels_val, featmat_test, labels_test, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE)

  3%|▎         | 16/479 [00:00<00:02, 157.82it/s]

[Epoch:  1 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 153.43it/s]


Validation scores: [ MAE value:  tensor(0.0936)  ]	[ MSE value:  tensor(0.0155)  ]	[ Pearson Coefficient:  0.5911809144008207  ]
---------------------
Test scores: [ MAE value:  tensor(0.0894)  ]	[ MSE value:  tensor(0.0136)  ]	[ Pearson Coefficient:  0.5266465548307705  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 115.12it/s]

[ MAE value:  tensor(0.0931)  ]	[ MSE value:  tensor(0.0146)  ]	[ Pearson Coefficient:  0.570958188154015  ]
------------------------------------------------------------------------------------------------------
[Epoch:  2 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 151.26it/s]


Validation scores: [ MAE value:  tensor(0.0797)  ]	[ MSE value:  tensor(0.0113)  ]	[ Pearson Coefficient:  0.6863028525482631  ]
---------------------
Test scores: [ MAE value:  tensor(0.0769)  ]	[ MSE value:  tensor(0.0100)  ]	[ Pearson Coefficient:  0.6491099679736455  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 151.47it/s]

[ MAE value:  tensor(0.0761)  ]	[ MSE value:  tensor(0.0098)  ]	[ Pearson Coefficient:  0.7179467543468507  ]
------------------------------------------------------------------------------------------------------
[Epoch:  3 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 153.90it/s]


Validation scores: [ MAE value:  tensor(0.0731)  ]	[ MSE value:  tensor(0.0094)  ]	[ Pearson Coefficient:  0.72484101657727  ]
---------------------
Test scores: [ MAE value:  tensor(0.0716)  ]	[ MSE value:  tensor(0.0088)  ]	[ Pearson Coefficient:  0.6885865076653431  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 153.50it/s]

[ MAE value:  tensor(0.0681)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7597876671997956  ]
------------------------------------------------------------------------------------------------------
[Epoch:  4 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 153.59it/s]


Validation scores: [ MAE value:  tensor(0.0703)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.7449320781598755  ]
---------------------
Test scores: [ MAE value:  tensor(0.0695)  ]	[ MSE value:  tensor(0.0081)  ]	[ Pearson Coefficient:  0.7089619283305579  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.64it/s]

[ MAE value:  tensor(0.0646)  ]	[ MSE value:  tensor(0.0069)  ]	[ Pearson Coefficient:  0.7805967662306955  ]
------------------------------------------------------------------------------------------------------
[Epoch:  5 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 151.78it/s]


Validation scores: [ MAE value:  tensor(0.0690)  ]	[ MSE value:  tensor(0.0082)  ]	[ Pearson Coefficient:  0.7576254524239894  ]
---------------------
Test scores: [ MAE value:  tensor(0.0685)  ]	[ MSE value:  tensor(0.0079)  ]	[ Pearson Coefficient:  0.7219636280140896  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 156.57it/s]

[ MAE value:  tensor(0.0631)  ]	[ MSE value:  tensor(0.0066)  ]	[ Pearson Coefficient:  0.7952478818269241  ]
------------------------------------------------------------------------------------------------------
[Epoch:  6 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 150.26it/s]


Validation scores: [ MAE value:  tensor(0.0681)  ]	[ MSE value:  tensor(0.0079)  ]	[ Pearson Coefficient:  0.7668705958484349  ]
---------------------
Test scores: [ MAE value:  tensor(0.0682)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7299024115293035  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.29it/s]

[ MAE value:  tensor(0.0621)  ]	[ MSE value:  tensor(0.0063)  ]	[ Pearson Coefficient:  0.8041825166785193  ]
------------------------------------------------------------------------------------------------------
[Epoch:  7 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 150.66it/s]


Validation scores: [ MAE value:  tensor(0.0676)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7705559071903799  ]
---------------------
Test scores: [ MAE value:  tensor(0.0672)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7364049818900533  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 154.38it/s]

[ MAE value:  tensor(0.0607)  ]	[ MSE value:  tensor(0.0061)  ]	[ Pearson Coefficient:  0.81162000322634  ]
------------------------------------------------------------------------------------------------------
[Epoch:  8 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 152.26it/s]


Validation scores: [ MAE value:  tensor(0.0669)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7740830261200619  ]
---------------------
Test scores: [ MAE value:  tensor(0.0667)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7410940993476353  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 156.70it/s]

[ MAE value:  tensor(0.0600)  ]	[ MSE value:  tensor(0.0059)  ]	[ Pearson Coefficient:  0.8168845213794885  ]
------------------------------------------------------------------------------------------------------
[Epoch:  9 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 153.91it/s]


Validation scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7775532096725544  ]
---------------------
Test scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7439417151246016  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 150.97it/s]

[ MAE value:  tensor(0.0593)  ]	[ MSE value:  tensor(0.0058)  ]	[ Pearson Coefficient:  0.8214953157137865  ]
------------------------------------------------------------------------------------------------------
[Epoch:  10 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 154.04it/s]


Validation scores: [ MAE value:  tensor(0.0667)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7815551473601641  ]
---------------------
Test scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7454428517847511  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 156.02it/s]

[ MAE value:  tensor(0.0593)  ]	[ MSE value:  tensor(0.0058)  ]	[ Pearson Coefficient:  0.8259797943637199  ]
------------------------------------------------------------------------------------------------------
[Epoch:  11 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 152.31it/s]


Validation scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.782250798899689  ]
---------------------
Test scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7458138147582709  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.23it/s]

[ MAE value:  tensor(0.0587)  ]	[ MSE value:  tensor(0.0056)  ]	[ Pearson Coefficient:  0.8288692007174345  ]
------------------------------------------------------------------------------------------------------
[Epoch:  12 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 152.65it/s]


Validation scores: [ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7830615370483932  ]
---------------------
Test scores: [ MAE value:  tensor(0.0660)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7481257046607334  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 155.45it/s]

[ MAE value:  tensor(0.0580)  ]	[ MSE value:  tensor(0.0055)  ]	[ Pearson Coefficient:  0.8329133021646851  ]
------------------------------------------------------------------------------------------------------
[Epoch:  13 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 152.97it/s]


Validation scores: [ MAE value:  tensor(0.0660)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7843097216293282  ]
---------------------
Test scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7498800539204222  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 152.51it/s]

[ MAE value:  tensor(0.0573)  ]	[ MSE value:  tensor(0.0054)  ]	[ Pearson Coefficient:  0.8364949698275664  ]
------------------------------------------------------------------------------------------------------
[Epoch:  14 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 153.13it/s]


Validation scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7844494998276418  ]
---------------------
Test scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7506245892112231  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 160.36it/s]

[ MAE value:  tensor(0.0570)  ]	[ MSE value:  tensor(0.0053)  ]	[ Pearson Coefficient:  0.8393232613083654  ]
------------------------------------------------------------------------------------------------------
[Epoch:  15 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 153.73it/s]


Validation scores: [ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7818527127482231  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7520182345541008  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 157.11it/s]

[ MAE value:  tensor(0.0562)  ]	[ MSE value:  tensor(0.0051)  ]	[ Pearson Coefficient:  0.8431824264384786  ]
------------------------------------------------------------------------------------------------------
[Epoch:  16 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 154.14it/s]


Validation scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7826351458300008  ]
---------------------
Test scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7521872541616225  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.97it/s]

[ MAE value:  tensor(0.0559)  ]	[ MSE value:  tensor(0.0051)  ]	[ Pearson Coefficient:  0.8456612491276794  ]
------------------------------------------------------------------------------------------------------
[Epoch:  17 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 149.54it/s]


Validation scores: [ MAE value:  tensor(0.0660)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7872899973390213  ]
---------------------
Test scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7539681941399053  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 154.15it/s]

[ MAE value:  tensor(0.0555)  ]	[ MSE value:  tensor(0.0050)  ]	[ Pearson Coefficient:  0.8494652571820728  ]
------------------------------------------------------------------------------------------------------
[Epoch:  18 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 149.22it/s]


Validation scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7881126569026481  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7546510357146098  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 150.64it/s]

[ MAE value:  tensor(0.0545)  ]	[ MSE value:  tensor(0.0048)  ]	[ Pearson Coefficient:  0.8527293683627629  ]
------------------------------------------------------------------------------------------------------
[Epoch:  19 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 150.92it/s]


Validation scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7872756995843957  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7554954353611743  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 147.71it/s]

[ MAE value:  tensor(0.0540)  ]	[ MSE value:  tensor(0.0047)  ]	[ Pearson Coefficient:  0.8563022202898922  ]
------------------------------------------------------------------------------------------------------
[Epoch:  20 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 150.15it/s]


Validation scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7847645028685313  ]
---------------------
Test scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7548782920983036  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 153.64it/s]

[ MAE value:  tensor(0.0537)  ]	[ MSE value:  tensor(0.0047)  ]	[ Pearson Coefficient:  0.8596619068791249  ]
------------------------------------------------------------------------------------------------------
[Epoch:  21 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 151.35it/s]


Validation scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.788282899676192  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7572593438542446  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 151.33it/s]

[ MAE value:  tensor(0.0528)  ]	[ MSE value:  tensor(0.0045)  ]	[ Pearson Coefficient:  0.8634217025734678  ]
------------------------------------------------------------------------------------------------------
[Epoch:  22 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 150.12it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7887645125763278  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7564576900506046  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 148.76it/s]

[ MAE value:  tensor(0.0524)  ]	[ MSE value:  tensor(0.0044)  ]	[ Pearson Coefficient:  0.8661117422227222  ]
------------------------------------------------------------------------------------------------------
[Epoch:  23 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 151.03it/s]


Validation scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7873903412254041  ]
---------------------
Test scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7565040215475428  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:03, 154.24it/s]

[ MAE value:  tensor(0.0516)  ]	[ MSE value:  tensor(0.0043)  ]	[ Pearson Coefficient:  0.8700650368677093  ]
------------------------------------------------------------------------------------------------------
[Epoch:  24 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 151.73it/s]


Validation scores: [ MAE value:  tensor(0.0661)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7870729625909723  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7571123002126393  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 154.82it/s]

[ MAE value:  tensor(0.0519)  ]	[ MSE value:  tensor(0.0044)  ]	[ Pearson Coefficient:  0.8736981752667619  ]
------------------------------------------------------------------------------------------------------
[Epoch:  25 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 152.92it/s]


Validation scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7899566515474962  ]
---------------------
Test scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7574037901160383  ]
---------------------
Train scores: [ MAE value:  tensor(0.0504)  ]	[ MSE value:  tensor(0.0041)  ]	[ Pearson Coefficient:  0.8764848658901985  ]
------------------------------------------------------------------------------------------------------


In [132]:
predf2_train = np.array(fcnnf2_2.forward(np.concatenate((bertword_train,bertsent_train,gloveword_train,glovesent_train),axis=1)).detach().cpu()).reshape(-1)
predf2 = np.array(fcnnf2_2.forward(np.concatenate((bertword_test,bertsent_test,gloveword_test,glovesent_test),axis=1)).detach().cpu()).reshape(-1)
predf2_trial = np.array(fcnnf2_2.forward(np.concatenate((bertword_trial,bertsent_trial,gloveword_trial,glovesent_trial),axis=1)).detach().cpu()).reshape(-1)

In [140]:
stats.pearsonr(predf2, testdf['complexity'])

(0.7574037918021691, 1.4275926358056614e-171)

##### Case-3 - 0.7553 - (May Remove) (bertword_train,bertsent_train,dtrobsent_train,gloveword_train,glovesent_train)

In [91]:
# Define Model
fcnnf3 = Framework_v1(2904,1200,512,256,DEVICE,0.5,0.5,0.5)

# Train Model with selected hyperparams
MAX_EPOCHS = 26
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

featmat_train = np.concatenate((bertword_train,bertsent_train,dtrobsent_train,gloveword_train,glovesent_train),axis=1)
featmat_val = np.concatenate((bertword_trial,bertsent_trial,dtrobsent_trial,gloveword_trial,glovesent_trial),axis=1)
featmat_test = np.concatenate((bertword_test,bertsent_test,dtrobsent_test,gloveword_test,glovesent_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_val = np.array(trialdf['complexity'])
labels_test = np.array(testdf['complexity'])

train(fcnnf3, featmat_train, labels_train, featmat_val, labels_val, featmat_test, labels_test, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE)

  2%|▏         | 11/479 [00:00<00:04, 104.15it/s]

[Epoch:  1 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 114.88it/s]


Validation scores: [ MAE value:  tensor(0.0940)  ]	[ MSE value:  tensor(0.0158)  ]	[ Pearson Coefficient:  0.5138673587193796  ]
---------------------
Test scores: [ MAE value:  tensor(0.0904)  ]	[ MSE value:  tensor(0.0139)  ]	[ Pearson Coefficient:  0.44897405146955927  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 116.85it/s]

[ MAE value:  tensor(0.0947)  ]	[ MSE value:  tensor(0.0151)  ]	[ Pearson Coefficient:  0.4695398496877434  ]
------------------------------------------------------------------------------------------------------
[Epoch:  2 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 113.95it/s]


Validation scores: [ MAE value:  tensor(0.0824)  ]	[ MSE value:  tensor(0.0123)  ]	[ Pearson Coefficient:  0.6822491394232981  ]
---------------------
Test scores: [ MAE value:  tensor(0.0803)  ]	[ MSE value:  tensor(0.0110)  ]	[ Pearson Coefficient:  0.6186167432778228  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 118.37it/s]

[ MAE value:  tensor(0.0811)  ]	[ MSE value:  tensor(0.0112)  ]	[ Pearson Coefficient:  0.6886856645128351  ]
------------------------------------------------------------------------------------------------------
[Epoch:  3 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 115.60it/s]


Validation scores: [ MAE value:  tensor(0.0743)  ]	[ MSE value:  tensor(0.0098)  ]	[ Pearson Coefficient:  0.7179239924377054  ]
---------------------
Test scores: [ MAE value:  tensor(0.0734)  ]	[ MSE value:  tensor(0.0091)  ]	[ Pearson Coefficient:  0.6700382747603724  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 116.32it/s]

[ MAE value:  tensor(0.0705)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.7434178577853203  ]
------------------------------------------------------------------------------------------------------
[Epoch:  4 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 115.50it/s]


Validation scores: [ MAE value:  tensor(0.0706)  ]	[ MSE value:  tensor(0.0088)  ]	[ Pearson Coefficient:  0.7364997998395253  ]
---------------------
Test scores: [ MAE value:  tensor(0.0710)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.6944087127208384  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 118.46it/s]

[ MAE value:  tensor(0.0661)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7679303008883477  ]
------------------------------------------------------------------------------------------------------
[Epoch:  5 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.73it/s]


Validation scores: [ MAE value:  tensor(0.0688)  ]	[ MSE value:  tensor(0.0083)  ]	[ Pearson Coefficient:  0.7530037786209351  ]
---------------------
Test scores: [ MAE value:  tensor(0.0697)  ]	[ MSE value:  tensor(0.0081)  ]	[ Pearson Coefficient:  0.7108566960873081  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 114.49it/s]

[ MAE value:  tensor(0.0642)  ]	[ MSE value:  tensor(0.0069)  ]	[ Pearson Coefficient:  0.7845076581760355  ]
------------------------------------------------------------------------------------------------------
[Epoch:  6 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 114.29it/s]


Validation scores: [ MAE value:  tensor(0.0677)  ]	[ MSE value:  tensor(0.0080)  ]	[ Pearson Coefficient:  0.7632791293130748  ]
---------------------
Test scores: [ MAE value:  tensor(0.0688)  ]	[ MSE value:  tensor(0.0079)  ]	[ Pearson Coefficient:  0.72192829686265  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 115.79it/s]

[ MAE value:  tensor(0.0629)  ]	[ MSE value:  tensor(0.0065)  ]	[ Pearson Coefficient:  0.7957402568310538  ]
------------------------------------------------------------------------------------------------------
[Epoch:  7 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.21it/s]


Validation scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7684259428780433  ]
---------------------
Test scores: [ MAE value:  tensor(0.0681)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.729064724576165  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 118.23it/s]

[ MAE value:  tensor(0.0616)  ]	[ MSE value:  tensor(0.0063)  ]	[ Pearson Coefficient:  0.8046029239287512  ]
------------------------------------------------------------------------------------------------------
[Epoch:  8 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 115.02it/s]


Validation scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7737998009455546  ]
---------------------
Test scores: [ MAE value:  tensor(0.0676)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.735995163292467  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 119.34it/s]

[ MAE value:  tensor(0.0612)  ]	[ MSE value:  tensor(0.0062)  ]	[ Pearson Coefficient:  0.8118618112540008  ]
------------------------------------------------------------------------------------------------------
[Epoch:  9 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.73it/s]


Validation scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7769265972548464  ]
---------------------
Test scores: [ MAE value:  tensor(0.0669)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7392220951397461  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 120.71it/s]

[ MAE value:  tensor(0.0599)  ]	[ MSE value:  tensor(0.0059)  ]	[ Pearson Coefficient:  0.8168552700666823  ]
------------------------------------------------------------------------------------------------------
[Epoch:  10 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.94it/s]


Validation scores: [ MAE value:  tensor(0.0666)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7783322277734215  ]
---------------------
Test scores: [ MAE value:  tensor(0.0672)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7406349985234699  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 117.32it/s]

[ MAE value:  tensor(0.0601)  ]	[ MSE value:  tensor(0.0059)  ]	[ Pearson Coefficient:  0.8214721800689718  ]
------------------------------------------------------------------------------------------------------
[Epoch:  11 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.98it/s]


Validation scores: [ MAE value:  tensor(0.0667)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7754272400393674  ]
---------------------
Test scores: [ MAE value:  tensor(0.0669)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7418648098697063  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 118.89it/s]

[ MAE value:  tensor(0.0593)  ]	[ MSE value:  tensor(0.0058)  ]	[ Pearson Coefficient:  0.8246754671549403  ]
------------------------------------------------------------------------------------------------------
[Epoch:  12 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.64it/s]


Validation scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7818509867908838  ]
---------------------
Test scores: [ MAE value:  tensor(0.0661)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.744742763452208  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 119.96it/s]

[ MAE value:  tensor(0.0585)  ]	[ MSE value:  tensor(0.0056)  ]	[ Pearson Coefficient:  0.8290374268946381  ]
------------------------------------------------------------------------------------------------------
[Epoch:  13 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.22it/s]


Validation scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7825433040019005  ]
---------------------
Test scores: [ MAE value:  tensor(0.0660)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7465312116837964  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 118.01it/s]

[ MAE value:  tensor(0.0578)  ]	[ MSE value:  tensor(0.0055)  ]	[ Pearson Coefficient:  0.8320448219594603  ]
------------------------------------------------------------------------------------------------------
[Epoch:  14 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.66it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7831173830866062  ]
---------------------
Test scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7485512944839983  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 118.72it/s]

[ MAE value:  tensor(0.0572)  ]	[ MSE value:  tensor(0.0053)  ]	[ Pearson Coefficient:  0.8356421674369213  ]
------------------------------------------------------------------------------------------------------
[Epoch:  15 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.24it/s]


Validation scores: [ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7814342575857681  ]
---------------------
Test scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7494068169092262  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 120.89it/s]

[ MAE value:  tensor(0.0575)  ]	[ MSE value:  tensor(0.0054)  ]	[ Pearson Coefficient:  0.8387352980735565  ]
------------------------------------------------------------------------------------------------------
[Epoch:  16 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.11it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7849443546393364  ]
---------------------
Test scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7506310343476259  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 116.11it/s]

[ MAE value:  tensor(0.0567)  ]	[ MSE value:  tensor(0.0052)  ]	[ Pearson Coefficient:  0.8416755571369255  ]
------------------------------------------------------------------------------------------------------
[Epoch:  17 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.32it/s]


Validation scores: [ MAE value:  tensor(0.0661)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7815589690729016  ]
---------------------
Test scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7503501609000949  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 122.56it/s]

[ MAE value:  tensor(0.0563)  ]	[ MSE value:  tensor(0.0052)  ]	[ Pearson Coefficient:  0.8445403648578012  ]
------------------------------------------------------------------------------------------------------
[Epoch:  18 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.03it/s]


Validation scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.786081216937926  ]
---------------------
Test scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7529110674051788  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 115.66it/s]

[ MAE value:  tensor(0.0553)  ]	[ MSE value:  tensor(0.0050)  ]	[ Pearson Coefficient:  0.8482049460231269  ]
------------------------------------------------------------------------------------------------------
[Epoch:  19 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.02it/s]


Validation scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7876672993699795  ]
---------------------
Test scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7543939947542335  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 121.04it/s]

[ MAE value:  tensor(0.0548)  ]	[ MSE value:  tensor(0.0049)  ]	[ Pearson Coefficient:  0.8517418595339576  ]
------------------------------------------------------------------------------------------------------
[Epoch:  20 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.93it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7861493981442007  ]
---------------------
Test scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7538860717582436  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:03, 118.68it/s]

[ MAE value:  tensor(0.0545)  ]	[ MSE value:  tensor(0.0048)  ]	[ Pearson Coefficient:  0.8549916777263714  ]
------------------------------------------------------------------------------------------------------
[Epoch:  21 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.51it/s]


Validation scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7876041265764233  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7538934597575018  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 120.27it/s]

[ MAE value:  tensor(0.0540)  ]	[ MSE value:  tensor(0.0047)  ]	[ Pearson Coefficient:  0.8574408679179099  ]
------------------------------------------------------------------------------------------------------
[Epoch:  22 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 115.77it/s]


Validation scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7867264518641787  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7546861792950073  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 114.58it/s]

[ MAE value:  tensor(0.0536)  ]	[ MSE value:  tensor(0.0047)  ]	[ Pearson Coefficient:  0.860940304902537  ]
------------------------------------------------------------------------------------------------------
[Epoch:  23 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 115.78it/s]


Validation scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7889370986537197  ]
---------------------
Test scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7564234043288679  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 120.78it/s]

[ MAE value:  tensor(0.0528)  ]	[ MSE value:  tensor(0.0045)  ]	[ Pearson Coefficient:  0.8641647871031568  ]
------------------------------------------------------------------------------------------------------
[Epoch:  24 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.90it/s]


Validation scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7903383247761153  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7563786023794378  ]
---------------------
Train scores: 

  3%|▎         | 12/479 [00:00<00:04, 115.58it/s]

[ MAE value:  tensor(0.0520)  ]	[ MSE value:  tensor(0.0044)  ]	[ Pearson Coefficient:  0.8674848528450326  ]
------------------------------------------------------------------------------------------------------
[Epoch:  25 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 116.37it/s]


Validation scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7882638125069839  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7555972070791628  ]
---------------------
Train scores: 

  3%|▎         | 13/479 [00:00<00:03, 121.23it/s]

[ MAE value:  tensor(0.0515)  ]	[ MSE value:  tensor(0.0043)  ]	[ Pearson Coefficient:  0.8704857584947416  ]
------------------------------------------------------------------------------------------------------
[Epoch:  26 / 26 ]


100%|██████████| 479/479 [00:04<00:00, 117.12it/s]


Validation scores: [ MAE value:  tensor(0.0649)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7906904655204966  ]
---------------------
Test scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7552999219792492  ]
---------------------
Train scores: [ MAE value:  tensor(0.0508)  ]	[ MSE value:  tensor(0.0042)  ]	[ Pearson Coefficient:  0.8738569087401942  ]
------------------------------------------------------------------------------------------------------


In [142]:
predf3_train = np.array(fcnnf3.forward(np.concatenate((bertword_train,bertsent_train, dtrobsent_train, gloveword_train,glovesent_train),axis=1)).detach().cpu()).reshape(-1)
predf3 = np.array(fcnnf3.forward(np.concatenate((bertword_test,bertsent_test, dtrobsent_test, gloveword_test,glovesent_test),axis=1)).detach().cpu()).reshape(-1)
predf3_trial = np.array(fcnnf3.forward(np.concatenate((bertword_trial,bertsent_trial, dtrobsent_trial, gloveword_trial,glovesent_trial),axis=1)).detach().cpu()).reshape(-1)

In [143]:
stats.pearsonr(predf3, testdf['complexity'])

(0.7552999236814216, 4.3000527720487294e-170)

##### Case-4 - 0.7534 - (May Remove) (trainmf,bertword_train,bertsent_train,dtrobsent_train,gloveword_train)

In [109]:
# Define Model
fcnnf4 = Framework_v1(2659,1200,512,256,DEVICE,0.5,0.4,0.3)

# Train Model with selected hyperparams
MAX_EPOCHS = 25
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

featmat_train = np.concatenate((trainmf,bertword_train,bertsent_train,dtrobsent_train,gloveword_train),axis=1)
featmat_val = np.concatenate((trialmf,bertword_trial,bertsent_trial,dtrobsent_trial,gloveword_trial),axis=1)
featmat_test = np.concatenate((testmf,bertword_test,bertsent_test,dtrobsent_test,gloveword_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_val = np.array(trialdf['complexity'])
labels_test = np.array(testdf['complexity'])

train(fcnnf4, featmat_train, labels_train, featmat_val, labels_val, featmat_test, labels_test, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE)

  3%|▎         | 15/479 [00:00<00:03, 146.07it/s]

[Epoch:  1 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 142.56it/s]


Validation scores: [ MAE value:  tensor(0.0947)  ]	[ MSE value:  tensor(0.0163)  ]	[ Pearson Coefficient:  0.3827008079309545  ]
---------------------
Test scores: [ MAE value:  tensor(0.0914)  ]	[ MSE value:  tensor(0.0144)  ]	[ Pearson Coefficient:  0.34609342605756704  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 144.32it/s]

[ MAE value:  tensor(0.0965)  ]	[ MSE value:  tensor(0.0158)  ]	[ Pearson Coefficient:  0.33945069981597215  ]
------------------------------------------------------------------------------------------------------
[Epoch:  2 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 143.32it/s]


Validation scores: [ MAE value:  tensor(0.0821)  ]	[ MSE value:  tensor(0.0120)  ]	[ Pearson Coefficient:  0.6665025195626397  ]
---------------------
Test scores: [ MAE value:  tensor(0.0801)  ]	[ MSE value:  tensor(0.0109)  ]	[ Pearson Coefficient:  0.6054556296136195  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 142.09it/s]

[ MAE value:  tensor(0.0803)  ]	[ MSE value:  tensor(0.0109)  ]	[ Pearson Coefficient:  0.6795256755657776  ]
------------------------------------------------------------------------------------------------------
[Epoch:  3 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 143.13it/s]


Validation scores: [ MAE value:  tensor(0.0734)  ]	[ MSE value:  tensor(0.0096)  ]	[ Pearson Coefficient:  0.7145468482261158  ]
---------------------
Test scores: [ MAE value:  tensor(0.0727)  ]	[ MSE value:  tensor(0.0089)  ]	[ Pearson Coefficient:  0.6732348608763691  ]
---------------------
Train scores: 

  3%|▎         | 14/479 [00:00<00:03, 137.20it/s]

[ MAE value:  tensor(0.0690)  ]	[ MSE value:  tensor(0.0080)  ]	[ Pearson Coefficient:  0.7505146051770647  ]
------------------------------------------------------------------------------------------------------
[Epoch:  4 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.88it/s]


Validation scores: [ MAE value:  tensor(0.0709)  ]	[ MSE value:  tensor(0.0089)  ]	[ Pearson Coefficient:  0.7338859735015846  ]
---------------------
Test scores: [ MAE value:  tensor(0.0706)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.6996801721933122  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 142.81it/s]

[ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7777888623042859  ]
------------------------------------------------------------------------------------------------------
[Epoch:  5 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.43it/s]


Validation scores: [ MAE value:  tensor(0.0695)  ]	[ MSE value:  tensor(0.0086)  ]	[ Pearson Coefficient:  0.7498917586023944  ]
---------------------
Test scores: [ MAE value:  tensor(0.0700)  ]	[ MSE value:  tensor(0.0082)  ]	[ Pearson Coefficient:  0.7136603748992897  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 147.23it/s]

[ MAE value:  tensor(0.0643)  ]	[ MSE value:  tensor(0.0068)  ]	[ Pearson Coefficient:  0.7930359563183254  ]
------------------------------------------------------------------------------------------------------
[Epoch:  6 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 142.57it/s]


Validation scores: [ MAE value:  tensor(0.0681)  ]	[ MSE value:  tensor(0.0082)  ]	[ Pearson Coefficient:  0.7578704025129028  ]
---------------------
Test scores: [ MAE value:  tensor(0.0683)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7236878886049868  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 144.11it/s]

[ MAE value:  tensor(0.0619)  ]	[ MSE value:  tensor(0.0063)  ]	[ Pearson Coefficient:  0.8044075804090073  ]
------------------------------------------------------------------------------------------------------
[Epoch:  7 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.72it/s]


Validation scores: [ MAE value:  tensor(0.0675)  ]	[ MSE value:  tensor(0.0079)  ]	[ Pearson Coefficient:  0.7643524231915833  ]
---------------------
Test scores: [ MAE value:  tensor(0.0678)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7301813681295712  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.13it/s]

[ MAE value:  tensor(0.0607)  ]	[ MSE value:  tensor(0.0060)  ]	[ Pearson Coefficient:  0.8119080685415169  ]
------------------------------------------------------------------------------------------------------
[Epoch:  8 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 143.19it/s]


Validation scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7690889826582817  ]
---------------------
Test scores: [ MAE value:  tensor(0.0673)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.735495524498953  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.62it/s]

[ MAE value:  tensor(0.0598)  ]	[ MSE value:  tensor(0.0058)  ]	[ Pearson Coefficient:  0.8186242865300114  ]
------------------------------------------------------------------------------------------------------
[Epoch:  9 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 143.26it/s]


Validation scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7714671308479484  ]
---------------------
Test scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7393233683643624  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.95it/s]

[ MAE value:  tensor(0.0593)  ]	[ MSE value:  tensor(0.0057)  ]	[ Pearson Coefficient:  0.8243422862639984  ]
------------------------------------------------------------------------------------------------------
[Epoch:  10 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.35it/s]


Validation scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.773695143906332  ]
---------------------
Test scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7416891336830549  ]
---------------------
Train scores: 

  3%|▎         | 14/479 [00:00<00:03, 138.03it/s]

[ MAE value:  tensor(0.0588)  ]	[ MSE value:  tensor(0.0056)  ]	[ Pearson Coefficient:  0.8293618566420335  ]
------------------------------------------------------------------------------------------------------
[Epoch:  11 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 142.61it/s]


Validation scores: [ MAE value:  tensor(0.0666)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.776701715677045  ]
---------------------
Test scores: [ MAE value:  tensor(0.0670)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.742183829462823  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 148.09it/s]

[ MAE value:  tensor(0.0581)  ]	[ MSE value:  tensor(0.0055)  ]	[ Pearson Coefficient:  0.8334100561866647  ]
------------------------------------------------------------------------------------------------------
[Epoch:  12 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 142.34it/s]


Validation scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7796199188307759  ]
---------------------
Test scores: [ MAE value:  tensor(0.0667)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7457441945483585  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 146.57it/s]

[ MAE value:  tensor(0.0575)  ]	[ MSE value:  tensor(0.0054)  ]	[ Pearson Coefficient:  0.8385090749460627  ]
------------------------------------------------------------------------------------------------------
[Epoch:  13 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 143.08it/s]


Validation scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.781135839396509  ]
---------------------
Test scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7460431538508412  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 146.81it/s]

[ MAE value:  tensor(0.0565)  ]	[ MSE value:  tensor(0.0052)  ]	[ Pearson Coefficient:  0.8423303529160535  ]
------------------------------------------------------------------------------------------------------
[Epoch:  14 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.35it/s]


Validation scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7836681149448282  ]
---------------------
Test scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.748478794753594  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 146.38it/s]

[ MAE value:  tensor(0.0559)  ]	[ MSE value:  tensor(0.0051)  ]	[ Pearson Coefficient:  0.8465350233761346  ]
------------------------------------------------------------------------------------------------------
[Epoch:  15 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.96it/s]


Validation scores: [ MAE value:  tensor(0.0666)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7828191878521961  ]
---------------------
Test scores: [ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7489304399602005  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 143.15it/s]

[ MAE value:  tensor(0.0557)  ]	[ MSE value:  tensor(0.0050)  ]	[ Pearson Coefficient:  0.8504651314978425  ]
------------------------------------------------------------------------------------------------------
[Epoch:  16 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 142.77it/s]


Validation scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.784579036080282  ]
---------------------
Test scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7485471211687508  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 144.68it/s]

[ MAE value:  tensor(0.0563)  ]	[ MSE value:  tensor(0.0051)  ]	[ Pearson Coefficient:  0.8538497873037825  ]
------------------------------------------------------------------------------------------------------
[Epoch:  17 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.80it/s]


Validation scores: [ MAE value:  tensor(0.0657)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7861876570008274  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7514574566013912  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 144.41it/s]

[ MAE value:  tensor(0.0535)  ]	[ MSE value:  tensor(0.0046)  ]	[ Pearson Coefficient:  0.8589982298171603  ]
------------------------------------------------------------------------------------------------------
[Epoch:  18 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 140.18it/s]


Validation scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7830737380695837  ]
---------------------
Test scores: [ MAE value:  tensor(0.0668)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7500135148549958  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.22it/s]

[ MAE value:  tensor(0.0545)  ]	[ MSE value:  tensor(0.0048)  ]	[ Pearson Coefficient:  0.8620988251816663  ]
------------------------------------------------------------------------------------------------------
[Epoch:  19 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.96it/s]


Validation scores: [ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7843984912233518  ]
---------------------
Test scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7511389816816801  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 144.00it/s]

[ MAE value:  tensor(0.0528)  ]	[ MSE value:  tensor(0.0045)  ]	[ Pearson Coefficient:  0.8661474327077744  ]
------------------------------------------------------------------------------------------------------
[Epoch:  20 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 139.51it/s]


Validation scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7867701844919253  ]
---------------------
Test scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7514084002599491  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 141.31it/s]

[ MAE value:  tensor(0.0517)  ]	[ MSE value:  tensor(0.0043)  ]	[ Pearson Coefficient:  0.8693771025805714  ]
------------------------------------------------------------------------------------------------------
[Epoch:  21 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 139.99it/s]


Validation scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7910504655204813  ]
---------------------
Test scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7534548694298225  ]
---------------------
Train scores: 

  3%|▎         | 14/479 [00:00<00:03, 138.00it/s]

[ MAE value:  tensor(0.0515)  ]	[ MSE value:  tensor(0.0043)  ]	[ Pearson Coefficient:  0.8732208193197336  ]
------------------------------------------------------------------------------------------------------
[Epoch:  22 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 139.82it/s]


Validation scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7894922798997511  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7534743917211528  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.01it/s]

[ MAE value:  tensor(0.0503)  ]	[ MSE value:  tensor(0.0041)  ]	[ Pearson Coefficient:  0.8777738043075678  ]
------------------------------------------------------------------------------------------------------
[Epoch:  23 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.90it/s]


Validation scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7889874066347006  ]
---------------------
Test scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7525018367461694  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 145.12it/s]

[ MAE value:  tensor(0.0494)  ]	[ MSE value:  tensor(0.0039)  ]	[ Pearson Coefficient:  0.8819406710244678  ]
------------------------------------------------------------------------------------------------------
[Epoch:  24 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 141.50it/s]


Validation scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.789200059651128  ]
---------------------
Test scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7529559896560557  ]
---------------------
Train scores: 

  3%|▎         | 15/479 [00:00<00:03, 141.77it/s]

[ MAE value:  tensor(0.0485)  ]	[ MSE value:  tensor(0.0038)  ]	[ Pearson Coefficient:  0.8859630708514389  ]
------------------------------------------------------------------------------------------------------
[Epoch:  25 / 25 ]


100%|██████████| 479/479 [00:03<00:00, 142.19it/s]


Validation scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7891672964875625  ]
---------------------
Test scores: [ MAE value:  tensor(0.0652)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7534803465066588  ]
---------------------
Train scores: [ MAE value:  tensor(0.0474)  ]	[ MSE value:  tensor(0.0037)  ]	[ Pearson Coefficient:  0.8904357451716709  ]
------------------------------------------------------------------------------------------------------


In [147]:
predf4_train = np.array(fcnnf4.forward(np.concatenate((trainmf, bertword_train,bertsent_train, dtrobsent_train, gloveword_train),axis=1)).detach().cpu()).reshape(-1)
predf4 = np.array(fcnnf4.forward(np.concatenate((testmf, bertword_test,bertsent_test, dtrobsent_test, gloveword_test),axis=1)).detach().cpu()).reshape(-1)
predf4_trial = np.array(fcnnf4.forward(np.concatenate((trialmf, bertword_trial,bertsent_trial, dtrobsent_trial, gloveword_trial),axis=1)).detach().cpu()).reshape(-1)

In [149]:
stats.pearsonr(predf4, testdf['complexity'])

(0.7534803484191827, 7.951043972916586e-169)

##### Case-5 - 0.75689 - (trainmf,bertword_train,bertsent_train,gloveword_train,glovesent_train)

In [128]:
# Define Model
fcnnf5 = Framework_v1(2191,1024,512,256,DEVICE,0.5,0.4,0.4)

# Train Model with selected hyperparams
MAX_EPOCHS = 25
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

featmat_train = np.concatenate((trainmf,bertword_train,bertsent_train,glovesent_train,gloveword_train),axis=1)
featmat_val = np.concatenate((trialmf,bertword_trial,bertsent_trial,glovesent_trial,gloveword_trial),axis=1)
featmat_test = np.concatenate((testmf,bertword_test,bertsent_test,glovesent_test,gloveword_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_val = np.array(trialdf['complexity'])
labels_test = np.array(testdf['complexity'])

train(fcnnf5, featmat_train, labels_train, featmat_val, labels_val, featmat_test, labels_test, MAX_EPOCHS, BATCH_SIZE, LEARNING_RATE, DEVICE)

  4%|▍         | 18/479 [00:00<00:02, 171.10it/s]

[Epoch:  1 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.61it/s]


Validation scores: [ MAE value:  tensor(0.0934)  ]	[ MSE value:  tensor(0.0160)  ]	[ Pearson Coefficient:  0.42509508942949187  ]
---------------------
Test scores: [ MAE value:  tensor(0.0906)  ]	[ MSE value:  tensor(0.0142)  ]	[ Pearson Coefficient:  0.3827193539517884  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 171.59it/s]

[ MAE value:  tensor(0.0956)  ]	[ MSE value:  tensor(0.0156)  ]	[ Pearson Coefficient:  0.37061228175354927  ]
------------------------------------------------------------------------------------------------------
[Epoch:  2 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 169.06it/s]


Validation scores: [ MAE value:  tensor(0.0823)  ]	[ MSE value:  tensor(0.0120)  ]	[ Pearson Coefficient:  0.667476816337397  ]
---------------------
Test scores: [ MAE value:  tensor(0.0806)  ]	[ MSE value:  tensor(0.0109)  ]	[ Pearson Coefficient:  0.6076850100289147  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 171.68it/s]

[ MAE value:  tensor(0.0810)  ]	[ MSE value:  tensor(0.0110)  ]	[ Pearson Coefficient:  0.6727149104643894  ]
------------------------------------------------------------------------------------------------------
[Epoch:  3 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 168.38it/s]


Validation scores: [ MAE value:  tensor(0.0745)  ]	[ MSE value:  tensor(0.0098)  ]	[ Pearson Coefficient:  0.7071546017067434  ]
---------------------
Test scores: [ MAE value:  tensor(0.0733)  ]	[ MSE value:  tensor(0.0090)  ]	[ Pearson Coefficient:  0.6675901967340861  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 175.86it/s]

[ MAE value:  tensor(0.0702)  ]	[ MSE value:  tensor(0.0082)  ]	[ Pearson Coefficient:  0.7403787393864666  ]
------------------------------------------------------------------------------------------------------
[Epoch:  4 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 169.45it/s]


Validation scores: [ MAE value:  tensor(0.0715)  ]	[ MSE value:  tensor(0.0090)  ]	[ Pearson Coefficient:  0.7271050203725442  ]
---------------------
Test scores: [ MAE value:  tensor(0.0709)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.6921118564609162  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 171.00it/s]

[ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7666481025368861  ]
------------------------------------------------------------------------------------------------------
[Epoch:  5 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 168.69it/s]


Validation scores: [ MAE value:  tensor(0.0698)  ]	[ MSE value:  tensor(0.0086)  ]	[ Pearson Coefficient:  0.7416203744503614  ]
---------------------
Test scores: [ MAE value:  tensor(0.0694)  ]	[ MSE value:  tensor(0.0081)  ]	[ Pearson Coefficient:  0.7088179315945415  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 166.44it/s]

[ MAE value:  tensor(0.0645)  ]	[ MSE value:  tensor(0.0069)  ]	[ Pearson Coefficient:  0.7826250998081209  ]
------------------------------------------------------------------------------------------------------
[Epoch:  6 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 168.78it/s]


Validation scores: [ MAE value:  tensor(0.0690)  ]	[ MSE value:  tensor(0.0084)  ]	[ Pearson Coefficient:  0.753283376257626  ]
---------------------
Test scores: [ MAE value:  tensor(0.0688)  ]	[ MSE value:  tensor(0.0080)  ]	[ Pearson Coefficient:  0.718962926705357  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 167.94it/s]

[ MAE value:  tensor(0.0637)  ]	[ MSE value:  tensor(0.0067)  ]	[ Pearson Coefficient:  0.7935851022071814  ]
------------------------------------------------------------------------------------------------------
[Epoch:  7 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 169.16it/s]


Validation scores: [ MAE value:  tensor(0.0681)  ]	[ MSE value:  tensor(0.0080)  ]	[ Pearson Coefficient:  0.7599245883255338  ]
---------------------
Test scores: [ MAE value:  tensor(0.0678)  ]	[ MSE value:  tensor(0.0077)  ]	[ Pearson Coefficient:  0.7259640442223045  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 165.50it/s]

[ MAE value:  tensor(0.0622)  ]	[ MSE value:  tensor(0.0064)  ]	[ Pearson Coefficient:  0.8011656998896777  ]
------------------------------------------------------------------------------------------------------
[Epoch:  8 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.52it/s]


Validation scores: [ MAE value:  tensor(0.0677)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7658056200513836  ]
---------------------
Test scores: [ MAE value:  tensor(0.0674)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7317256921289992  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 172.62it/s]

[ MAE value:  tensor(0.0614)  ]	[ MSE value:  tensor(0.0062)  ]	[ Pearson Coefficient:  0.8073081393201369  ]
------------------------------------------------------------------------------------------------------
[Epoch:  9 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.85it/s]


Validation scores: [ MAE value:  tensor(0.0676)  ]	[ MSE value:  tensor(0.0079)  ]	[ Pearson Coefficient:  0.7676084685472387  ]
---------------------
Test scores: [ MAE value:  tensor(0.0671)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7356686078068609  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 169.88it/s]

[ MAE value:  tensor(0.0607)  ]	[ MSE value:  tensor(0.0061)  ]	[ Pearson Coefficient:  0.8130144663120091  ]
------------------------------------------------------------------------------------------------------
[Epoch:  10 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 168.67it/s]


Validation scores: [ MAE value:  tensor(0.0673)  ]	[ MSE value:  tensor(0.0078)  ]	[ Pearson Coefficient:  0.7737045897994071  ]
---------------------
Test scores: [ MAE value:  tensor(0.0668)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7395255633242441  ]
---------------------
Train scores: 

  3%|▎         | 16/479 [00:00<00:02, 156.96it/s]

[ MAE value:  tensor(0.0605)  ]	[ MSE value:  tensor(0.0060)  ]	[ Pearson Coefficient:  0.817579566160366  ]
------------------------------------------------------------------------------------------------------
[Epoch:  11 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.03it/s]


Validation scores: [ MAE value:  tensor(0.0668)  ]	[ MSE value:  tensor(0.0076)  ]	[ Pearson Coefficient:  0.7743083635712337  ]
---------------------
Test scores: [ MAE value:  tensor(0.0662)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7422730372267496  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 165.90it/s]

[ MAE value:  tensor(0.0594)  ]	[ MSE value:  tensor(0.0058)  ]	[ Pearson Coefficient:  0.8215343265945051  ]
------------------------------------------------------------------------------------------------------
[Epoch:  12 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 166.56it/s]


Validation scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7773186517894192  ]
---------------------
Test scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7453182948022036  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 170.48it/s]

[ MAE value:  tensor(0.0589)  ]	[ MSE value:  tensor(0.0057)  ]	[ Pearson Coefficient:  0.8250271764269421  ]
------------------------------------------------------------------------------------------------------
[Epoch:  13 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.36it/s]


Validation scores: [ MAE value:  tensor(0.0664)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7778771356729148  ]
---------------------
Test scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7462153557504867  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 168.51it/s]

[ MAE value:  tensor(0.0584)  ]	[ MSE value:  tensor(0.0056)  ]	[ Pearson Coefficient:  0.8284466749747347  ]
------------------------------------------------------------------------------------------------------
[Epoch:  14 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 166.00it/s]


Validation scores: [ MAE value:  tensor(0.0665)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7793272819090852  ]
---------------------
Test scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7461637408688245  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 170.81it/s]

[ MAE value:  tensor(0.0582)  ]	[ MSE value:  tensor(0.0055)  ]	[ Pearson Coefficient:  0.8311997422514823  ]
------------------------------------------------------------------------------------------------------
[Epoch:  15 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 166.91it/s]


Validation scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7816257888398427  ]
---------------------
Test scores: [ MAE value:  tensor(0.0656)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7481499640283121  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 166.84it/s]

[ MAE value:  tensor(0.0576)  ]	[ MSE value:  tensor(0.0054)  ]	[ Pearson Coefficient:  0.8344274079979205  ]
------------------------------------------------------------------------------------------------------
[Epoch:  16 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 166.72it/s]


Validation scores: [ MAE value:  tensor(0.0663)  ]	[ MSE value:  tensor(0.0075)  ]	[ Pearson Coefficient:  0.7798956817136556  ]
---------------------
Test scores: [ MAE value:  tensor(0.0651)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7491972699514919  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 176.22it/s]

[ MAE value:  tensor(0.0571)  ]	[ MSE value:  tensor(0.0053)  ]	[ Pearson Coefficient:  0.837649747013278  ]
------------------------------------------------------------------------------------------------------
[Epoch:  17 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 166.96it/s]


Validation scores: [ MAE value:  tensor(0.0661)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7823680604331501  ]
---------------------
Test scores: [ MAE value:  tensor(0.0650)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.749902318894181  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 171.88it/s]

[ MAE value:  tensor(0.0566)  ]	[ MSE value:  tensor(0.0052)  ]	[ Pearson Coefficient:  0.8404965032975781  ]
------------------------------------------------------------------------------------------------------
[Epoch:  18 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 165.39it/s]


Validation scores: [ MAE value:  tensor(0.0658)  ]	[ MSE value:  tensor(0.0074)  ]	[ Pearson Coefficient:  0.7857137485898213  ]
---------------------
Test scores: [ MAE value:  tensor(0.0650)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7511510043275953  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 168.84it/s]

[ MAE value:  tensor(0.0562)  ]	[ MSE value:  tensor(0.0051)  ]	[ Pearson Coefficient:  0.8435900779556458  ]
------------------------------------------------------------------------------------------------------
[Epoch:  19 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 165.68it/s]


Validation scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7883271698657516  ]
---------------------
Test scores: [ MAE value:  tensor(0.0647)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7524624123752998  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 166.80it/s]

[ MAE value:  tensor(0.0557)  ]	[ MSE value:  tensor(0.0050)  ]	[ Pearson Coefficient:  0.8465996805880709  ]
------------------------------------------------------------------------------------------------------
[Epoch:  20 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.76it/s]


Validation scores: [ MAE value:  tensor(0.0654)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7882964877453545  ]
---------------------
Test scores: [ MAE value:  tensor(0.0650)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7545399295811894  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 165.84it/s]

[ MAE value:  tensor(0.0553)  ]	[ MSE value:  tensor(0.0049)  ]	[ Pearson Coefficient:  0.8496333647214472  ]
------------------------------------------------------------------------------------------------------
[Epoch:  21 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 167.09it/s]


Validation scores: [ MAE value:  tensor(0.0659)  ]	[ MSE value:  tensor(0.0073)  ]	[ Pearson Coefficient:  0.7844460225770649  ]
---------------------
Test scores: [ MAE value:  tensor(0.0645)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7546919514821934  ]
---------------------
Train scores: 

  4%|▎         | 17/479 [00:00<00:02, 168.40it/s]

[ MAE value:  tensor(0.0547)  ]	[ MSE value:  tensor(0.0048)  ]	[ Pearson Coefficient:  0.8520507885934606  ]
------------------------------------------------------------------------------------------------------
[Epoch:  22 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 166.07it/s]


Validation scores: [ MAE value:  tensor(0.0655)  ]	[ MSE value:  tensor(0.0072)  ]	[ Pearson Coefficient:  0.7874651895726961  ]
---------------------
Test scores: [ MAE value:  tensor(0.0648)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7554102922654012  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 170.22it/s]

[ MAE value:  tensor(0.0541)  ]	[ MSE value:  tensor(0.0047)  ]	[ Pearson Coefficient:  0.8554562317088216  ]
------------------------------------------------------------------------------------------------------
[Epoch:  23 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 164.58it/s]


Validation scores: [ MAE value:  tensor(0.0653)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7893310040278932  ]
---------------------
Test scores: [ MAE value:  tensor(0.0646)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7561587915961809  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 169.93it/s]

[ MAE value:  tensor(0.0535)  ]	[ MSE value:  tensor(0.0046)  ]	[ Pearson Coefficient:  0.8585398953912436  ]
------------------------------------------------------------------------------------------------------
[Epoch:  24 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 164.73it/s]


Validation scores: [ MAE value:  tensor(0.0648)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7923591454431228  ]
---------------------
Test scores: [ MAE value:  tensor(0.0645)  ]	[ MSE value:  tensor(0.0070)  ]	[ Pearson Coefficient:  0.7565379479701627  ]
---------------------
Train scores: 

  4%|▍         | 18/479 [00:00<00:02, 173.52it/s]

[ MAE value:  tensor(0.0530)  ]	[ MSE value:  tensor(0.0046)  ]	[ Pearson Coefficient:  0.8617601355721737  ]
------------------------------------------------------------------------------------------------------
[Epoch:  25 / 25 ]


100%|██████████| 479/479 [00:02<00:00, 168.53it/s]


Validation scores: [ MAE value:  tensor(0.0649)  ]	[ MSE value:  tensor(0.0070)  ]	[ Pearson Coefficient:  0.7923137728259763  ]
---------------------
Test scores: [ MAE value:  tensor(0.0649)  ]	[ MSE value:  tensor(0.0071)  ]	[ Pearson Coefficient:  0.7568958788050352  ]
---------------------
Train scores: [ MAE value:  tensor(0.0527)  ]	[ MSE value:  tensor(0.0045)  ]	[ Pearson Coefficient:  0.8645342180155735  ]
------------------------------------------------------------------------------------------------------


In [151]:
predf5_train = np.array(fcnnf5.forward(np.concatenate((trainmf, bertword_train,bertsent_train, glovesent_train, gloveword_train),axis=1)).detach().cpu()).reshape(-1)
predf5 = np.array(fcnnf5.forward(np.concatenate((testmf, bertword_test,bertsent_test, glovesent_test, gloveword_test),axis=1)).detach().cpu()).reshape(-1)
predf5_trial = np.array(fcnnf5.forward(np.concatenate((trialmf, bertword_trial,bertsent_trial, glovesent_trial, gloveword_trial),axis=1)).detach().cpu()).reshape(-1)

In [153]:
stats.pearsonr(predf5, testdf['complexity'])

(0.7568958807949003, 3.258471518012556e-171)

## Other ML Models

Gradient-Boosting Regressor

In [136]:
gb1 = GradientBoostingRegressor(random_state=0, n_estimators=100)

featmat_train = np.concatenate((trainmf, gloveword_train, glovesent_train, bertword_train),axis=1)
featmat_test = np.concatenate((testmf, gloveword_test, glovesent_test, bertword_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_test = np.array(testdf['complexity'])

gb1.fit(featmat_train, labels_train)

predgb1 = gb1.predict(featmat_test)
predgb1_train = gb1.predict(featmat_train)

stats.pearsonr(predgb1, labels_test)

(0.7472669247530377, 1.3948368482915857e-164)

Adaptive-Boosting Regressor

In [137]:
ab1 = AdaBoostRegressor(random_state=0, n_estimators=100)

featmat_train = np.concatenate((trainmf, gloveword_train, glovesent_train, bertword_train),axis=1)
featmat_test = np.concatenate((testmf, gloveword_test, glovesent_test, bertword_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_test = np.array(testdf['complexity'])

ab1.fit(featmat_train, labels_train)

predab1 = ab1.predict(featmat_test)
predab1_train = ab1.predict(featmat_train)

stats.pearsonr(predab1, labels_test)

(0.7303881735116872, 1.1772726222207615e-153)

Random Forest Regressor

In [139]:
rf1 = RandomForestRegressor(max_depth=7, random_state=0)

featmat_train = np.concatenate((trainmf, gloveword_train, glovesent_train, bertword_train),axis=1)
featmat_test = np.concatenate((testmf, gloveword_test, glovesent_test, bertword_test),axis=1)

labels_train = np.array(traindf['complexity'])
labels_test = np.array(testdf['complexity'])

rf1.fit(featmat_train, labels_train)

predrf1 = rf1.predict(featmat_test)
predrf1_train = rf1.predict(featmat_train)

stats.pearsonr(predrf1, labels_test)

(0.7259772687934656, 6.175532466576186e-151)

# Combine Predictions Ensemble

Using KNN Regressor

In [168]:
neigh = KNeighborsRegressor(n_neighbors=800)

final_train = np.column_stack((predab1_train,predgb1_train,predrf1_train,predf1_train,predf2_train,predf5_train))
final_test = np.column_stack((predab1,predgb1,predrf1,predf1,predf2,predf5))

labels_train = np.array(traindf['complexity'])
labels_test = np.array(testdf['complexity'])

neigh.fit(final_train,labels_train)

pred_final1 = neigh.predict(final_test)

stats.pearsonr(pred_final1, labels_test)

(0.7652538929412512, 3.1624391921968604e-177)

# Submit Final Predictions

In [169]:
submit_df = pd.DataFrame()
submit_df['id'] = testdf['id']
submit_df['predictions'] = pred_final1

submit_df.to_csv('test_res_single.csv', header=False, index=False)