In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from gensim.models import Word2Vec as wv

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import math
#import PhysicallyInformedLossFunction as PhysLoss



In [2]:
# Load the Word2Vec model trained on all abstracts and collect its vocabulary.
# The vocabulary is used below to drop word pairs the model cannot embed.
data = '/Users/Thomas/Desktop/BETO2020-master/Ant_Syn_Scraping/all_abstracts_model'
os.chdir(data)
model1 = wv.load('all_abstract_model.model')

# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# support both so the notebook runs on either release line.
_keyed_vectors = model1.wv
if hasattr(_keyed_vectors, 'key_to_index'):
    vocabulary1 = list(_keyed_vectors.key_to_index)
else:
    vocabulary1 = list(_keyed_vectors.vocab)
#use model.build_vocab(sentence, update=True) to add missing words to model's vocabulary?
#or delete the rows that yield the KeyError?

In [21]:
# Load the labelled synonym/antonym word pairs and keep only the rows whose
# two words are both present in the Word2Vec vocabulary (vocabulary1).
data = '/Users/Thomas/Desktop/BETO2020-master/Ant_Syn_Scraping/'
os.chdir(data)

# NOTE(review): the original call also passed `skip_rows=1`. That is not a
# read_excel parameter (the real one is `skiprows`): older pandas ignored it
# and current pandas raises TypeError. It is dropped here so the rows read stay
# the same as when the keyword was ignored; the first row is still removed
# explicitly by `data_df[1:]` below.
data_df = pd.read_excel('Carbon_SynAntList_Full_Refined_copy.xlsx', nrows=2000, index_col=0)
data_df = data_df.rename(columns = {'Unnamed: 1':'word 1', 'Unnamed: 2':'word 2','Unnamed: 3':'relationship', 'Unnamed: 4': 'label'})

#Adding columns for the syn and ant score labeling (filled with 0 by fillna)
data_df['syn score'] = np.nan
data_df['ant score'] = np.nan
data_df = data_df.fillna(0)
data_df = data_df[1:]

#finding which words are in the pd but not in vocabulary1
list1 = list(data_df['word 1'])
list2 = list(data_df['word 2'])
missing = list((set(list1).difference(vocabulary1))) + list((set(list2).difference(vocabulary1)))

#keeping only the rows in the pd that have both words in vocabulary1
_in_vocabulary = ~data_df['word 1'].isin(missing) & ~data_df['word 2'].isin(missing)

#resetting indices after the mask; reset_index() returns a new frame, so later
#column assignments do not act on a slice of the unfiltered frame
data_df = data_df[_in_vocabulary].reset_index()


In [22]:
# Replace every word by its Word2Vec embedding (as a plain list of floats) and
# derive the synonym / antonym score labels.
#
# Whole-column assignment is used instead of `data_df[col].iloc[i] = ...`:
# that chained indexing can write to a temporary copy (SettingWithCopyWarning)
# and leave data_df unchanged.
for column in ('word 1', 'word 2'):
    data_df[column] = data_df[column].map(lambda word: model1.wv[str(word)].tolist())

# Only rows with label == 1 get a non-zero score:
#   syn -> (syn score, ant score) = ( 1, -1)
#   ant -> (syn score, ant score) = (-1,  1)
#   anything else                 = ( 0,  0)
_confirmed = data_df['label'] == 1
_is_syn = _confirmed & (data_df['relationship'] == 'syn')
_is_ant = _confirmed & (data_df['relationship'] == 'ant')

data_df['syn score'] = np.select([_is_syn, _is_ant], [1.0, -1.0], default=0.0)
data_df['ant score'] = np.select([_is_syn, _is_ant], [-1.0, 1.0], default=0.0)
    

In [23]:
# Save the embedded word pairs and their scores; a later cell reloads this file.
data_df.to_json('Phase_I_DATA.json')

In [2]:
# Reload the saved Phase I data, requesting float32 for the columns read_json can convert.
data_df = pd.read_json('Phase_I_DATA.json', dtype = np.float32)

In [3]:
# Features are the two embedded words, targets are the two scores.
X = data_df[['word 1', 'word 2']]
Y = data_df[['syn score', 'ant score']]

# A fixed seed makes the 80/20 shuffle reproducible. Without it every run
# writes a different partition to Phase_I_Train.json / Phase_I_Test.json.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.2, shuffle = True, random_state = SPLIT_SEED)

# Individual columns of each partition
w1_train, w2_train = x_train['word 1'], x_train['word 2']
w1_test, w2_test = x_test['word 1'], x_test['word 2']
ss_train, as_train = y_train['syn score'], y_train['ant score']
ss_test, as_test = y_test['syn score'], y_test['ant score']

# Re-assemble one frame per partition with features and targets side by side
train_data = {'word 1': w1_train, 'word 2': w2_train, 'syn score': ss_train, 'ant score': as_train}
test_data = {'word 1': w1_test, 'word 2': w2_test, 'syn score': ss_test, 'ant score': as_test}
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)


In [4]:
# Write the train and test partitions to separate JSON files
# (these file names are the ones read by the Dataset classes below).
train_df.to_json('Phase_I_Train.json')
test_df.to_json('Phase_I_Test.json')

In [5]:
#For demonstration: reload the training partition and display it
data_test = pd.read_json('Phase_I_Train.json', dtype = np.float32)
data_test

Unnamed: 0,word 1,word 2,syn score,ant score
851,"[2.2395038605, 3.5965807438, -1.2376616001, 1....","[-1.2664538622000001, 1.3258943558, -0.6398437...",0.0,0.0
194,"[-3.3570842743, 3.6897196770000003, -3.2487981...","[0.20641681550000002, 0.42983156440000003, 2.8...",1.0,-1.0
940,"[-2.1888554096, 0.4787855446, -0.3963083327, -...","[-0.3114672303, 0.271281451, 0.2920138836, -0....",0.0,0.0
163,"[5.435072422, -2.632392168, -0.1703399569, 8.6...","[-0.30101034050000003, 0.6878371835, 8.9787521...",-1.0,1.0
739,"[-2.3573684692, 2.4296035767, 0.37209683660000...","[-0.140732944, 1.2832295894999999, -0.07431017...",0.0,0.0
...,...,...,...,...
1025,"[-0.6243764758, -1.7411718369, 0.1345199645, 7...","[0.7221094966, 2.4051239491, 0.1043295786, 0.5...",0.0,0.0
466,"[1.9019999504, 6.5153040886, 0.9711658955, 1.0...","[0.4206728637, -0.1374336779, 0.2669203579, -0...",0.0,0.0
1132,"[1.4298911095, -0.4888526797, -0.2914942503000...","[-1.2425853014000001, -0.7078535557000001, 0.5...",0.0,0.0
1533,"[-1.0172487497, -0.9772694111, -5.5444698334, ...","[7.4718694687, 3.5576741695000003, -1.54801023...",1.0,-1.0


In [8]:
#For demonstration: one (word 1, word 2) tuple of embeddings per row
test_list = [*zip(data_test['word 1'], data_test['word 2'])]
test_list

[([2.2395038605,
   3.5965807438,
   -1.2376616001,
   1.7468365431000001,
   -3.1807043552,
   1.6663434504999999,
   -2.0863444805,
   -6.1954593658,
   3.5108673573,
   6.7212710381,
   2.5206851959,
   2.1309769154,
   2.6051411629,
   4.4279904366,
   -2.4078772068,
   1.8280955553,
   -6.2539834976,
   -1.5596629381,
   0.7587798238,
   1.9181656837,
   1.6043621302000002,
   0.19026087220000001,
   3.4019424915,
   2.2992506027,
   0.2812325954,
   0.3731970489,
   5.0699224472,
   1.6392213106,
   2.6245405674,
   3.7503473759,
   -0.5573862791,
   1.8348488808,
   4.1200008392,
   -4.7015891075,
   0.9878739119000001,
   -0.3640704453,
   -0.3355996013,
   -5.7318310738000005,
   3.9466567039,
   -6.4235076904,
   2.9458572865,
   -3.8856778145,
   -3.0618305206,
   2.780633688,
   3.9797980785,
   0.0560765676,
   4.6548089981,
   1.1819375753,
   3.5565371513,
   5.7520780563],
  [-1.2664538622000001,
   1.3258943558,
   -0.6398437023,
   -2.139064312,
   -0.6382690668000001

In [9]:
#For demonstration: the list of embedding pairs converts directly to a tensor
test_list_tensor = torch.tensor(test_list)
test_list_tensor

tensor([[[ 2.2395,  3.5966, -1.2377,  ...,  1.1819,  3.5565,  5.7521],
         [-1.2665,  1.3259, -0.6398,  ..., -1.7995,  1.6326,  3.4495]],

        [[-3.3571,  3.6897, -3.2488,  ..., -4.1662,  0.3341,  4.0005],
         [ 0.2064,  0.4298,  2.8434,  ..., -1.6797, -2.4554,  0.7218]],

        [[-2.1889,  0.4788, -0.3963,  ...,  1.2591,  0.3033,  2.3079],
         [-0.3115,  0.2713,  0.2920,  ...,  0.0491,  0.1167,  0.7997]],

        ...,

        [[ 1.4299, -0.4889, -0.2915,  ..., -0.5280,  0.8058, -2.0203],
         [-1.2426, -0.7079,  0.5545,  ..., -2.3977,  2.7971,  1.5360]],

        [[-1.0172, -0.9773, -5.5445,  ...,  2.3440,  2.0958, -2.1268],
         [ 7.4719,  3.5577, -1.5480,  ...,  1.8921,  2.2664,  4.5135]],

        [[ 0.2229,  1.7293,  0.0486,  ..., -1.3237,  1.4602,  9.1237],
         [-0.5281,  0.1218,  0.1300,  ...,  0.1321,  0.0896,  0.0867]]])

In [15]:
#For demonstration: reload the training partition with a fresh 0..n-1 index
#(the previous index is kept as a regular column)
data_test = pd.read_json('Phase_I_Train.json', dtype = np.float32).reset_index()

In [None]:
#Hyper parameters: epochs, mini-batch size, optimizer step size
num_epochs, batch_size, learning_rate = 100, 50, 0.008

# Device configuration: first CUDA GPU when one is available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
class Phase_I_Train_Dataset(Dataset):
    """Training word pairs and their synonym / antonym scores.

    Each sample is ``(x, y)`` where ``x`` stacks the embeddings of
    ``word 1`` and ``word 2`` and ``y`` holds ``[syn score, ant score]``.

    Args:
        path: JSON file with the columns 'word 1', 'word 2', 'syn score'
            and 'ant score'. Defaults to the file written by the split cell.
    """

    def __init__(self, path='Phase_I_Train.json'):

        data = pd.read_json(path, dtype = np.float32)
        self.len = data.shape[0]

        # Both words come from the frame loaded here. The original read
        # 'word 2' from the notebook-global `data_test`, which tied the dataset
        # to whatever that variable held at the time.
        data_x = [[w1, w2] for w1, w2 in zip(data['word 1'], data['word 2'])]
        data_y = data[['syn score', 'ant score']].to_numpy(dtype=np.float32)

        #split into x_data our features and y_data our targets
        self.x_data = torch.tensor(data_x, dtype=torch.float32)
        self.y_data = torch.tensor(data_y)

    def __len__(self):
        """Number of word pairs."""
        return self.len

    def __getitem__(self, index):
        """Return the (features, targets) of sample ``index``.

        The original returned the full tensors regardless of ``index``, so
        every item handed to the DataLoader was the entire dataset.
        """
        return self.x_data[index], self.y_data[index]

if __name__ == '__main__':
    dataset = Phase_I_Train_Dataset()

In [None]:
class Phase_I_Test_Dataset(Dataset):
    """Held-out word pairs and their synonym / antonym scores.

    Each sample is ``(x, y)`` where ``x`` stacks the embeddings of
    ``word 1`` and ``word 2`` and ``y`` holds ``[syn score, ant score]``.

    Args:
        path: JSON file with the columns 'word 1', 'word 2', 'syn score'
            and 'ant score'. Defaults to the file written by the split cell.
    """

    def __init__(self, path='Phase_I_Test.json'):

        data = pd.read_json(path, dtype = np.float32)
        self.len = data.shape[0]

        # Both words come from the test frame. The original zipped the test
        # 'word 1' with 'word 2' of the global `data_test`, which the notebook
        # loads from the TRAINING file, so the test pairs were mismatched.
        data_x = [[w1, w2] for w1, w2 in zip(data['word 1'], data['word 2'])]
        data_y = data[['syn score', 'ant score']].to_numpy(dtype=np.float32)

        #split into x_data our features and y_data our targets
        self.x_data = torch.tensor(data_x, dtype=torch.float32)
        self.y_data = torch.tensor(data_y)

    def __len__(self):
        """Number of word pairs."""
        return self.len

    def __getitem__(self, index):
        """Return the (features, targets) of sample ``index``."""
        return self.x_data[index], self.y_data[index]

if __name__ == '__main__':
    dataset = Phase_I_Test_Dataset()

In [None]:
#DataLoader cell: both partitions are served as shuffled mini-batches
_loader_options = dict(batch_size = batch_size, shuffle = True)
training_data_set = DataLoader(Phase_I_Train_Dataset(), **_loader_options)
testing_data_set = DataLoader(Phase_I_Test_Dataset(), **_loader_options)

In [None]:
class Phase_I_NN(nn.Module):
    """Two-branch network scoring synonymy and antonymy of a word pair.

    Each word embedding goes through two shared hidden layers and is then
    projected by a synonym branch (``S_branch``) and an antonym branch
    (``A_branch``). The scores are cosine similarities between the branch
    encodings of the two words.

    The inputs are already embeddings, so no ``nn.Embedding`` lookup is applied.
    ReLU is used between linear layers; the original comments ask for an
    activation after each linear layer and list ReLU as a candidate.

    Args:
        in_dims: size of one word embedding.
        out_dims: number of scores produced. Stored for reference only; the
            forward pass always returns the two scores described below.
    """

    def __init__(self, in_dims, out_dims):
        super().__init__()
        self.in_dims = in_dims
        self.out_dims = out_dims

        #hidden layers shared by both words
        self.hidden_layer = nn.Linear(in_dims, 32)
        self.hidden_layer1 = nn.Linear(32, 16)

        #synonym and antonym subspace branches (same layout, separate weights)
        self.S_branch = self._make_branch(16)
        self.A_branch = self._make_branch(16)

    @staticmethod
    def _make_branch(in_features):
        """Build one subspace branch: expand to 300 units, compress back to 50."""
        return nn.Sequential(
            nn.Linear(in_features, 50),
            nn.ReLU(),
            nn.Dropout(0.1), #to limit overfitting
            nn.Linear(50, 100), #expand
            nn.ReLU(),
            nn.Linear(100, 300),
            nn.ReLU(),
            nn.Linear(300, 100),
            nn.ReLU(),
            nn.Linear(100, 50), #compress; no activation so encodings keep their sign
        )

    def _encode(self, word):
        """Pass one word embedding through the shared hidden layers."""
        hidden = F.relu(self.hidden_layer(word))
        return F.relu(self.hidden_layer1(hidden))

    def forward(self, words):
        """Score a batch of word pairs.

        Args:
            words: either a ``(w1, w2)`` tuple/list of tensors shaped
                ``(..., in_dims)``, or a single tensor shaped
                ``(..., 2, in_dims)`` as produced by the Phase I datasets.

        Returns:
            ``(synonymy_score, antonymy_score)``, each a cosine similarity
            computed along the last dimension.
        """
        if isinstance(words, (tuple, list)):
            w1, w2 = words
        else:
            w1, w2 = words.unbind(dim=-2)

        out_w1 = self._encode(w1)
        out_w2 = self._encode(w2)

        #pass each word through each branch to be situated in the subspaces
        S1_out = self.S_branch(out_w1)
        S2_out = self.S_branch(out_w2)
        A1_out = self.A_branch(out_w1)
        A2_out = self.A_branch(out_w2)

        # math.cos works on one Python float, not on two tensors: cosine
        # similarity between encodings is F.cosine_similarity.
        synonymy_score = F.cosine_similarity(S1_out, S2_out, dim=-1)
        antonymy_score = torch.max(
            F.cosine_similarity(A1_out, S2_out, dim=-1),
            F.cosine_similarity(A2_out, S1_out, dim=-1),
        )

        return synonymy_score, antonymy_score

In [None]:
def S_loss(output, target):
    """Synonymy loss: mean squared error between predicted and target scores.

    NOTE(review): the original function had no body. MSE is used as a stand-in
    because the Phase II code in this notebook scores synonymy with
    nn.MSELoss; replace it once the threshold-based loss is designed.

    Args:
        output: predicted synonymy scores (tensor).
        target: target synonymy scores; converted to the dtype and device
            of ``output`` before comparison.

    Returns:
        Scalar tensor holding the mean squared error.
    """
    target = torch.as_tensor(target, dtype=output.dtype, device=output.device)
    return F.mse_loss(output, target)

In [None]:
def A_loss(output, target):
    """Antonymy loss: mean squared error between predicted and target scores.

    NOTE(review): the original function had no body. MSE is used as a stand-in
    (the same criterion the Phase II code in this notebook uses for synonymy);
    replace it once the threshold-based loss is designed.

    Args:
        output: predicted antonymy scores (tensor).
        target: target antonymy scores; converted to the dtype and device
            of ``output`` before comparison.

    Returns:
        Scalar tensor holding the mean squared error.
    """
    target = torch.as_tensor(target, dtype=output.dtype, device=output.device)
    return F.mse_loss(output, target)

In [None]:
def Phase_I_train_model(model, Phase_I_training_data_set, optimizer,
                        syn_criterion=None, ant_criterion=None):
    """Train ``model`` for one epoch on the Phase I training batches.

    Args:
        model: module returning ``(synonymy_score, antonymy_score)`` for a
            batch of features.
        Phase_I_training_data_set: iterable of ``(features, labels)`` batches;
            ``labels[:, 0]`` is the synonym score and ``labels[:, 1]`` the
            antonym score.
        optimizer: optimizer stepping the parameters of ``model``.
        syn_criterion: callable ``(output, target) -> loss``; defaults to
            ``S_loss``.
        ant_criterion: callable ``(output, target) -> loss``; defaults to
            ``A_loss``.

    Returns:
        ``(model, train_epoch_loss, syn_train_epoch_loss, ant_train_epoch_loss)``
        where each loss is a one-element list holding the mean batch loss of
        the epoch (``nan`` when the loader yields no batch).
    """
    # Resolve the defaults at call time so the loss functions only have to
    # exist when they are actually used.
    if syn_criterion is None:
        syn_criterion = S_loss
    if ant_criterion is None:
        ant_criterion = A_loss

    train_losses = []
    syn_train_losses = []
    ant_train_losses = []

    # Batches must share device and dtype with the model weights. The original
    # np.double(...) call turned them into float64 NumPy arrays instead.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    target_dtype = first_param.dtype if first_param is not None else torch.float32

    #switch model to training mode
    model.train()

    for features, labels in Phase_I_training_data_set:

        features = torch.as_tensor(features).to(device=target_device, dtype=target_dtype)
        labels = torch.as_tensor(labels).to(device=target_device, dtype=target_dtype)

        model.zero_grad() #zero out any gradients from prior loops
        synonymy_score, antonymy_score = model(features) #gather model predictions for this loop

        #calculate error in the predictions; columns select the score, rows the sample
        syn_loss = syn_criterion(synonymy_score, labels[:, 0])
        ant_loss = ant_criterion(antonymy_score, labels[:, 1])
        total_loss = syn_loss + ant_loss #+ L_M

        #backpropagate the summed loss (same gradients as backward on both losses)
        total_loss.backward()
        optimizer.step()

        #save loss for this batch
        train_losses.append(total_loss.item())
        syn_train_losses.append(syn_loss.item())
        ant_train_losses.append(ant_loss.item())

    def _epoch_mean(batch_losses):
        # nan instead of ZeroDivisionError when no batch was seen
        return sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')

    #calculate and save total error for this epoch of training
    train_epoch_loss = [_epoch_mean(train_losses)]
    syn_train_epoch_loss = [_epoch_mean(syn_train_losses)]
    ant_train_epoch_loss = [_epoch_mean(ant_train_losses)]

    return model, train_epoch_loss, syn_train_epoch_loss, ant_train_epoch_loss

In [None]:
def _score_accuracy(scores, targets, threshold=0.5):
    """Fraction of scores whose thresholded class equals the target.

    A score above ``threshold`` counts as class 1, below ``-threshold`` as
    class -1, anything in between as class 0.
    NOTE(review): the thresholding rule is an assumption (the original called
    an undefined ``accuracy``); confirm it against the intended metric.
    """
    predicted = torch.zeros_like(scores)
    predicted[scores > threshold] = 1
    predicted[scores < -threshold] = -1
    return (predicted == targets).float().mean()


def Phase_I_eval_model(model, testing_data_set, optimizer,
                       syn_criterion=None, ant_criterion=None, threshold=0.5):
    """Evaluate ``model`` on the Phase I test batches without updating it.

    Args:
        model: module returning ``(synonymy_score, antonymy_score)`` for a
            batch of features.
        testing_data_set: iterable of ``(inputs, labels)`` batches;
            ``labels[:, 0]`` is the synonym score and ``labels[:, 1]`` the
            antonym score.
        optimizer: unused; kept so existing calls keep working.
        syn_criterion: callable ``(output, target) -> loss``; defaults to
            ``S_loss``.
        ant_criterion: callable ``(output, target) -> loss``; defaults to
            ``A_loss``.
        threshold: cut-off used to turn a score into a -1 / 0 / 1 prediction
            for the accuracy figures.

    Returns:
        ``(test_epoch_loss, syn_test_epoch_loss, ant_test_epoch_loss,
        syn_epoch_acc, ant_epoch_acc)`` as floats averaged over the batches
        (``nan`` when the loader yields no batch).
    """
    if syn_criterion is None:
        syn_criterion = S_loss
    if ant_criterion is None:
        ant_criterion = A_loss

    # Batches must share device and dtype with the model weights. The original
    # np.double(...) call turned them into float64 NumPy arrays instead.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    target_dtype = first_param.dtype if first_param is not None else torch.float32

    #evaluate the model
    model.eval()

    test_losses = []
    syn_test_losses = []
    ant_test_losses = []

    syn_test_acc_list = []
    ant_test_acc_list = []

    #don't update nodes during evaluation b/c not training
    with torch.no_grad():
        for inputs, labels in testing_data_set:

            inputs = torch.as_tensor(inputs).to(device=target_device, dtype=target_dtype)
            labels = torch.as_tensor(labels).to(device=target_device, dtype=target_dtype)
            syn_target, ant_target = labels[:, 0], labels[:, 1]

            synonymy_score, antonymy_score = model(inputs)

            # calculate loss per batch of testing data
            syn_test_loss = syn_criterion(synonymy_score, syn_target)
            ant_test_loss = ant_criterion(antonymy_score, ant_target)
            test_loss = syn_test_loss + ant_test_loss #+L_M

            test_losses.append(test_loss.item())
            syn_test_losses.append(syn_test_loss.item())
            # the original never filled this list, so the antonym loss was lost
            ant_test_losses.append(ant_test_loss.item())

            syn_test_acc_list.append(_score_accuracy(synonymy_score, syn_target, threshold).item())
            ant_test_acc_list.append(_score_accuracy(antonymy_score, ant_target, threshold).item())

    def _epoch_mean(batch_values):
        # nan instead of ZeroDivisionError when no batch was seen
        return sum(batch_values) / len(batch_values) if batch_values else float('nan')

    test_epoch_loss = _epoch_mean(test_losses)
    syn_test_epoch_loss = _epoch_mean(syn_test_losses)
    ant_test_epoch_loss = _epoch_mean(ant_test_losses)

    syn_epoch_acc = _epoch_mean(syn_test_acc_list)
    ant_epoch_acc = _epoch_mean(ant_test_acc_list)

    print(f"Total Epoch Testing Loss is: {test_epoch_loss}")

    return test_epoch_loss, syn_test_epoch_loss, ant_test_epoch_loss, syn_epoch_acc, ant_epoch_acc


In [None]:
# Instantiate the Phase I network.
# It takes in pairs of 50-dimensional word embeddings and predicts a synonymy
# and an antonymy score. `SYN_TEST` is not defined in this notebook; the class
# defined above is `Phase_I_NN`.
model = Phase_I_NN(in_dims = 50, out_dims = 2).to(device)

#define the optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)


In [None]:
#empty lists to hold one value per epoch
train_epoch_losses = []
syn_train_epoch_losses = []
ant_train_epoch_losses = []

test_epoch_losses = []
syn_test_epoch_losses = []
ant_test_epoch_losses = []

syn_test_epoch_accuracies = []
ant_test_epoch_accuracies = []

for epoch in range(num_epochs):

    # Phase_I_train_model returns the model followed by three one-element
    # lists, so extend (not append) keeps the histories flat.
    model, train_epoch_loss, syn_train_epoch_loss, ant_train_epoch_loss = Phase_I_train_model(
        model = model, Phase_I_training_data_set = training_data_set, optimizer = optimizer)

    train_epoch_losses.extend(train_epoch_loss)
    syn_train_epoch_losses.extend(syn_train_epoch_loss)
    ant_train_epoch_losses.extend(ant_train_epoch_loss)

    # Phase_I_eval_model returns three losses and two accuracies
    test_epoch_loss, syn_test_epoch_loss, ant_test_epoch_loss, syn_epoch_acc, ant_epoch_acc = Phase_I_eval_model(
        model = model, testing_data_set = testing_data_set, optimizer = optimizer)

    test_epoch_losses.append(test_epoch_loss)
    syn_test_epoch_losses.append(syn_test_epoch_loss)
    ant_test_epoch_losses.append(ant_test_epoch_loss)

    syn_test_epoch_accuracies.append(syn_epoch_acc)
    ant_test_epoch_accuracies.append(ant_epoch_acc)

In [None]:
def Phase_II_train_model(model, training_data_set, optimizer):
    """Train ``model`` for one epoch with a mean-squared-error criterion.

    Args:
        model: module whose output is compared with the labels. A tuple/list
            output (such as the two scores of the Phase I network) is stacked
            along a new last dimension first.
            NOTE(review): confirm this is the intended Phase II target layout.
        training_data_set: iterable of ``(features, labels)`` batches.
        optimizer: optimizer stepping the parameters of ``model``.

    Returns:
        ``(train_epoch_loss, syn_train_epoch_loss)``, two one-element lists
        holding the mean batch loss of the epoch (``nan`` when the loader
        yields no batch).
    """
    train_losses = []
    syn_train_losses = []

    # Batches must share device and dtype with the model weights. The original
    # np.double(...) call turned them into float64 NumPy arrays instead.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    target_dtype = first_param.dtype if first_param is not None else torch.float32

    #switch model to training mode
    model.train()

    syn_criterion = nn.MSELoss()

    for features, labels in training_data_set:

        features = torch.as_tensor(features).to(device=target_device, dtype=target_dtype)
        labels = torch.as_tensor(labels).to(device=target_device, dtype=target_dtype)

        model.zero_grad() #zero out any gradients from prior loops
        syn_out = model(features) #gather model predictions for this loop
        if isinstance(syn_out, (tuple, list)):
            syn_out = torch.stack(tuple(syn_out), dim=-1)

        #calculate error in the predictions
        syn_loss = syn_criterion(syn_out, labels)
        total_loss = syn_loss

        #backpropagate and update the weights
        total_loss.backward()
        optimizer.step()

        #save loss for this batch
        train_losses.append(total_loss.item())
        syn_train_losses.append(syn_loss.item())

    def _epoch_mean(batch_losses):
        # nan instead of ZeroDivisionError when no batch was seen
        return sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')

    #calculate and save total error for this epoch of training
    train_epoch_loss = [_epoch_mean(train_losses)]
    syn_train_epoch_loss = [_epoch_mean(syn_train_losses)]

    #report progress
    print(f"Total Epoch Training Loss is: {train_epoch_loss}")

    return train_epoch_loss, syn_train_epoch_loss

In [None]:
def Phase_II_eval_model(model, testing_data_set, optimizer):
    """Evaluate ``model`` with a mean-squared-error criterion, without updates.

    Args:
        model: module whose output is compared with the labels. A tuple/list
            output is stacked along a new last dimension first.
            NOTE(review): confirm this is the intended Phase II target layout.
        testing_data_set: iterable of ``(inputs, labels)`` batches.
        optimizer: unused; kept so existing calls keep working.

    Returns:
        ``(test_epoch_loss, syn_test_epoch_loss)`` as floats averaged over the
        batches (``nan`` when the loader yields no batch).
    """
    # Batches must share device and dtype with the model weights. The original
    # np.double(...) call turned them into float64 NumPy arrays instead.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    target_dtype = first_param.dtype if first_param is not None else torch.float32

    #evaluate the model
    model.eval()

    syn_criterion = nn.MSELoss()

    test_losses = []
    syn_test_losses = []

    #don't update nodes during evaluation b/c not training
    with torch.no_grad():
        for inputs, labels in testing_data_set:

            inputs = torch.as_tensor(inputs).to(device=target_device, dtype=target_dtype)
            labels = torch.as_tensor(labels).to(device=target_device, dtype=target_dtype)

            syn_out = model(inputs)
            if isinstance(syn_out, (tuple, list)):
                syn_out = torch.stack(tuple(syn_out), dim=-1)

            # calculate loss per batch of testing data
            syn_test_loss = syn_criterion(syn_out, labels)
            test_loss = syn_test_loss

            test_losses.append(test_loss.item())
            syn_test_losses.append(syn_test_loss.item())

    def _epoch_mean(batch_losses):
        # nan instead of ZeroDivisionError when no batch was seen
        return sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')

    test_epoch_loss = _epoch_mean(test_losses)
    syn_test_epoch_loss = _epoch_mean(syn_test_losses)

    print(f"Total Epoch Testing Loss is: {test_epoch_loss}")

    return test_epoch_loss, syn_test_epoch_loss


In [None]:
# Instantiate a fresh network for Phase II.
# It takes in pairs of 50-dimensional word embeddings. `SYN_TEST` is not
# defined in this notebook; the class defined above is `Phase_I_NN`.
# NOTE(review): confirm Phase II is meant to reuse the Phase I architecture.
model = Phase_I_NN(in_dims = 50, out_dims = 2).to(device)

#define the optimizer
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)



In [None]:
#empty lists to hold one value per epoch
train_epoch_losses = []
syn_train_epoch_losses = []

test_epoch_losses = []
syn_test_epoch_losses = []

syn_test_epoch_accuracies = []


for epoch in range(num_epochs):

    # `train_model` / `eval_model` are not defined in this notebook; the
    # Phase II functions defined above take exactly these keyword arguments.
    train_epoch_loss, syn_train_epoch_loss = Phase_II_train_model(
        model = model, training_data_set = training_data_set, optimizer = optimizer)

    # the training function returns one-element lists: extend keeps the history flat
    train_epoch_losses.extend(train_epoch_loss)
    syn_train_epoch_losses.extend(syn_train_epoch_loss)

    test_epoch_loss, syn_test_epoch_loss = Phase_II_eval_model(
        model = model, testing_data_set = testing_data_set, optimizer = optimizer)

    test_epoch_losses.append(test_epoch_loss)
    syn_test_epoch_losses.append(syn_test_epoch_loss)