In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from gensim.models import Word2Vec as wv

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
#import PhysicallyInformedLossFunction as PhysLoss



In [2]:
#Vocabulary from Carbon corpus and Word2Vec model trained on all abstracts
#Opening contents of Word2Vec model1
data = '/Users/Thomas/Desktop/BETO2020-master/Ant_Syn_Scraping/all_abstracts_model'
os.chdir(data)
model1 = wv.load('all_abstract_model.model')
vocabulary1 = list(model1.wv.vocab)
#use model.build_vocab(sentence, update=True) to add missing words to model's vocabulary?
#or delete the rows that yield the KeyError?

In [3]:
data = '/Users/Thomas/Desktop/BETO2020-master/Ant_Syn_Scraping/'
os.chdir(data)
data_df = pd.read_excel('Carbon_SynAntList_Full_Refined_copy.xlsx', skip_rows=1, nrows=2000, index_col=0)
data_df = data_df.rename(columns = {'Unnamed: 1':'word 1', 'Unnamed: 2':'word 2','Unnamed: 3':'relationship', 'Unnamed: 4': 'label'})
#Adding columns for the syn and ant score labeling
data_df['syn score'] = np.nan
data_df['ant score'] = np.nan
data_df = data_df.fillna(0)
data_df = data_df[1:]

#finding which words are in the pd but not in vocabulary1
list1 = list(data_df['word 1'])
list2 = list(data_df['word 2'])
missing = list((set(list1).difference(vocabulary1))) + list((set(list2).difference(vocabulary1)))

#keeping only the rows in the pd that have words in vocabulary1
data_df = data_df[~data_df['word 1'].isin(missing)]
data_df = data_df[~data_df['word 2'].isin(missing)]

#reseting indeces after mask
data_df.reset_index(inplace = True)


In [6]:
data_df

Unnamed: 0,Carbon_SynAntList_Full_Refined,word 1,word 2,relationship,label,syn score,ant score
0,1.0,"[1.0246488, -5.6508703, -1.4263288, -3.1607409...","[0.3912109, -2.6639938, -0.4191871, -0.3595066...",syn,1,1.0,-1.0
1,2.0,"[1.0246488, -5.6508703, -1.4263288, -3.1607409...","[0.67807263, -0.0778522, 3.3564792, -1.8280518...",syn,1,1.0,-1.0
2,3.0,"[1.0246488, -5.6508703, -1.4263288, -3.1607409...","[-0.40175724, 0.66337395, -1.5072205, -1.73012...",syn,0,0.0,0.0
3,4.0,"[1.0246488, -5.6508703, -1.4263288, -3.1607409...","[2.6374276, -0.8799803, 1.9580756, -3.1686919,...",syn,0,0.0,0.0
4,7.0,"[1.0246488, -5.6508703, -1.4263288, -3.1607409...","[-1.5558529, 2.824446, -3.416154, -0.963536, 0...",syn,0,0.0,0.0
...,...,...,...,...,...,...,...
1595,1994.0,"[-0.2082266, 1.1971192, 1.1562068, -5.095038, ...","[-1.9820197, 0.99323726, 1.025493, -1.2684621,...",ant,0,0.0,0.0
1596,1995.0,"[-0.2082266, 1.1971192, 1.1562068, -5.095038, ...","[0.38347286, 0.7061271, -0.007815216, -1.64924...",ant,0,0.0,0.0
1597,1996.0,"[-0.2082266, 1.1971192, 1.1562068, -5.095038, ...","[-2.8429897, 1.9741825, -0.3001447, -5.303599,...",ant,0,0.0,0.0
1598,1997.0,"[-0.2082266, 1.1971192, 1.1562068, -5.095038, ...","[0.26439202, 0.44663662, -0.2789563, -3.575421...",ant,0,0.0,0.0


In [5]:
for i in range(len(data_df)): 
    data_df['word 1'].iloc[i] = model1.wv.__getitem__(str(data_df['word 1'].iloc[i]))
    data_df['word 2'].iloc[i] = model1.wv.__getitem__(str(data_df['word 2'].iloc[i]))
    
    if data_df['relationship'].iloc[i] == 'syn' and data_df['label'].iloc[i] == 1:
        data_df['syn score'].iloc[i] = 1
        data_df['ant score'].iloc[i] = -1
       
    elif data_df['relationship'].iloc[i] == 'ant' and data_df['label'].iloc[i] == 1:
        data_df['syn score'].iloc[i] = -1 
        data_df ['ant score'].iloc[i] = 1
        
    else:
        data_df['syn score'].iloc[i] = 0  
        data_df['ant score'].iloc[i] = 0
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
data_df.to_json('Phase_I_DATA.json')

In [8]:
data_df = pd.read_json('Phase_I_DATA.json', dtype = np.float32)

In [9]:
X = data_df[['word 1', 'word 2']]
Y = data_df[['syn score', 'ant score']]

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2, shuffle = True)

w1_train = x_train['word 1']
w1_test = x_test['word 1']
w2_train = x_train['word 2']
w2_test = x_test['word 2']
ss_train = y_train['syn score']
ss_test = y_test['syn score']
as_train = y_train['ant score']
as_test = y_test['ant score']

train_data = {'word 1': w1_train, 'word 2': w2_train, 'syn score': ss_train, 'ant score': as_train}
test_data = {'word 1': w1_test, 'word 2': w2_test, 'syn score': ss_test, 'ant score': as_test}
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)


In [10]:
train_df.to_json('Phase_I_Train.json')
test_df.to_json('Phase_I_Test.json')

In [18]:
data_test[['word 1','word 2']].values.astype(np.float32)

ValueError: setting an array element with a sequence.

In [11]:
train_df

Unnamed: 0,word 1,word 2,syn score,ant score
605,"[-1.0964479446, 1.8805480003000001, 7.23442029...","[-1.3677221537, -0.019086884300000002, 0.45717...",1.0,-1.0
961,"[-2.1888554096, 0.4787855446, -0.3963083327, -...","[-0.4821321964, -0.6129127145000001, -0.523134...",1.0,-1.0
1430,"[-4.5729045868, -6.443136692, 5.7944054604, -6...","[0.0011501729, 0.07786653190000001, 0.21224316...",0.0,0.0
64,"[-0.9819027185, 0.8721030354, -3.6479685307, -...","[0.1252089888, 0.0357500389, 0.1065470129, -0....",1.0,-1.0
446,"[1.9019999504, 6.5153040886, 0.9711658955, 1.0...","[-0.0920875221, 0.204548806, 0.353068590200000...",0.0,0.0
...,...,...,...,...
1327,"[0.18577489260000002, 5.0477662086, 0.08229728...","[2.3581871986, 2.4673128128, -0.7351281047, -5...",0.0,0.0
1263,"[-0.44115230440000003, 6.5217018127, -1.735963...","[-2.6894249916, 2.8726806641, -2.5519349575, -...",0.0,0.0
1420,"[-4.5729045868, -6.443136692, 5.7944054604, -6...","[-7.3409018517, -2.5313780308, 1.9750880003, -...",0.0,0.0
790,"[2.9429700375000003, 3.3651957512, -3.32590603...","[-4.2139658928, 3.4794118404, -2.6765217781, 0...",0.0,0.0


In [12]:
data_test = pd.read_json('Phase_I_Train.json', dtype = np.float32)
data_test.reset_index(inplace = True)

In [14]:
data_test1 = [[] for i in range(1280)]
for i in range(len(data_test1)):
    data_test1[i] = data_test[['word 1','word 2']].values[i][0],data_test[['word 1','word 2']].values[i][1]
data_test1 = np.array(data_test1)


array([[[-1.09644794e+00,  1.88054800e+00,  7.23442030e+00, ...,
         -1.78487390e-01,  3.47584724e+00,  4.74188423e+00],
        [-1.36772215e+00, -1.90868843e-02,  4.57179457e-01, ...,
         -3.98191623e-02,  1.12433410e+00,  2.05543622e-01]],

       [[-2.18885541e+00,  4.78785545e-01, -3.96308333e-01, ...,
          1.25908542e+00,  3.03345025e-01,  2.30789113e+00],
        [-4.82132196e-01, -6.12912715e-01, -5.23134172e-01, ...,
         -7.88991034e-01, -7.61382461e-01,  7.21247256e-01]],

       [[-4.57290459e+00, -6.44313669e+00,  5.79440546e+00, ...,
          3.90708256e+00, -2.60906339e+00,  8.76600265e+00],
        [ 1.15017290e-03,  7.78665319e-02,  2.12243169e-01, ...,
          2.03877643e-01,  1.32272854e-01,  7.95163587e-02]],

       ...,

       [[-4.57290459e+00, -6.44313669e+00,  5.79440546e+00, ...,
          3.90708256e+00, -2.60906339e+00,  8.76600265e+00],
        [-7.34090185e+00, -2.53137803e+00,  1.97508800e+00, ...,
          3.88094306e+00,  3.46689

In [None]:
#Hyper parameters
num_epochs = 100
batch_size = 50
learning_rate = 0.008

# Device configuration (GPU if available, otherwise CPU)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
class Phase_I_Train_Dataset(Dataset):
    
    def __init__(self):
        
        data = pd.read_json('Phase_I_Train.json', dtype = np.float32)
        self.len = data.shape[0]
        
        data_x = [[] for i in range(1280)] #creating empty lists to store our w1,w2 vectors in data_x and synscore,antscore in data_y
        data_y = [[] fro i in range(1280)] #these lists are filled with data, turned into np.arrays and then to tensors
        
        for i in range(len(data)):
            data_x[i] = data[['word 1','word 2']].values[i][0],data[['word 1','word 2']].values[i][1]
        
        data_x = np.array(data_x)
       
         for i in range(len(data)):
            data_y[i] = data_test[['syn score','ant score']].values[i][0],data[['syn score','ant score']].values[i][1]
        
        data_y = np.array(data_y)
            
        #split into x_data our features and y_data our targets
        self.x_data = torch.from_numpy(data_x)
        self.y_data = torch.from_numpy(data_y)
        
    def __len__(self):
        
        return self.len
    
    def __getitem__(self, index):
        
        return self.x_data, self.y_data

if __name__ == '__main__':
    dataset = Phase_I_Train_Dataset()

In [None]:
class Phase_I_Test_Dataset(Dataset):
    
    def __init__(self):
        
        data = pd.read_json('Phase_I_Test.json', dtype = np.float32)
        self.len = data.shape[0]
        
        data_x = [[] for i in range(1280)] #creating empty lists to store our w1,w2 vectors in data_x and synscore,antscore in data_y
        data_y = [[] fro i in range(1280)] #these lists are filled with data, turned into np.arrays and then to tensors
        
        for i in range(len(data)):
            data_x[i] = data[['word 1','word 2']].values[i][0],data[['word 1','word 2']].values[i][1]
        
        data_x = np.array(data_x)
       
         for i in range(len(data)):
            data_y[i] = data_test[['syn score','ant score']].values[i][0],data[['syn score','ant score']].values[i][1]
        
        data_y = np.array(data_y)
            
        #split into x_data our features and y_data our targets
        self.x_data = torch.from_numpy(data_x)
        self.y_data = torch.from_numpy(data_y)

      
    def __len__(self):
        
        return self.len
    
    def __getitem__(self, index):
        
        return self.x_data[index], self.y_data[index]

if __name__ == '__main__':
    dataset = Phase_I_Test_Dataset()

In [None]:
class Phase_I_NN(nn.Module):

    def __init__(self, in_dims, out_dims):
        super(SYN_TEST, self).__init__()
        
        #nn.Embedding.from_pretrained(weights) something to think about. weights = model1.wv.
        
        #hidden layers
        self.hidden_layer = nn.Linear(100, 32)
        self.hidden_layer1 = nn.Linear(32, 16)
        
        self.S_branch = nn.Sequential( #synonym subspace branch
        nn.Linear(in_dims,50) . #nn.Embedding should have vocab_size(1280),vector_size(50)
        nn.Dropout(0.2), #to limit overfitting
        nn.Linear(50,100), #expand
        nn.Linear(100,300),
        nn.Linear(300,100),
        nn.Linear(100,50)) #compress
        
        #nn.Softplus()
        
        self.A_branch = nn.Sequential(
        nn.Linear(in_dims, 50)
        nn.Dropout(0.2), #to limit overfitting
        nn.Linear(50,100), #expand
        nn.Linear(100,300),
        nn.Linear(300,100),
        nn.Linear(100,50)) #compress
    
   
    def forward(self, w1, x2):
       
        #pass through hidden layers
        w1 = self.hidden_layer(w1)
        w1 = self.hidden1_layer(w1)
        w2 = self.hidden_layer(w2)
        w2 = self.hidden_layer1(w2)
        
        #pass each embedded data through each branch to be situated in subspaces
        S1_out = self.S_branch(w1)
        S2_out = self.S_branch(w2)
        A1_out = self.A_branch(w1)
        A2_out = self.A_branch(w2)
        
        return S1_out, A1_out, S2_out, A2_out #the encoders in each subspace