In [1]:
import os
import sys
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

%load_ext autoreload
%matplotlib inline

module_path = os.path.abspath(os.path.join('./syn_ant_modules'))
if module_path not in sys.path:
    sys.path.append(module_path)
import model_dataset as dataset 
import model_functions_PhaseI as functions
import model_morphology as morpho
import model_testing_PhaseI as test 
import model_training_PhaseI as train

torch.manual_seed(28)

<torch._C.Generator at 0x1a2a1cae90>

In [2]:
word_pairs_df = pd.read_excel('/Users/wesleytatum/Desktop/post_doc/BETO/BETO2020/Ant_Syn_Scraping/data/new_syn_ant_list.xlsx')
word_pairs_df

Unnamed: 0.1,Unnamed: 0,word 1,word 2,label
0,0,infinite,bounded,2
1,1,infinite,brief,2
2,2,infinite,calculable,0
3,3,infinite,ceasing,2
4,4,infinite,ending,2
...,...,...,...,...
117245,117245,"(4S,7R,10S,13R,16S,22S,25S,31S,34S)-31-(4-amin...",ch-288,1
117246,117246,"4-[1-(4-aminophenyl)-2,2,2-trichloroethyl]aniline",dadt,1
117247,117247,1-azabicyclo[2.2.2]octan-3-yl 2-hydroxy-2-(4-i...,3-quinuclidinyl-4-iodobenzilate,1
117248,117248,(2S)-2-[[(2S)-2-[(2-aminoacetyl)amino]propanoy...,gad,1


In [3]:
word_pairs_df['label'].value_counts()

1    112118
0      4699
2       433
Name: label, dtype: int64

In [9]:
indices = pd.DataFrame(columns = ['index', 'word'])

index = 0

pbar = tqdm(total = len(word_pairs_df), position = 0)

for i in range(len(word_pairs_df)):
    
    word1 = word_pairs_df['word 1'].iloc[i]
    word2 = word_pairs_df['word 2'].iloc[i]
    
    if word1 not in indices['word']:
        indices.loc[index] = pd.Series({'index':index, 'word':word1})
        index+=1
    else:
        pass
    
    if word2 not in indices['word']:
        indices.loc[index] = pd.Series({'index':index, 'word':word2})
        index+=1
    else:
        pass
    
    pbar.update()

100%|██████████| 117250/117250 [47:04<00:00, 18.42it/s]

In [10]:
indices.to_json('/Users/wesleytatum/Desktop/post_doc/data/syn_ant_index.json')

100%|██████████| 117250/117250 [47:21<00:00, 18.42it/s]

In [4]:
indices = pd.read_json('/Users/wesleytatum/Desktop/post_doc/data/syn_ant_index.json')

In [5]:
#if new train-test-split is needed

%autoreload
X = word_pairs_df[['word 1', 'word 2']]
Y = word_pairs_df['label']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True)

train_save = '/Users/wesleytatum/Desktop/post_doc/data/nn_datasets/train.json'
test_save = '/Users/wesleytatum/Desktop/post_doc/data/nn_datasets/test.json'

train_set = dataset.DistillerDataset(x_train, y_train, indices, path = train_save)
test_set = dataset.DistillerDataset(x_test, y_test, indices, path = test_save)

100%|██████████| 23450/23450 [11:18<00:00, 34.73it/s]  

In [7]:
#if existing train-test-split is alright

train_path ='/Users/wesleytatum/Desktop/post_doc/data/nn_datasets/train.json'
test_path = '/Users/wesleytatum/Desktop/post_doc/data/nn_datasets/test.json'

wp = 'dummy_variable'
lbs = 'dummy_variable'

train_set = dataset.DistillerDataset(word_pairs = wp, labels = lbs,
                                 indices = indices, path = train_path)
test_set = dataset.DistillerDataset(word_pairs = wp, labels = lbs,
                                indices = indices, path = test_path)

ValueError: If using all scalar values, you must pass an index

In [6]:
#Hyper parameters
num_epochs = 25
batch_size = 100
learning_rate = 5e-5

# Device configuration (GPU if available, otherwise CPU)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

train_loader = DataLoader(train, batch_size = batch_size)
test_loader = DataLoader(test, batch_size = batch_size)

In [8]:
%autoreload

# Instantiate NN model. 'in_dims' = dimensions of embeddings in 'common'
model = morpho.Phase_I_NN(in_dims = 50, common = None).to(device)

#define the optimizer
optimizer = torch.optim.AdamW(params = model.parameters(),
                              lr = learning_rate,
                              amsgrad = False)

In [10]:
%autoreload

#empty list to hold loss per epoch
train_epoch_losses = []
syn_train_epoch_losses = []
ant_train_epoch_losses = []

test_epoch_losses = []
syn_test_epoch_losses = []
ant_test_epoch_losses = []

syn_test_epoch_accuracies = []
ant_test_epoch_accuracies = []

pbar = tqdm(total = num_epochs, position = 0)

for epoch in range(num_epochs):
    
    train_epoch_loss, syn_train_epoch_loss, ant_train_epoch_loss, Lm_train_epoch_loss = train.Phase_I_train_model(model = model, training_data_set = train_loader, optimizer = optimizer)
    
    train_epoch_losses.append(train_epoch_loss)
    syn_train_epoch_losses.append(syn_train_epoch_loss)
    ant_train_epoch_losses.append(ant_train_epoch_loss)
   
    test_epoch_loss, syn_test_epoch_loss, ant_test_epoch_loss, Lm_test_epoch_loss, syn_epoch_acc, ant_epoch_acc, syn_true, syn_predictions, ant_true, ant_predictions = test.Phase_I_eval_model(model = model, testing_data_set = test_loader, optimizer = optimizer)
    test_epoch_losses.append(test_epoch_loss)
    syn_test_epoch_losses.append(syn_test_epoch_loss)
    ant_test_epoch_losses.append(ant_test_epoch_loss)
    
    syn_test_epoch_accuracies.append(syn_epoch_acc)
    ant_test_epoch_accuracies.append(ant_epoch_acc)
    
    pbar.update()

  0%|          | 0/25 [00:00<?, ?it/s]

AttributeError: 'DistillerDataset' object has no attribute 'Phase_I_train_model'

In [None]:
fig, ax = plt.subplots(figsize = (8,6))

epochs = np.arange(1, (num_epochs+1), 1)

plt.plot(epochs, train_epoch_losses, c = 'k', label = 'training error')
plt.plot(epochs, test_epoch_losses, c = 'r', label = 'testing error')
plt.legend(loc = 'upper right')
plt.title("Total Training & Testing Error")
ax.set_xlabel('Epoch')
ax.set_ylabel('Total Custom Loss')
plt.show()

fig, ax = plt.subplots(figsize = (8,6))
plt.plot(epochs, syn_test_epoch_accuracies, c = 'k', label = 'syn accuracy')
plt.plot(epochs, ant_test_epoch_accuracies, c = 'r', label = 'ant accuracy')
plt.legend(loc = 'lower right')
plt.title("Phase I Labeling Accuracy")
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
plt.show()

In [None]:
#thresholding the predicted values to be compared to the labels
syn_predictions = list(map(lambda x:1 if x >= 0.8 else x, syn_predictions))
syn_predictions = list(map(lambda x:0 if -0.8 < x < 0.8 else x, syn_predictions))
syn_predictions = list(map(lambda x:-1 if x <= -0.8 else x, syn_predictions))

ant_predictions = list(map(lambda x:1 if x >= 0.8 else x, ant_predictions))
ant_predictions = list(map(lambda x:0 if -0.8 < x < 0.8 else x, ant_predictions))
ant_predictions = list(map(lambda x:-1 if x <= -0.8 else x, ant_predictions))


#synonymy confusion matrix
syn_matrix = confusion_matrix(syn_true, syn_predictions)

#antonymy confusion matrix
ant_matrix = confusion_matrix(ant_true, ant_predictions)

In [None]:
syn_matrix

In [None]:
ant_matrix

In [18]:
os.path.exists('/Users/wesleytatum/Desktop/post_doc/data/nn_datasets/')

True