In [1]:
#Associating a vector to each of the terms found in the carbon corpus
import numpy as np
import pandas as pd
import os
import random
import sklearn
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
import Levenshtein as lev



In [2]:
df = pd.read_excel('Carbon_SynAntList_Full_Refined_copy.xlsx', skip_rows=1)
df = df.rename(columns = {'Carbon_SynAntList_Full_Refined':'index', 'Unnamed: 1':'word 1', 'Unnamed: 2':'word 2','Unnamed: 3':'relationship', 'Unnamed: 4': 'label'})
df = df.fillna(0)

In [None]:
lev.distance('carbon','original')

In [None]:
df.insert()

In [17]:
#add Levenshtein distance, stem similarity, lemma similarity
stems_equal = []
lemmas_equal = []
lev_distance = []
for row in df.iterrows():
    word1 = row[1]['word 1']
    word2 = row[1]['word 2']
    distance = lev.distance(str(word1), str(word2))
    lev_distance.append(distance)
    
    word1_lemma = lemmatizer.lemmatize(str(word1))
    word2_lemma = lemmatizer.lemmatize(str(word2))
    lemmas_equal.append(word1_lemma==word2_lemma)
    
    word1_stem = stemmer.stem(str(word1))
    word2_stem = stemmer.stem(str(word2))
    stems_equal.append(word1_stem==word2_stem)

df['lev_distance'] = lev_distance
df['lemmas?'] = lemmas_equal
df['stems?'] = stems_equal

    

In [18]:
df2 = df.iloc[:2000]

In [19]:
df2

Unnamed: 0,index,word 1,word 2,relationship,label,lev_distance,lemmas?,stems?
0,0.0,0,1,2,0,1,False,False
1,0.0,carbon,original,ant,0,7,False,False
2,1.0,carbon,graphite,syn,1,7,False,False
3,2.0,carbon,soot,syn,1,5,False,False
4,3.0,carbon,imitate,syn,0,7,False,False
...,...,...,...,...,...,...,...,...
1995,1994.0,infinite,ending,ant,0,5,False,False
1996,1995.0,infinite,ephemeral,ant,0,9,False,False
1997,1996.0,infinite,finite,ant,0,2,False,False
1998,1997.0,infinite,intermittent,ant,0,7,False,False


In [None]:
#Recording each individual word in the pairs
words = []
for i in range(len(df)):
        if df.iloc[i,0] in words:
            continue
        else:
            words.append(df.iloc[i,0])
        
        if df.iloc[i,1] in words:
            continue
        else:
            words.append(df.iloc[i,1])

In [None]:
#Have list of list of words
def listoflist(lst):
    return[[word] for word in words]

In [None]:
words = listoflist(words)

In [None]:
#Training w2v model to get vectors for each word
model = Word2Vec(words, size=100, min_count=1, iter=30) #training model with token list from above
vocabulary = list(model.wv.vocab) #saving vocabulary as list for visualization


In [None]:
#Visualize the vector associated with a word in the vocabulary
model.wv.__getitem__('current')

In [None]:
print(vocabulary)

In [None]:
#creating the syn subspace
#sigmoidal activation
class syn_subspace(nn.Module):
    
    def __init__(self, in_reprs, out_reprs=60): #60 is the dimensionality of the syn subspace
        
        self.in_reprs = in_reprs #net knows that in_reprs is the value that is passed through me
        self.out_reprs = out_reprs 
    
        self.layers = []
        self.layers.append(nn.Linear(self.in_reprs, self.out_reprs))
        
        self.layers = nn.Sequential(*self.layers) #turn your layers list into an object you can pass tensors through
        
    def forward(self, x): #x is the tensor that we send through
        
        out = self.layers(x)
    
    return out
        
        
    

In [None]:
#creating the ant subspace
class ant_subspace(nn.Module):
    
    def __init__(self, in_reprs, out_reprs=60): #60 is the dimensionality of the ant subspace
        
        self.in_reprs = in_reprs #net knows that in_reprs is the value that is passed through it
        self.out_reprs = out_reprs 
    
        self.layers = []
        self.layers.append(nn.Linear(self.in_reprs, self.out_reprs))
        
        self.layers = nn.Sequential(*self.layers) #turn your layers list into an object you can pass tensors through
        
    def forward(self, x): #x is the tensor that we send through
        
        out = self.layers(x)
        
    return out



    

In [None]:
#creating master layer
class FullNet(nn.Module):

    def __init__(self, in_reprs, out_reprs)
    """
    Parameters:
    in_reprs(torch.tensor): the dimensionality of the concacenated input data from w2v, stem, lemma, Lev.
    out_reprs: the dimensionality of what we want for the whole NN
    
    """
        self.in_reprs = in_reprs #representation of w2v+ encodings
        self.out_reprs = out_reprs #representation that will be fed to create both subspaces
    
        self.layers = []
        self.layers.append(nn.Linear(self.in_reprs, self.out_reprs), nn.)
        
        
        
        
        