# In [None]:
'''
Train sense embeddings from a disambiguated corpus using Word2vec.
'''
import spacy
# Load the spaCy model that you have installed.
# (Plain ASCII quotes are required; typographic quotes are a SyntaxError.)
model = spacy.load('en_core_web_md')
# Process a sentence with the pre-trained model. Calling the pipeline requires
# the text to process.
# NOTE(review): the original call passed no text; this placeholder sentence was
# chosen so that its 7th token is "homework" -- substitute the intended one.
sentence = "I forgot to hand in my homework yesterday."
embeddings = model(sentence)
# Extract a word-vector for the 7-th word homework.
embeddings[6].vector
# Get a sentence-vector as a mean of the individual word vectors.
embeddings.vector


#Define the definition getter function. It takes an adjective as a string and returns a list of
#lemmatized, POS-tagged bags of words (bows), one per distinct WordNet definition.

#Imports
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

# English stop-word list from NLTK; definition_getter uses it to drop stop words
sw = stopwords.words("english")

# WordNet lemmatizer instance shared by the definition-processing code
wnl = nltk.WordNetLemmatizer()

# POS codes the WordNet lemmatizer accepts; any other code falls back to the
# lemmatizer's default POS instead of relying on a caught exception.
_WORDNET_POS = frozenset("asrnv")


def _content_tokens(text, stop_words):
    """Tokenize and POS-tag ``text``, keeping alphabetic non-stop-word tokens.

    Stop words are matched case-insensitively so capitalized tokens such as
    "The" or "I" are dropped as well.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    return [
        (word, tag)
        for word, tag in tagged
        if word.isalpha() and word.lower() not in stop_words
    ]


def _lemmatize_token(word, tag):
    """Return ``(lemma, coarse_pos)`` for one ``(word, tag)`` pair.

    ``coarse_pos`` is the lower-cased first character of the tagger's tag.
    For lemmatization only, "j" is mapped to WordNet's "a"; the returned
    coarse POS keeps the tagger's letter.
    """
    coarse_pos = tag[0].lower()
    wordnet_pos = "a" if coarse_pos == "j" else coarse_pos
    lowered = word.lower()
    if wordnet_pos in _WORDNET_POS:
        lemma = wnl.lemmatize(lowered, pos=wordnet_pos)
    else:
        # POS the lemmatizer does not support: use its default POS.
        lemma = wnl.lemmatize(lowered)
    return lemma, coarse_pos


def definition_getter(adj):
    """Build one lemmatized, POS-tagged bag of words per adjective sense of ``adj``.

    For every WordNet adjective synset of ``adj``, the definition and the
    space-joined usage examples are tokenized and POS-tagged, stop words and
    non-alphabetic tokens are removed, and each remaining token is lemmatized.

    Args:
        adj: The adjective to look up, as a string.

    Returns:
        ``None`` when WordNet has no adjective synset for ``adj``. Otherwise a
        list of bags of words in synset order with duplicate bags removed;
        each bag is a sorted list of unique ``(lemma, pos)`` tuples, where
        ``pos`` is the lower-cased first letter of the tagger's POS tag.
    """
    synsets = wn.synsets(adj, pos="a")
    if not synsets:
        return None

    stop_words = set(sw)  # set membership instead of scanning the list per token
    bows = []
    seen = set()
    for synset in synsets:
        tokens = _content_tokens(synset.definition(), stop_words)
        # Examples are tagged as one joined string, separate from the definition.
        tokens += _content_tokens(" ".join(synset.examples()), stop_words)

        # The set removes duplicate words; sorting gives a canonical form so
        # identical bags from different synsets compare equal.
        bow = tuple(sorted({_lemmatize_token(word, tag) for word, tag in set(tokens)}))
        if bow not in seen:
            seen.add(bow)
            bows.append(list(bow))
    return bows
import gensim
import sys
import logging 
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
import re
import itertools
import os

class IterableChain:
    """Present several iterables as one iterable that can be traversed repeatedly.

    Every call to ``iter()`` builds a fresh chain over ``iterables``, so the
    object can be iterated more than once as long as ``iterables`` and each of
    its members can themselves be iterated more than once.
    """

    def __init__(self, iterables):
        # Stored as given; it is only iterated when __iter__ is called.
        self.iterables = iterables

    def __iter__(self):
        # from_iterable walks the outer iterable lazily instead of unpacking it
        # into call arguments up front.
        return itertools.chain.from_iterable(self.iterables)

if __name__ == '__main__':
    # Command line: <pretrained model path> <corpus file or directory> <output path>
    pretrained_model_path = sys.argv[1]
    in_path = sys.argv[2]
    if not os.path.isfile(in_path):
        # Not a regular file: walk it as a directory tree and chain together
        # every file whose name ends in ".txt.gz".
        iters = [
            LineSentence(os.path.join(dirpath, filename))
            for dirpath, _subdirs, filenames in os.walk(in_path)
            for filename in filenames
            if re.search(r'\.txt\.gz$', filename)
        ]
        sentences = IterableChain(iters)
    else:
        # A single corpus file is read directly.
        sentences = LineSentence(in_path)
    out_path = sys.argv[3]

    # so that gensim will print something nice to the standard output
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO,
    )

    # Continue training the saved model on the new corpus.
    model = Word2Vec.load(pretrained_model_path)
#     model.workers = 1 # for debugging
    model.min_count = 1
    # NOTE(review): sense_delimiter is passed through to build_vocab/train as
    # written; confirm the installed gensim build accepts this keyword.
    model.build_vocab(sentences, sense_delimiter='---', update=True)
    model.train(
        sentences,
        sense_delimiter='---',
        total_examples=model.corpus_count,
        epochs=10,
    )
    model.save(out_path)




# In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Print one line per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, shape, and the is_alpha / is_stop flags.
for token in doc:
    fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*fields)

# In [None]:

nlp = spacy.load("en_core_web_sm")
print("Pipeline:", nlp.pipe_names)

# Inspect the morphological features of the first token of a short sentence.
doc = nlp("I was reading the paper.")
token = doc[0]  # 'I'
morph = token.morph
print(morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
print(morph.get("PronType"))  # ['Prs']

# In [None]:
#Inputs: the two nouns and the adjective to compare them against
n1 = "coffee"
n2 = "woman"
adj = "hot"

#Load the large English pipeline (the small or medium one could be loaded instead)
nlp = spacy.load("en_core_web_lg")

#Parse each noun once so it can be compared with every definition
nlp1 = nlp(n1)
nlp2 = nlp(n2)

#d1 is the DAV for n1 and adj; d2 is the DAV for n2 and adj.
#Each DAV holds one similarity score per WordNet adjective definition of adj,
#computed between the noun and the parsed definition text.
definition_docs = [nlp(syn.definition()) for syn in wn.synsets(adj, pos="a")]
d1 = [nlp1.similarity(definition_doc) for definition_doc in definition_docs]
d2 = [nlp2.similarity(definition_doc) for definition_doc in definition_docs]