In [1]:
from gensim.models.fasttext import FastText 
from gensim.test.utils import datapath
import time
from gensim.test.utils import get_tmpfile
import logging
from gensim import utils
from itertools import chain
from gensim.utils import tokenize
import smart_open
import os

In [2]:
os.getcwd()

'D:\\german_legal_WE'

In [3]:
analogies = datapath(os.getcwd()+'\\german-legal-analogies.txt')

## open the corpus

In [4]:
#open the small legal corpus
class MyIterSmall(object):
    def __iter__(self):
        path = datapath(os.getcwd()+'\\German_legal_corpora\\cleaned-small-legal-corpus.txt')
        with smart_open.smart_open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))
#open the huge legal corpus
class MyIterHuge(object):
    def __iter__(self):
        path = datapath(os.getcwd()+'\\German_legal_corpora\\cleaned-huge-legal-corpus.txt')
        with smart_open.smart_open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))


## train_fasttext

In [5]:
def train_fasttext(small,window,size,sg):
    modelName='ft_s'+str(small)+'_w'+str(window)+'_d' +str(size)+'_sg' +str(sg)
    model = FastText(size=size, window=window, min_count=1,sg=sg,min_n=3,max_n=6,negative=5,word_ngrams=1)#?
    print("Reading corpus"+str(small)+"...")
    if(small==1):
        print("Reading the small corpus")
        model.build_vocab(sentences=MyIterSmall())
    else:  
        print("Reading the huge corpus")
        model.build_vocab(sentences=MyIterHuge())
        
    total_examples = model.corpus_count
    
    #train
    print("Model training: "+modelName)
    start = time.time()
    if(small==1):
        model.train(sentences=MyIterSmall(), total_examples=total_examples, epochs=5)
    else: model.train(sentences=MyIterHuge(), total_examples=total_examples, epochs=5)
        
    
    print("Model trained: "+modelName)
    print("Fasttext training time: "+str(time.time()-start))
    
    return modelName,model

## save the trained model

In [6]:
def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    #print(directory)
    if not os.path.exists(directory):
        print(file_path+" not existed, creating...")
        os.makedirs(directory)

In [12]:
def save_model(modelName,model):
    fnameS=os.getcwd()+"\\WEs\\gensim_fasttext\\"+modelName+"\\"+modelName+".model"
    ensure_dir(fnameS)
    fname = get_tmpfile(fnameS)
    print("Saving model to "+fnameS)
    
    model.save(fname)
    model.wv.save_word2vec_format(fnameS[:-5]+'vec',binary=False)
    print("Model saved: "+modelName)
    

    

## load the saved model

In [8]:
def load_model(small,window,size,sg):
    
    modelName='ft_s'+str(small)+'_w'+str(window)+'_d' +str(size)+'_sg' +str(sg)
    fnameS=os.getcwd()+"\\WEs\\gensim_fasttext\\"+modelName+"\\"+modelName+".model"
    fname = get_tmpfile(fnameS)
    print("Loading model from "+fnameS)
    model = FastText.load(fname)
    print("Model loaded: "+modelName)
    return modelName,model
    

## analogies evaluation

In [9]:
def setup_handler(modelName,topn):

    logName = "evalLogs\ "+ modelName +"_evalT"+str(topn)+".log"
    fhandler = logging.FileHandler(logName, 'w',encoding="UTF-8")
    
    # create formatter and add it to the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fhandler.setFormatter(formatter)
    return fhandler
    
def setup_logger(name, modelName,topn, level=logging.DEBUG):

    handler = setup_handler(modelName,topn) 
    
    logger = logging.getLogger(name +"_evalT"+str(topn))
    logger.setLevel(level)
    logger.addHandler(handler)

    return logger   


def evaluate_word_analogies2(self, modelName,topn,analogies, restrict_vocab=300000, case_insensitive=False, dummy4unknown=False):
        """Compute performance of the model on an analogy test set.

        This is modern variant of :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.accuracy`, see
        `discussion on GitHub #1935 <https://github.com/RaRe-Technologies/gensim/pull/1935>`_.

        The accuracy is reported (printed to log and returned as a score) for each section separately,
        plus there's one aggregate summary at the end.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.
        See also `Analogy (State of the art) <https://aclweb.org/aclwiki/Analogy_(State_of_the_art)>`_.

        Parameters
        ----------
        analogies : str
            Path to file, where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines.
            See `gensim/test/test_data/questions-words.txt` as example.
        restrict_vocab : int, optional
            Ignore all 4-tuples containing a word not in the first `restrict_vocab` words.
            This may be meaningful if you've sorted the model vocabulary by descending frequency (which is standard
            in modern word embedding models).
        case_insensitive : bool, optional
            If True - convert all words to their uppercase form before evaluating the performance.
            Useful to handle case-mismatch between training tokens and words in the test set.
            In case of multiple case variants of a single word, the vector for the first occurrence
            (also the most frequent if vocabulary is sorted) is taken.
        dummy4unknown : bool, optional
            If True - produce zero accuracies for 4-tuples with out-of-vocabulary words.
            Otherwise, these tuples are skipped entirely and not used in the evaluation.

        Returns
        -------
        score : float
            The overall evaluation score on the entire evaluation set
        sections : list of dict of {str : str or list of tuple of (str, str, str, str)}
            Results broken down by each section of the evaluation set. Each dict contains the name of the section
            under the key 'section', and lists of correctly and incorrectly predicted 4-tuples of words under the
            keys 'correct' and 'incorrect'.

        """
        print("evaluating T"+str(topn))
        print("d4u:"+str(dummy4unknown))
        if(dummy4unknown): modelName=modelName+"_d4u"
        #get default analogies score
        evalScoreT1=self.evaluate_word_analogies(analogies,dummy4unknown=dummy4unknown)[0]
        #get logger

        logger=setup_logger(modelName,modelName,topn)
        

        
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
        oov = 0
        logger.info("Evaluating word analogies for top %i words in the model on %s", restrict_vocab, analogies)
        sections, section = [], None
        quadruplets_no = 0

        for line_no, line in enumerate(utils.smart_open(analogies)):
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self._log_evaluate_word_analogies(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("Skipping invalid line #%i in %s", line_no, analogies)
                    continue
                quadruplets_no += 1
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip())
                        section['incorrect'].append((a, b, c, expected))
                    else:
                        logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
                    continue
                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                logger.info('Start predicting: %s + %s - %s = %s',b, c,a, expected)
                # find the most likely prediction using 3CosAdd (vector offset) method
                # TODO: implement 3CosMul and set-based methods for solving analogies
                #print("topn="+str(topn))
                sims = self.most_similar(positive=[b, c], negative=[a], topn=topn, restrict_vocab=restrict_vocab)
                #print(a,b,c,sims)
                self.vocab = original_vocab
                predicted10=0
                topN=1
                for element in sims:
                    predicted = element[0].upper() if case_insensitive else element[0]
                    sim=element[1]
                    #print("predicted "+predicted)
                    #print("expected "+expected)
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s,sim %s,top %i", line.strip(), expected, predicted,sim,topN)
                            topN+=1
                        #break
                    if predicted == expected:
                        logger.info('Expected word found: %s + %s - %s = %s, sim %s, top %i',b, c,a, expected,sim,topN)
                        #print("!!!")
                        section['correct'].append((a, b, c, expected,predicted,sim,topN))
                        predicted10=1
                        break
                if predicted10==0:
                    section['incorrect'].append((a, b, c, expected,predicted))
        if section:
            # store the last section, too
            sections.append(section)
            self._log_evaluate_word_analogies(section)

        total = {
            'section': 'Total accuracy',
            'correct': list(chain.from_iterable(s['correct'] for s in sections)),
            'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
        }
        oov_ratio = float(oov) / quadruplets_no * 100
        logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
        print('Quadruplets with out-of-vocabulary words: ', oov_ratio)
        if not dummy4unknown:
            logger.info(
                'NB: analogies containing OOV words were skipped from evaluation! '
                'To change this behavior, use "dummy4unknown=True"'
            )
        analogies_score = self._log_evaluate_word_analogies(total)
        sections.append(total)
        mean_sim,mean_top=correctWord_sim(total)
        logger.info('evalScoreT1: %.4f',evalScoreT1)
        logger.info('evalScoreT%i: %.4f',topn, analogies_score)
        logger.info( 'mean_sim: %.4f', mean_sim)
        logger.info('mean_top: %.4f',mean_top)
        print('evalScoreT1:',evalScoreT1)
        print('evalScoreT'+str(topn)+": "+str(analogies_score))
        print( 'mean_sim: ', mean_sim)
        print('mean_top: ',mean_top)
        # Return the overall score and the full lists of correct and incorrect analogies
        logging.shutdown()
        print("evaluating done.")
        
        return evalScoreT1,mean_sim,mean_top, analogies_score, sections
    
def correctWord_sim(total):
    corrects=total['correct']
    sim=[]
    top=[]
    for c in corrects:
        sim.append(c[5])
        top.append(c[6])
    #print(sim)
    if len(sim)!=0: mean_sim=sum(sim)/len(sim)
    else: mean_sim=0
    if len(top)!=0: mean_top=sum(top)/len(top)
    else: mean_top=0
    return mean_sim,mean_top

# workflow1: train fasttext models and evaluate

In [10]:
def ft_train_eval(small,window,size,sg,analogies):
    #train
    modelName,model=train_fasttext(small,window,size,sg)
    #save trained model
    save_model(modelName,model)
    #eval
    evaluate_word_analogies2(model.wv, modelName,3,analogies)
    evaluate_word_analogies2(model.wv, modelName,5,analogies)
    evaluate_word_analogies2(model.wv, modelName,10,analogies)
    
    

# workflow2: load trained fasttext models and evaluate

In [11]:
def ft_load_eval(small,window,size,sg,analogies,d4u=False):    
    #load trained model
    modelName,model=load_model(small,window,size,sg)
    #eval
    if not (d4u):
        evaluate_word_analogies2(model.wv, modelName,3,analogies)
        evaluate_word_analogies2(model.wv, modelName,5,analogies)
        evaluate_word_analogies2(model.wv, modelName,10,analogies)
    else:
        evaluate_word_analogies2(model.wv, modelName+"_d4u",3,analogies,dummy4unknown=True)
        evaluate_word_analogies2(model.wv, modelName+"_d4u",5,analogies,dummy4unknown=True)
        evaluate_word_analogies2(model.wv, modelName+"_d4u",10,analogies,dummy4unknown=True)

# training and evaluating models

## ft_s1_w3_d100_sg0 
(small=1,window=3,size=100,sg=0,
min_count=1,epochs=5,min_n=3,max_n=6,negative=5,word_ngrams=1)

In [25]:
ft_train_eval(1,3,100,0,analogies)

Reading corpus1...
Reading the small corpus
Model training: ft_s1_w3_d100_sg0
Model trained: ft_s1_w3_d100_sg0
Fasttext training time: 614.4952123165131
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d100_sg0\ft_s1_w3_d100_sg0.model
Model saved: ft_s1_w3_d100_sg0


In [12]:
ft_load_eval(1,3,100,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d100_sg0\ft_s1_w3_d100_sg0.model
Model loaded: ft_s1_w3_d100_sg0


In [27]:
ft_train_eval(1,3,100,1,analogies)

Reading corpus1...
Reading the small corpus
Model training: ft_s1_w3_d100_sg1
Model trained: ft_s1_w3_d100_sg1
Fasttext training time: 867.0004193782806
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d100_sg1\ft_s1_w3_d100_sg1.model
Model saved: ft_s1_w3_d100_sg1


In [15]:
ft_load_eval(1,3,100,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d100_sg1\ft_s1_w3_d100_sg1.model
Model loaded: ft_s1_w3_d100_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [43]:
ft_train_eval(1,3,300,0,analogies)

Reading corpus1...
Reading the small corpus
Model training: ft_s1_w3_d300_sg0
Model trained: ft_s1_w3_d300_sg0
Fasttext training time: 1921.7276992797852
C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d300_sg0\ft_s1_w3_d300_sg0.model not existed, creating...
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d300_sg0\ft_s1_w3_d300_sg0.model
Model saved: ft_s1_w3_d300_sg0


In [16]:
ft_load_eval(1,3,300,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d300_sg0\ft_s1_w3_d300_sg0.model
Model loaded: ft_s1_w3_d300_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [17]:
ft_load_eval(1,3,300,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w3_d300_sg1\ft_s1_w3_d300_sg1.model
Model loaded: ft_s1_w3_d300_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(1,5,100,0,analogies)

In [18]:
ft_load_eval(1,5,100,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d100_sg0\ft_s1_w5_d100_sg0.model
Model loaded: ft_s1_w5_d100_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(1,5,100,1,analogies)

In [19]:
ft_load_eval(1,5,100,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d100_sg1\ft_s1_w5_d100_sg1.model
Model loaded: ft_s1_w5_d100_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(1,5,300,0,analogies)

In [20]:
ft_load_eval(1,5,300,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d300_sg0\ft_s1_w5_d300_sg0.model
Model loaded: ft_s1_w5_d300_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [61]:
ft_train_eval(1,5,300,1,analogies)

Reading corpus1...
Reading the small corpus
Model training: ft_s1_w5_d300_sg1
Model trained: ft_s1_w5_d300_sg1
Fasttext training time: 1900.5142829418182
C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d300_sg1\ft_s1_w5_d300_sg1.model not existed, creating...
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d300_sg1\ft_s1_w5_d300_sg1.model
Model saved: ft_s1_w5_d300_sg1


In [21]:
ft_load_eval(1,5,300,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d300_sg1\ft_s1_w5_d300_sg1.model
Model loaded: ft_s1_w5_d300_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(0,3,100,0,analogies)

In [22]:
ft_load_eval(0,3,100,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w3_d100_sg0\ft_s0_w3_d100_sg0.model
Model loaded: ft_s0_w3_d100_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(0,3,100,1,analogies)

In [23]:
ft_load_eval(0,3,100,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w3_d100_sg1\ft_s0_w3_d100_sg1.model
Model loaded: ft_s0_w3_d100_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(0,3,300,0,analogies)

In [24]:
ft_load_eval(0,3,300,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w3_d300_sg0\ft_s0_w3_d300_sg0.model
Model loaded: ft_s0_w3_d300_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [None]:
#ft_train_eval(0,3,300,1,analogies)

In [25]:
ft_load_eval(0,3,300,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w3_d300_sg1\ft_s0_w3_d300_sg1.model
Model loaded: ft_s0_w3_d300_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [62]:
ft_train_eval(0,5,100,0,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w5_d100_sg0
Model trained: ft_s0_w5_d100_sg0
Fasttext training time: 6163.8315098285675
C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg0\ft_s0_w5_d100_sg0.model not existed, creating...
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg0\ft_s0_w5_d100_sg0.model
Model saved: ft_s0_w5_d100_sg0


In [26]:
ft_load_eval(0,5,100,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg0\ft_s0_w5_d100_sg0.model
Model loaded: ft_s0_w5_d100_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [63]:
ft_train_eval(0,5,100,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w5_d100_sg1
Model trained: ft_s0_w5_d100_sg1
Fasttext training time: 7360.7729778289795
C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg1\ft_s0_w5_d100_sg1.model not existed, creating...
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg1\ft_s0_w5_d100_sg1.model
Model saved: ft_s0_w5_d100_sg1


In [27]:
#ft_load_eval(0,5,100,1,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg1\ft_s0_w5_d100_sg1.model
Model loaded: ft_s0_w5_d100_sg1
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [46]:
ft_load_eval(0,5,100,1,analogies,d4u=True)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg1\ft_s0_w5_d100_sg1.model
Model loaded: ft_s0_w5_d100_sg1
evaluating T3
d4u:True
evaluating done.
evaluating T5
d4u:True
evaluating done.
evaluating T10
d4u:True
evaluating done.


In [64]:
ft_train_eval(0,5,300,0,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w5_d300_sg0
Model trained: ft_s0_w5_d300_sg0
Fasttext training time: 15975.812193632126
C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d300_sg0\ft_s0_w5_d300_sg0.model not existed, creating...
Saving model to C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d300_sg0\ft_s0_w5_d300_sg0.model
Model saved: ft_s0_w5_d300_sg0


In [28]:
ft_load_eval(0,5,300,0,analogies)

Loading model from C:\Users\KlaraRuanQian\Desktop\BA\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d300_sg0\ft_s0_w5_d300_sg0.model
Model loaded: ft_s0_w5_d300_sg0
evaluating T3
evaluating done.
evaluating T5
evaluating done.
evaluating T10
evaluating done.


In [12]:
ft_train_eval(0,10,100,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w10_d100_sg1
Model trained: ft_s0_w10_d100_sg1
Fasttext training time: 4194.701753616333
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w10_d100_sg1\ft_s0_w10_d100_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w10_d100_sg1\ft_s0_w10_d100_sg1.model
Model saved: ft_s0_w10_d100_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.4523809523809524
evalScoreT3: 0.6190476190476191
mean_sim:  0.861471597964947
mean_top:  1.4230769230769231
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.4523809523809524
evalScoreT5: 0.6666666666666666
mean_sim:  0.8551896320922034
mean_top:  1.6785714285714286
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.4523809523809524
evalScoreT10: 0.7619047619047619
me

In [13]:
ft_train_eval(0,15,100,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w15_d100_sg1
Model trained: ft_s0_w15_d100_sg1
Fasttext training time: 6451.076989173889
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w15_d100_sg1\ft_s0_w15_d100_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w15_d100_sg1\ft_s0_w15_d100_sg1.model
Model saved: ft_s0_w15_d100_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.2857142857142857
evalScoreT3: 0.5952380952380952
mean_sim:  0.8640835785865784
mean_top:  1.84
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.2857142857142857
evalScoreT5: 0.6666666666666666
mean_sim:  0.8605573028326035
mean_top:  2.142857142857143
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.2857142857142857
evalScoreT10: 0.7619047619047619
mean_sim:  0.853

In [14]:
ft_train_eval(0,20,100,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w20_d100_sg1
Model trained: ft_s0_w20_d100_sg1
Fasttext training time: 5944.9982697963715
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w20_d100_sg1\ft_s0_w20_d100_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w20_d100_sg1\ft_s0_w20_d100_sg1.model
Model saved: ft_s0_w20_d100_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.23809523809523808
evalScoreT3: 0.5714285714285714
mean_sim:  0.8656863744060198
mean_top:  2.0
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.23809523809523808
evalScoreT5: 0.6190476190476191
mean_sim:  0.8631909581331106
mean_top:  2.1923076923076925
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.23809523809523808
evalScoreT10: 0.6904761904761905
mean_sim:  0

In [15]:
ft_train_eval(0,25,100,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w25_d100_sg1
Model trained: ft_s0_w25_d100_sg1
Fasttext training time: 4911.504615783691
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w25_d100_sg1\ft_s0_w25_d100_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w25_d100_sg1\ft_s0_w25_d100_sg1.model
Model saved: ft_s0_w25_d100_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.14285714285714285
evalScoreT3: 0.4523809523809524
mean_sim:  0.8722635068391499
mean_top:  1.9473684210526316
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.14285714285714285
evalScoreT5: 0.5476190476190477
mean_sim:  0.8668414457984592
mean_top:  2.391304347826087
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.14285714285714285
evalScoreT10: 0.6666666666666666

In [16]:
ft_train_eval(0,30,100,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w30_d100_sg1
Model trained: ft_s0_w30_d100_sg1
Fasttext training time: 5338.544718980789
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w30_d100_sg1\ft_s0_w30_d100_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w30_d100_sg1\ft_s0_w30_d100_sg1.model
Model saved: ft_s0_w30_d100_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.09523809523809523
evalScoreT3: 0.42857142857142855
mean_sim:  0.8678166667620341
mean_top:  2.2777777777777777
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.09523809523809523
evalScoreT5: 0.5238095238095238
mean_sim:  0.8620118959383531
mean_top:  2.6363636363636362
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.09523809523809523
evalScoreT10: 0.61904761904761

In [13]:
ft_train_eval(0,5,400,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w5_d400_sg1
Model trained: ft_s0_w5_d400_sg1
Fasttext training time: 6355.152570486069
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d400_sg1\ft_s0_w5_d400_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d400_sg1\ft_s0_w5_d400_sg1.model
Model saved: ft_s0_w5_d400_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.35714285714285715
evalScoreT3: 0.4523809523809524
mean_sim:  0.7688824503045333
mean_top:  1.3157894736842106
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.35714285714285715
evalScoreT5: 0.5238095238095238
mean_sim:  0.7559415968981656
mean_top:  1.7727272727272727
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.35714285714285715
evalScoreT10: 0.5476190476190477
mean_

In [14]:
ft_train_eval(0,5,500,1,analogies)

Reading corpus0...
Reading the huge corpus
Model training: ft_s0_w5_d500_sg1
Model trained: ft_s0_w5_d500_sg1
Fasttext training time: 5914.944427728653
D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d500_sg1\ft_s0_w5_d500_sg1.model not existed, creating...
Saving model to D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d500_sg1\ft_s0_w5_d500_sg1.model
Model saved: ft_s0_w5_d500_sg1
evaluating T3
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.2619047619047619
evalScoreT3: 0.4523809523809524
mean_sim:  0.7557979884900545
mean_top:  1.5263157894736843
evaluating done.
evaluating T5
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.2619047619047619
evalScoreT5: 0.5
mean_sim:  0.7508777550288609
mean_top:  1.7619047619047619
evaluating done.
evaluating T10
d4u:False
Quadruplets with out-of-vocabulary words:  60.747663551401864
evalScoreT1: 0.2619047619047619
evalScoreT10: 0.5476190476190477
mean_sim:  0.7418074659

## check vocabulary size and thevector diemsion

In [15]:
modelName,model=load_model(0,5,100,1)

Loading model from D:\german_legal_WE\WEs\gensim_fasttext\ft_s0_w5_d100_sg1\ft_s0_w5_d100_sg1.model
Model loaded: ft_s0_w5_d100_sg1


In [15]:
print("Model: "+modelName)
print("Vocabulary size", len(list(model.wv.vocab.keys())))
print("Word vector length:", len(model.wv["Mann"]))

Model: ft_s0_w5_d100_sg1
Vocabulary size 695675
Word vector length: 100


In [16]:
def print_basemodel_info(modelPath, model):
    print("###modelPath: ", modelPath)
    print("window: ",model.window)
    print("embed size: ",len(model.wv["mann"]))
    print("sg: "  , model.sg)
    print("epochs: ", model.epochs)
    print("min_count: ",model.vocabulary.min_count)
    print("negative sampling: ", model.negative)
    print("min_n:",model.min_n)
    print("max_n:",model.max_n)
    print("Vocabulary sie: ",len(model.wv.vectors_vocab))
  

In [17]:
print_basemodel_info(modelName,model)

###modelPath:  ft_s0_w5_d100_sg1
window:  5
embed size:  100
sg:  1
epochs:  5
min_count:  1
negative sampling:  5
min_n: 3
max_n: 6
Vocabulary sie:  695675


  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [21]:
modelName,model=load_model(1,5,100,1)

Loading model from D:\german_legal_WE\WEs\gensim_fasttext\ft_s1_w5_d100_sg1\ft_s1_w5_d100_sg1.model
Model loaded: ft_s1_w5_d100_sg1


In [22]:
print_basemodel_info(modelName,model)

###modelPath:  ft_s1_w5_d100_sg1
window:  5
embed size:  100
sg:  1
epochs:  5
min_count:  1
negative sampling:  5
min_n: 3
max_n: 6
Vocabulary sie:  282731


  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [20]:
#query: Vermieter Mieter Verkäufer Käufer
model.wv.most_similar(positive=["Mieter", "Verkäufer"], negative=["Vermieter"], topn=10)

[('Sexverkäufer', 0.8573232889175415),
 ('Käufer', 0.8427373170852661),
 ('Systemverkäufer', 0.8417536020278931),
 ('Autoverkäufer', 0.8347867727279663),
 ('Endkäufer', 0.8296511769294739),
 ('Endverkäufer', 0.8273990154266357),
 ('Hofverkäufern', 0.826798141002655),
 ('Einkäufer', 0.8267784118652344),
 ('Verkäufern', 0.8264048099517822),
 ('Weiterverkäufer', 0.8252648115158081)]

In [None]:
#query: Vermieter Mieter Verkäufer Käufer
model.most_similar(positive=["Käufer", "Verkäufer"], negative=["Vermieter"], topn=10)

In [None]:
model.most_similar(positive=["Mieter", "Verkäufer"], negative=["Vermieter"], topn=10)

In [None]:
model.most_similar(positive=["Mieter", "Verkäufer"], negative=["Vermieter"], topn=10)

In [None]:
model.most_similar(positive=["Mieter", "Verkäufer"], negative=["Vermieter"], topn=10)

In [17]:
print("Model: "+modelName)
print("Vocabulary size", len(list(model.wv.vocab.keys())))
print("Word vector length:", len(model.wv["Mann"]))

Model: ft_s1_w5_d100_sg1
Vocabulary size 282731
Word vector length: 100


In [18]:
type(model)

gensim.models.fasttext.FastText

In [19]:
type(model.wv)

gensim.models.keyedvectors.FastTextKeyedVectors

In [1]:
def check_vsd(modelName,model):
    print("Vocabulary size", len(list(model.wv.vocab.keys())))
    print("Word vector length:", len(model.wv["Mann"]))

## continue training prtrained models on legal corpus:

In [None]:
pt_ft_d300_ky = FastText.load_fasttext_format(modelPath4,full_model=True)