In [15]:
# Uncomment this cell if it's your first time running spacy
#!pip install -U spacy
#!python -m spacy download en_core_web_sm
#!python -m spacy download zh_core_web_sm
#!python -m spacy download xx_sent_ud_sm

In [16]:
import pandas as pd
import numpy as np
import os
import jieba
import string 
import re 
from nltk.tokenize import MWETokenizer, word_tokenize
from tqdm import tqdm
import spacy

In [17]:
language_pairs = [
    'en-fi', 'en-zh', 'cs-en'
    , 'de-en', 'ru-en', 'zh-en'
]
translations = ['fi', 'zh', 'en']

In [18]:
corpus_fi = []
corpus_zh = []
corpus_en = []
for pair in language_pairs:
    languages = pair.split('-')
    for i in range(2):
        all_sentences = pd.read_csv(os.path.join('..', 'corpus',pair, 'scores.csv'))
        if i == 0:
            sentences = all_sentences['source']
        if i == 1:
            sentences = all_sentences['reference'].append(all_sentences['translation'], ignore_index=True)
              
        if languages[i]=='fi':
            corpus_fi.append(sentences)
        elif languages[i]=='zh':
            corpus_zh.append(sentences)
        elif languages[i]=='en':
            corpus_en.append(sentences)


In [None]:
corpus_fi = list(pd.concat(corpus_fi))
corpus_en = list(pd.concat(corpus_en))


In [22]:
def tokenization(corpus, tool, language = 'en', pos_to_remove = ['PUNCT','NUM'], ent_to_remove = ['PERSON','ORG'], stop_words_to_remove= False, lowercase = True):
    """
    tool: one of two strings - 'spacy' or 'NLTK'
    languages (string ISO code): supports 'en', 'fi' or 'zh'
    pos_to_remove (list): part-of-speech tag from spacy
    ent_to_remove (list): entities from spacy
    
    
    """
    tokenized_corpus = []
    if tool == 'spacy':
        if language == 'en':
            sc = spacy.load('en_core_web_sm')
            for doc in tqdm(sc.pipe(corpus, disable=["lemmatizer", "textcat", "custom"])):
                if stop_words_to_remove:
                    doc_list = [word.text for word in doc if word.pos_ not in pos_to_remove if word.ent_type_ not in ent_to_remove if word.is_stop]
                else: 
                    doc_list = [word.text for word in doc if word.pos_ not in pos_to_remove if word.ent_type_ not in ent_to_remove]

                if lowercase:
                    doc_list = [word.lower() for word in doc_list]

                tokenized_corpus.append(doc_list)
        if language == 'zh':
            sc = spacy.load('zh_core_web_sm')
            for doc in tqdm(sc.pipe(corpus, disable=["lemmatizer", "textcat", "custom"])):
                if stop_words_to_remove:
                    doc_list = [word.text for word in doc if word.pos_ not in ['PUNCT'] if word.is_stop]
                else: 
                    doc_list = [word.text for word in doc if word.pos_ not in ['PUNCT']]
                
                tokenized_corpus.append(doc_list)
        if language == 'fi':
            sc = spacy.load('xx_sent_ud_sm')
            for doc in tqdm(sc.pipe(corpus, disable=["lemmatizer", "textcat", "custom"])): 
                doc_list = [word.text for word in doc]
                
                if lowercase:
                    doc_list = [word.lower() for word in doc_list]

                tokenized_corpus.append(doc_list)
                
            
        
#    if tool == 'NLTK':
        
        
    return tokenized_corpus


### Finish Vocabulary

* Spacy:

In [23]:
# all as default ( with punctuation numbers, allll!)
tokenized_fi1 = tokenization(corpus_fi, tool = 'spacy', language = 'fi',lowercase = True)

13496it [00:07, 1813.50it/s]


In [25]:
def match_regex(tokenized_corpus, letters = True, letters_and_numbers = False):
    
    if letters:
        regex = r'[a-z]+'
    if letters_and_numbers:
        regex = r'([a-z]+|^\d+$)'
        
    new_tokenized_zh = []
    for sentence_list in tokenized_corpus:
        sentence_list2 = [word for word in sentence_list if re.search(regex, word)]
        new_tokenized_zh.append(sentence_list2)
        
    return new_tokenized_zh

In [26]:
# remove punctuation keeping the strings with letters
tokenized_fi2 = match_regex(tokenized_fi1, letters = True, letters_and_numbers = False)

In [28]:
# remove punctuation keeping the strings with letters or only numbers
tokenized_fi3 = match_regex(tokenized_fi1, letters = False, letters_and_numbers = True)

### English Vocabulary

* Spacy:

In [30]:
# all as default
spacy_tokenized_en1 = tokenization(corpus_en, tool = 'spacy', language = 'en', pos_to_remove = ['PUNCT','NUM'], ent_to_remove = ['PERSON','ORG'], stop_words_to_remove= False, lowercase = True)

172345it [13:43, 209.31it/s]


In [31]:
# not removing entities names
spacy_tokenized_en2 = tokenization(corpus_en, tool = 'spacy', language = 'en', pos_to_remove = ['PUNCT','NUM'], ent_to_remove = [], stop_words_to_remove= False, lowercase = True)

172345it [14:14, 201.77it/s]


In [32]:
# not removing entities names and numbers
spacy_tokenized_en3 = tokenization(corpus_en, tool = 'spacy', language = 'en', pos_to_remove = ['PUNCT'], ent_to_remove = [], stop_words_to_remove= False, lowercase = True)

172345it [13:17, 215.99it/s]


In [33]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords) #326
list(spacy_stopwords)[:20]

['been',
 'amount',
 'quite',
 'whereby',
 'really',
 'became',
 'besides',
 'no',
 'whereupon',
 'if',
 'seeming',
 'see',
 'it',
 'itself',
 'serious',
 'every',
 'too',
 'back',
 'then',
 'very']

### Chinese Vocabulary

* Jieba:

In [34]:
jieba_tokenized_zh = []
for sentence in corpus_zh:
    sentence_list = [word for word in jieba.cut(sentence)]
    jieba_tokenized_zh.append(sentence_list)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\berfi\AppData\Local\Temp\jieba.cache
Loading model cost 0.853 seconds.
Prefix dict has been built successfully.


In [35]:
def match_regex(tokenized_corpus, chinese = True, chinese_and_numbers = False):
    
    if chinese:
        regex = r'[\u4e00-\u9fff]+'
    if chinese_and_numbers:
        regex = r'([\u4e00-\u9fff]+|^\d+$)'
        
    new_tokenized_zh = []
    for sentence_list in tokenized_corpus:
        chinese = [word for word in sentence_list if re.search(regex, word)]
        new_tokenized_zh.append(chinese)
        
    return new_tokenized_zh

In [36]:
# to keep all words containing chinese characters
jieba_tokenized_zh2 = match_regex(jieba_tokenized_zh, chinese = True, chinese_and_numbers = False)

In [38]:
# to keep chinese characters and numbers
jieba_tokenized_zh3 = match_regex(jieba_tokenized_zh, chinese = False, chinese_and_numbers = True)

* Spacy:

In [40]:
spacy_tokenized_zh = tokenization(corpus_zh, tool = 'spacy', language = 'zh', stop_words_to_remove= False)

46861it [04:41, 166.70it/s]


In [42]:
spacy_stopwords = spacy.lang.zh.stop_words.STOP_WORDS
len(spacy_stopwords) #1891
list(spacy_stopwords)[:20]

['给',
 '何',
 '好的',
 '另',
 '逐渐',
 '哪怕',
 '是以',
 '［⑤ｂ］',
 'ｆ］',
 '保持',
 '使得',
 '一样',
 '这般',
 '该',
 '［③ｄ］',
 '∪φ∈',
 '促进',
 '串行',
 '^',
 '能够']

In [43]:
# to keep all words containing chinese characters
spacy_tokenized_zh2 = match_regex(spacy_tokenized_zh, chinese = True, chinese_and_numbers = False)

In [46]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec



In [47]:
model_en = Word2Vec(sentences=spacy_tokenized_en3, min_count=1)
model_en.save("word2vec_en.model")

model_en = Word2Vec.load("word2vec_en.model")
model_en.train(spacy_tokenized_en3, total_examples=1, epochs=1)

vector_en = model_en.wv['republican']  # get numpy vector of a word
sims_en = model_en.wv.most_similar('republican', topn=10)  # get other similar words
sims_en

[('pompeo', 0.7294467687606812),
 ('calvert', 0.7098347544670105),
 ('kansas', 0.7062785029411316),
 ('ken', 0.659417986869812),
 ('chairmen', 0.6563398838043213),
 ('mike', 0.6498099565505981),
 ('nominee', 0.645456075668335),
 ('democratic', 0.6383917331695557),
 ('candidate', 0.6352219581604004),
 ('vocal', 0.6339017748832703)]

In [48]:
model_fi = Word2Vec(sentences=tokenized_fi3, min_count=1)
model_fi.save("word2vec_fi.model")

model_fi = Word2Vec.load("word2vec_fi.model")
model_fi.train(tokenized_fi3, total_examples=1, epochs=1)

vector_fi = model_fi.wv['investoimaan']  # get numpy vector of a word
sims_fi = model_fi.wv.most_similar('investoimaan', topn=10)  # get other similar words
sims_fi

[('löydä', 0.9499693512916565),
 ('marcus', 0.9456921219825745),
 ('jatkoi', 0.9450461864471436),
 ('lanzmann', 0.94491046667099),
 ('käyttäjää', 0.9445856809616089),
 ('nations', 0.9436014890670776),
 ('eräänä', 0.9434420466423035),
 ('suositellaan', 0.9433552026748657),
 ('syyskuun', 0.9432809948921204),
 ('kommunistijohtaja', 0.9432252645492554)]

In [53]:
model_zh = Word2Vec(sentences=spacy_tokenized_zh3, min_count=1)
model_zh.save("word2vec_zh.model")

model_zh = Word2Vec.load("word2vec_zh.model")
model_zh.train(spacy_tokenized_zh3, total_examples=1, epochs=1)

vector_zh = model_zh.wv['中国']  # get numpy vector of a word
sims_zh = model_zh.wv.most_similar('中国', topn=10)  # get other similar words
sims_zh

[('中华', 0.6370081901550293),
 ('义诊', 0.6322122812271118),
 ('交流年', 0.627692699432373),
 ('多元性', 0.6226692795753479),
 ('内涵', 0.6203840970993042),
 ('传播者', 0.6175974607467651),
 ('遗产', 0.6144717335700989),
 ('重任', 0.6135461330413818),
 ('跟岗', 0.6088530421257019),
 ('文化', 0.5952016115188599)]