## English analogy

#### Download the spaCy English model

In [11]:
# One-time setup: uncomment the line below to download the small English spaCy
# pipeline that the following cells load.
# !python -m spacy download en_core_web_sm

#### Load the English model (via `spacy.load` and via direct module import)

In [4]:
import spacy

# Load the small English pipeline by its installed package name.
nlp_en = spacy.load("en_core_web_sm")

# Run the pipeline over a sample sentence.
doc = nlp_en("This is a sentence.")

# Pair every token's text with its coarse-grained part-of-speech tag and show the list.
tagged_tokens = [(token.text, token.pos_) for token in doc]
print(tagged_tokens)

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [5]:
import en_core_web_sm

# Alternative to spacy.load(...): import the model package itself and call its load().
nlp_en = en_core_web_sm.load()

# Run the pipeline over the same sample sentence.
doc = nlp_en("This is a sentence.")

# Pair every token's text with its coarse-grained part-of-speech tag and show the list.
tagged_tokens = [(token.text, token.pos_) for token in doc]
print(tagged_tokens)

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


## Chinese NLP

#### Install the spaCy Chinese model and supporting packages

In [10]:
# One-time setup: uncomment the lines below to download the small Chinese spaCy
# pipeline and install the packages used by later cells (jieba, stopwordsiso,
# matplotlib).
# NOTE(review): pkuseg is presumably the word-segmentation backend for the
# Chinese pipeline — confirm it is still required for the installed spaCy version.
# !python -m spacy download zh_core_web_sm
# !pip install -U pkuseg
# !pip install jieba
# !pip install stopwordsiso
# !pip install matplotlib

#### Load the Chinese model (via `spacy.load` and via direct module import)

In [6]:
import spacy

# Load the small Chinese pipeline by its installed package name.
nlp_zh = spacy.load("zh_core_web_sm")

# Run the pipeline over a sample (Simplified Chinese) sentence.
doc = nlp_zh("这是一个用于示例的句子。")

# Pair every token's text with its coarse-grained part-of-speech tag and show the list.
tagged_tokens = [(token.text, token.pos_) for token in doc]
print(tagged_tokens)

[('这是', 'VERB'), ('一个', 'ADV'), ('用于', 'VERB'), ('示例', 'NOUN'), ('的', 'PART'), ('句子', 'NOUN'), ('。', 'PUNCT')]


In [7]:
import zh_core_web_sm

# Alternative to spacy.load(...): import the model package itself and call its load().
nlp_zh = zh_core_web_sm.load()

# Run the pipeline over the same sample sentence.
doc = nlp_zh("这是一个用于示例的句子。")

# Pair every token's text with its coarse-grained part-of-speech tag and show the list.
tagged_tokens = [(token.text, token.pos_) for token in doc]
print(tagged_tokens)

[('这是', 'VERB'), ('一个', 'ADV'), ('用于', 'VERB'), ('示例', 'NOUN'), ('的', 'PART'), ('句子', 'NOUN'), ('。', 'PUNCT')]


In [8]:
from spacy import displacy

# Sample sentence (Traditional Chinese) analysed by the following cells.
sample_sentence = '這是一個中文的句子'

# Load the Chinese pipeline with its default components. Passing
# disable=["parser"] to spacy.load would skip dependency parsing; that is not
# done here because later cells read token.dep_ and render the dependency view.
nlp_zh = spacy.load('zh_core_web_sm')

# Parse the sentence; `doc` is reused by the cells below.
doc = nlp_zh(sample_sentence)

#### Linguistic Features

In [9]:
# Part-of-speech tagging and related token attributes.
# For every token, print one tuple holding: text, lemma, coarse POS tag,
# fine-grained tag, dependency label, orthographic shape, and the
# is_alpha / is_stop flags.
for token in doc:
    token_features = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(token_features)

('這', '', 'NOUN', 'NN', 'nsubj', 'x', True, False)
('是', '', 'VERB', 'VC', 'cop', 'x', True, True)
('一', '', 'NUM', 'CD', 'nummod', 'x', True, True)
('個', '', 'NUM', 'M', 'mark:clf', 'x', True, False)
('中文', '', 'NOUN', 'NN', 'nmod:assmod', 'xx', True, False)
('的', '', 'PART', 'DEG', 'case', 'x', True, True)
('句子', '', 'NOUN', 'NN', 'ROOT', 'xx', True, False)


In [10]:
## Output in different ways
# Format 1: one "text_TAG" pair per line.
for token in doc:
    print(f'{token.text}_{token.tag_}')

# Format 2: every "text/TAG" pair on a single line. Each pair is prefixed with
# a space, so the printed line starts with a space.
out = ''.join(' ' + token.text + '/' + token.tag_ for token in doc)
print(out)

這_NN
是_VC
一_CD
個_M
中文_NN
的_DEG
句子_NN
 這/NN 是/VC 一/CD 個/M 中文/NN 的/DEG 句子/NN


In [11]:
# Look up spaCy's description of the fine-grained tag 'VC' (one of the
# token.tag_ values printed for the sample sentence).
spacy.explain('VC')

'是 (copula)'

#### Visualizing Linguistic Features

In [12]:
# Visualize the parsed sample sentence; style="dep" selects displaCy's
# dependency-parse view.
displacy.render(doc, style="dep")

In [13]:
# Render the same dependency view again, this time with custom displaCy
# render options (compact layout, colours, font and arc spacing).
options = dict(
    compact=True,
    bg="#09a3d5",
    color="white",
    font="Source Sans Pro",
    distance=120,
)
displacy.render(doc, style="dep", options=options)

In [14]:
## longer paragraphs
# A news-style paragraph used to demonstrate parsing several clauses at once.
text_long = '''武漢肺炎全球肆虐，至今已有2906萬人確診、92萬染疫身亡，而流亡美國的中國大陸病毒學家閻麗夢，14日時開通了推特帳號，並公布一份長達26頁的科學論文，研究直指武肺病毒與自然人畜共通傳染病的病毒不同，並呼籲追查武漢P4實驗室及美國衛生研究院（NIH）之間的金流，引發討論。'''

# Split on the full-width comma to obtain clause-sized chunks.
# (A bare `len(text_long_list)` used to sit here; as a mid-cell expression its
# value was never displayed, so it has been removed.)
text_long_list = text_long.split("，")

# Print one clause per line.
for clause in text_long_list:
    print(clause)

武漢肺炎全球肆虐
至今已有2906萬人確診、92萬染疫身亡
而流亡美國的中國大陸病毒學家閻麗夢
14日時開通了推特帳號
並公布一份長達26頁的科學論文
研究直指武肺病毒與自然人畜共通傳染病的病毒不同
並呼籲追查武漢P4實驗室及美國衛生研究院（NIH）之間的金流
引發討論。


In [15]:
## parse the texts
# Feed every clause through the Chinese pipeline in one pass with nlp.pipe and
# collect the parsed results into a list.
doc2 = [parsed for parsed in nlp_zh.pipe(text_long_list)]

# Number of parsed clauses (displayed as the cell output).
len(doc2)

8

#### Chinese word segmentation module

In [16]:
import jieba
import jieba.posseg as pseg

# Sample Simplified Chinese sentence to segment and tag.
text = '飞机是今天晚上七点钟准时降落在北京首都国际机场的'

# pseg.cut yields items exposing the segmented word (.word) and its
# part-of-speech flag (.flag); print one "word flag" pair per line.
words = pseg.cut(text)
for tagged in words:
    print(f'{tagged.word} {tagged.flag}')

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/rq/73_t64xn2zxf07vggynh53800000gq/T/jieba.cache
Loading model cost 0.641 seconds.
Prefix dict has been built successfully.


飞机 n
是 v
今天 t
晚上 t
七点钟 m
准时 v
降落 v
在 p
北京 ns
首都国际机场 nt
的 uj


In [17]:
import stopwordsiso
from stopwordsiso import stopwords

# Chinese stop words. The collection returned by stopwords(...) is sorted so
# the list — and the file written from it in the next cell — has the same
# order on every run instead of depending on set iteration order.
chin_stop_words = sorted(stopwords(["zh"]))

In [18]:
# Write the stop words to a text file, one word per line.
# encoding="utf-8" is explicit because the words are Chinese: relying on the
# platform default encoding can raise UnicodeEncodeError or produce a file
# that other tools cannot read on systems whose locale is not UTF-8.
with open("chin_stop_words.txt", "wt", encoding="utf-8") as stop_words_file:
    for stop_word in chin_stop_words:
        print(stop_word, file=stop_words_file)

In [19]:
# Visualize the dependency parse of every parsed clause in doc2.
sentence_spans = [parsed for parsed in doc2]

# displaCy render options: compact layout, colours, font and arc spacing.
options = dict(
    compact=True,
    bg="#09a3d5",
    color="white",
    font="Source Sans Pro",
    distance=120,
)
displacy.render(sentence_spans, style="dep", options=options)

#### TF-IDF Preprocessing, Tokenization & Vocab Building on Chinese Text

In [21]:
import matplotlib
matplotlib.rcParams['font.sans-serif'] = 'Arial Unicode MS'
matplotlib.rcParams['axes.labelsize'] = '15'


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import jieba

pd.options.display.max_columns = 30
%matplotlib inline

texts = [
  '翠花买了浅蓝色的鱼',
  '翠花买了浅蓝橙色的鱼',
  '猫在商店吃了一条鱼',
  '翠花去了商店。翠花买了一只虫子。翠花看到一条鱼',
  '它对这个虫子喵喵叫了一声，它现在仍然在对这只虫子和这条鱼喵喵叫',
  '这只猫在鱼店里。这只猫是橙色的。这只猫正在对这条鱼喵喵叫。',
  '翠花是鱼'  
]

from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(tokenizer=jieba.lcut)

# .fit_transfer TOKENIZES and COUNTS
X = count_vectorizer.fit_transform(texts)

count_vectorizer.get_feature_names_out()

array(['。', '一只', '一声', '一条', '买', '了', '仍然', '去', '只', '叫', '吃', '和',
       '商店', '喵', '在', '它', '对', '店里', '是', '橙色', '正在', '浅蓝', '浅蓝色', '猫',
       '现在', '的', '看到', '翠花', '虫子', '这', '这个', '这条', '鱼', '，'],
      dtype=object)

In [22]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.2.1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /Users/shashanksahoo/miniconda3/envs/tools/lib/python3.9/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: lime, sentence-transformers, shap


## References

1. https://pypi.org/project/jieba/
2. https://spacy.io/models/zh/