<a href="https://colab.research.google.com/github/Nithin46/KDM_ICP2/blob/main/KDM2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Part of Speech (POS) Tagging**

In [1]:
import spacy

# Load the small English pipeline and run it over the sample passage.
sp = spacy.load('en_core_web_sm')
data = sp(
    "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. "
    "He has been the paramount leader of China, the most prominent political leader in the country, since 2012. "
    "The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."
)

# One row per token: surface text, coarse POS, fine-grained tag, and the
# human-readable description of that tag. Columns are left-aligned and padded
# to fixed widths; print's default single-space separator joins them.
for word in data:
    print(
        f'{word.text:12}',
        f'{word.pos_:10}',
        f'{word.tag_:8}',
        spacy.explain(word.tag_),
    )

Xi           PROPN      NNP      noun, proper singular
Jinping      PROPN      NNP      noun, proper singular
is           AUX        VBZ      verb, 3rd person singular present
a            DET        DT       determiner
Chinese      ADJ        JJ       adjective
politician   NOUN       NN       noun, singular or mass
who          PRON       WP       wh-pronoun, personal
has          AUX        VBZ      verb, 3rd person singular present
served       VERB       VBN      verb, past participle
as           SCONJ      IN       conjunction, subordinating or preposition
General      PROPN      NNP      noun, proper singular
Secretary    PROPN      NNP      noun, proper singular
of           ADP        IN       conjunction, subordinating or preposition
the          DET        DT       determiner
Chinese      PROPN      NNP      noun, proper singular
Communist    PROPN      NNP      noun, proper singular
Party        PROPN      NNP      noun, proper singular
(            PUNCT      -LRB-    le

# **Dividing into Tokens**

In [4]:
import gensim
from gensim import corpora
from pprint import pprint

text = (
    "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. "
    "He has been the paramount leader of China, the most prominent political leader in the country, since 2012. "
    "The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."
)

# corpora.Dictionary expects an iterable of documents, each document being a
# list of tokens. The passage is treated as ONE document made of its
# whitespace-separated tokens (punctuation stays attached, e.g. '2012,').
# Iterating `for sentence in text` would walk the string character by
# character and add one identical copy of the token list per character, which
# is wasteful and inflates the dictionary's document statistics.
tokens = [text.split()]
gensim_dictionary = corpora.Dictionary(tokens)

print("The dictionary has: " +str(len(gensim_dictionary)) + " tokens")
print(gensim_dictionary.token2id)

# Another way of printing: one "token  id" row per dictionary entry.
for k, v in gensim_dictionary.token2id.items():
  print(f'{k:{15}} {v:{10}}')

The dictionary has: 71 tokens
{'(CCP)': 0, '(CMC)': 1, '(PRC)': 2, '2012,': 3, '2012.': 4, '2013.': 5, 'CCP': 6, 'Central': 7, 'Chairman': 8, 'China': 9, 'China,': 10, 'Chinese': 11, 'Commission': 12, 'Communist': 13, 'County': 14, 'Cultural': 15, 'General': 16, 'He': 17, 'Jinping': 18, 'Liangjiahe,': 19, 'Military': 20, 'Party': 21, "People's": 22, 'President': 23, 'Republic': 24, 'Revolution': 25, 'Secretary': 26, 'The': 27, 'Xi': 28, 'Yanchuan': 29, 'Zhongxun,': 30, 'a': 31, 'and': 32, 'as': 33, 'been': 34, 'cave': 35, 'country,': 36, 'during': 37, 'exiled': 38, "father's": 39, 'following': 40, 'has': 41, 'he': 42, 'his': 43, 'in': 44, 'is': 45, 'joined': 46, 'leader': 47, 'lived': 48, 'most': 49, 'of': 50, 'paramount': 51, 'party': 52, 'political': 53, 'politician': 54, 'prominent': 55, 'purge': 56, 'rural': 57, 'secretary.': 58, 'served': 59, 'since': 60, 'son': 61, 'teenager': 62, 'the': 63, 'to': 64, 'veteran': 65, 'village': 66, 'was': 67, 'where': 68, 'who': 69, 'worked': 70}


# **Named entity recognizer (NER)**

In [2]:
import spacy
from spacy import displacy

# Parse the sample passage with the small English pipeline.
nlp = spacy.load('en_core_web_sm')
text = nlp(
    "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. "
    "He has been the paramount leader of China, the most prominent political leader in the country, since 2012. "
    "The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."
)

# Render the recognised entities inline in the notebook output.
displacy.render(
    text,
    style='ent',
    jupyter=True,
)

# **Lemmatization**

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the tokenizer model and the WordNet corpus used below.
nltk.download('punkt')
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()
data = (
    "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. "
    "He has been the paramount leader of China, the most prominent political leader in the country, since 2012. "
    "The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."
)
nltk_tokens = nltk.word_tokenize(data)

# Two 20-character columns: the token as written, then its lemma.
# Every token is lemmatized with pos="v", i.e. looked up as a verb.
for w in nltk_tokens:
    print(f"{w:20}{wordnet_lemmatizer.lemmatize(w, pos='v'):20}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Xi                  Xi                  
Jinping             Jinping             
is                  be                  
a                   a                   
Chinese             Chinese             
politician          politician          
who                 who                 
has                 have                
served              serve               
as                  as                  
General             General             
Secretary           Secretary           
of                  of                  
the                 the                 
Chinese             Chinese             
Communist           Communist           
Party               Party               
(                   (                   
CCP                 CCP                 
)       