In [1]:
Input_sentence = "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. He has been the paramount leader of China, the most prominent political leader in the country, since 2012. The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolutionandlived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."

In [2]:
import nltk
nltk.download('punkt')
nltk.download("popular")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    | 

True

#1. Part-of-speech (POS) tagger

In [3]:
tokenized_word=nltk.word_tokenize(Input_sentence)
tokenized_word[:10]

['Xi',
 'Jinping',
 'is',
 'a',
 'Chinese',
 'politician',
 'who',
 'has',
 'served',
 'as']

In [4]:
tagged_word = nltk.pos_tag(tokenized_word)
tagged_word[:20]


[('Xi', 'NN'),
 ('Jinping', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('Chinese', 'JJ'),
 ('politician', 'NN'),
 ('who', 'WP'),
 ('has', 'VBZ'),
 ('served', 'VBN'),
 ('as', 'IN'),
 ('General', 'NNP'),
 ('Secretary', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Chinese', 'NNP'),
 ('Communist', 'NNP'),
 ('Party', 'NNP'),
 ('(', '('),
 ('CCP', 'NNP'),
 (')', ')')]

#2. Named entity recognizer (NER)

In [5]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()


In [6]:
doc = nlp(Input_sentence)
print("Named entity recognizer (NER):\n", [(X, X.ent_type_) for X in doc])


Named entity recognizer (NER):
 [(Xi, 'PERSON'), (Jinping, 'PERSON'), (is, ''), (a, ''), (Chinese, 'NORP'), (politician, ''), (who, ''), (has, ''), (served, ''), (as, ''), (General, ''), (Secretary, ''), (of, ''), (the, 'ORG'), (Chinese, 'ORG'), (Communist, 'ORG'), (Party, 'ORG'), ((, ''), (CCP, 'ORG'), (), ''), (and, ''), (Chairman, ''), (of, ''), (the, 'ORG'), (Central, 'ORG'), (Military, 'ORG'), (Commission, 'ORG'), ((, ''), (CMC, ''), (), ''), (since, ''), (2012, 'DATE'), (,, ''), (and, ''), (President, ''), (of, ''), (the, 'GPE'), (People, 'GPE'), ('s, 'GPE'), (Republic, 'GPE'), (of, 'GPE'), (China, 'GPE'), ((, ''), (PRC, 'GPE'), (), ''), (since, ''), (2013, 'DATE'), (., ''), (He, ''), (has, ''), (been, ''), (the, ''), (paramount, ''), (leader, ''), (of, ''), (China, 'GPE'), (,, ''), (the, ''), (most, ''), (prominent, ''), (political, ''), (leader, ''), (in, ''), (the, ''), (country, ''), (,, ''), (since, ''), (2012, 'DATE'), (., ''), (The, ''), (son, ''), (of, ''), (Chinese, 'NOR

#3. Co-reference resolution

In [None]:

!pip install spacy
# NeuralCoref is a pipeline extension for spaCy 2.1+ which annotates and resolves coreference clusters using a neural network
!pip install neuralcoref --no-binary neuralcoref
!python -m spacy download en



In [None]:
# Co-reference occurs when  two or more expressions  in a text refer to the same person  or thing;  they have the same referent,
!git clone https://github.com/huggingface/neuralcoref.git
!python -m spacy download en

%cd neuralcoref

!pip install -r requirements.txt
!pip install -e .

In [9]:
# Load  usual spacy English model
import spacy
nlp = spacy.load('en')

# Add neural coref to spacy's pipe
import neuralcoref
neuralcoref.add_to_pipe(nlp)

  return f(*args, **kwds)
  return f(*args, **kwds)
100%|██████████| 40155833/40155833 [00:00<00:00, 67551481.46B/s]


<spacy.lang.en.English at 0x7f0c67553ef0>

In [15]:
#  use NeuralCoref as manipulate a spacy document annotations.
doc = nlp(Input_sentence)

print(doc._.has_coref)
print(doc._.coref_clusters)

True
[Xi Jinping: [Xi Jinping, He, he, his, he], China: [China, China, the country]]




#4. Lemmatization

In [17]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = Input_sentence
punctuations="?:!.,;()"
sentence_words = nltk.word_tokenize(sentence)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
Xi                  Xi                  
Jinping             Jinping             
is                  is                  
a                   a                   
Chinese             Chinese             
politician          politician          
who                 who                 
has                 ha                  
served              served              
as                  a                   
General             General             
Secretary           Secretary           
of                  of                  
the                 the                 
Chinese             Chinese             
Communist           Communist           
Party               Party               
CCP                 CCP                 
and                 and                 
Chairman            Chairman            
of                  of                  
the                 the                 
Central             Central             
Military        

#5. Parsing 

In [12]:
from textblob import TextBlob


In [13]:
blob_sentence = TextBlob(Input_sentence)

In [14]:
blob_sentence.parse()

"Xi/NNP/B-NP/O Jinping/NNP/I-NP/O is/VBZ/B-VP/O a/DT/B-NP/O Chinese/JJ/I-NP/O politician/NN/I-NP/O who/WP/O/O has/VBZ/B-VP/O served/VBN/I-VP/O as/IN/B-PP/B-PNP General/NNP/B-NP/I-PNP Secretary/NNP/I-NP/I-PNP of/IN/B-PP/B-PNP the/DT/B-NP/I-PNP Chinese/JJ/I-NP/I-PNP Communist/NNP/I-NP/I-PNP Party/NNP/I-NP/I-PNP (/(/O/O CCP/NN/B-NP/O )/)/O/O and/CC/O/O Chairman/NNP/B-NP/O of/IN/B-PP/B-PNP the/DT/B-NP/I-PNP Central/NNP/I-NP/I-PNP Military/NNP/I-NP/I-PNP Commission/NNP/I-NP/I-PNP (/(/O/O CMC/NN/B-NP/O )/)/O/O since/IN/B-PP/O 2012/CD/O/O ,/,/O/O and/CC/O/O President/NNP/B-NP/O of/IN/B-PP/B-PNP the/DT/B-NP/I-PNP People/NNS/I-NP/I-PNP '/POS/O/O s/PRP/B-NP/O Republic/NNP/I-NP/O of/IN/B-PP/B-PNP China/NNP/B-NP/I-PNP (/(/O/O PRC/NNP/B-NP/O )/)/O/O since/IN/B-PP/O 2013/CD/O/O ././O/O\nHe/PRP/B-NP/O has/VBZ/B-VP/O been/VBN/I-VP/O the/DT/B-NP/O paramount/JJ/I-NP/O leader/NN/I-NP/O of/IN/B-PP/B-PNP China/NNP/B-NP/I-PNP ,/,/O/O the/DT/B-NP/O most/RBS/I-NP/O prominent/JJ/I-NP/O political/JJ/I-NP/O lead