In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk
from nltk.tree import Tree

import nltk
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [2]:
# Fetch the NLTK resources used by the cells below, in order:
# tokenizer models, POS tagger, named-entity chunker and its word list.
for resource_name in ('punkt',
                      'averaged_perceptron_tagger',
                      'maxent_ne_chunker',
                      'words'):
    nltk.download(resource_name)

# Stop-word lists; kept as the final expression so the cell still displays
# the value returned by this last download call.
nltk.download('stopwords')
# set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     D:\Utilisateurs\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     D:\Utilisateurs\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     D:\Utilisateurs\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     D:\Utilisateurs\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Utilisateurs\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Sample input shared by the NLTK and spaCy cells below.
# The commented-out assignment is an alternative French sample sentence, kept for
# reference; the English sentence is the one actually used.
# text = "Ceci est un exemple de texte destiné à tester les différentes librairies Python et à préparer une démonstration. Ce texte a été préparé par Gabriel pour LinkyStat."
text = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [4]:
# Show the raw sample sentence before any processing.
print(text)

European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices


In [5]:
# Tokenisation: split the sample sentence into a list of tokens
# (the cell's last expression, so the list is displayed).

word_tokenize(text)

['European',
 'authorities',
 'fined',
 'Google',
 'a',
 'record',
 '$',
 '5.1',
 'billion',
 'on',
 'Wednesday',
 'for',
 'abusing',
 'its',
 'power',
 'in',
 'the',
 'mobile',
 'phone',
 'market',
 'and',
 'ordered',
 'the',
 'company',
 'to',
 'alter',
 'its',
 'practices']

In [6]:
# Stop words
# Drop English stop words from the tokenized sample sentence.
stop_words = set(stopwords.words('english'))

tokenized = nltk.word_tokenize(text)
# NLTK's English stop-word list is lowercase, so compare on the lowercased
# token; otherwise capitalised stop words (e.g. a sentence-initial "The")
# would slip through. The original token casing is kept in the result.
filtered_sentence = [w for w in tokenized if w.lower() not in stop_words]

print(filtered_sentence)

['European', 'authorities', 'fined', 'Google', 'record', '$', '5.1', 'billion', 'Wednesday', 'abusing', 'power', 'mobile', 'phone', 'market', 'ordered', 'company', 'alter', 'practices']


In [7]:
# Stemming
# Print each filtered token next to its Porter and Lancaster stems,
# one row per token in three 20-character-wide columns.
porter, lancaster = PorterStemmer(), LancasterStemmer()

for word in filtered_sentence:
    # Porter first, then Lancaster, matching the column order.
    stems = [stemmer.stem(word) for stemmer in (porter, lancaster)]
    print("{0:20}{1:20}{2:20}".format(word, *stems))

European            european            europ               
authorities         author              auth                
fined               fine                fin                 
Google              googl               googl               
record              record              record              
$                   $                   $                   
5.1                 5.1                 5.1                 
billion             billion             bil                 
Wednesday           wednesday           wednesday           
abusing             abus                abus                
power               power               pow                 
mobile              mobil               mobl                
phone               phone               phon                
market              market              market              
ordered             order               ord                 
company             compani             company             
alter               alte

In [8]:
# POS tags
#
# Legend for the tags that show up most often (Penn Treebank style):
#   CC:   coordinating conjunction
#   CD:   cardinal number
#   DT:   determiner
#   IN:   preposition or subordinating conjunction
#   JJ:   adjective
#   JJS:  adjective, superlative
#   NN:   noun
#   NNP:  proper noun, singular
#   NNS:  noun, plural
#   PRP$: possessive pronoun
#   TO:   "to"
#   VB:   verb, base form
#   VBD:  verb, past tense
#   VBG:  verb, gerund or present participle
#   VBZ:  verb, 3rd person singular present
# NP (noun phrase) is a chunk label rather than a token-level tag.

# Tokenize the sample sentence, then tag every token with its part of speech.
pos_tag(word_tokenize(text))

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [9]:
# Named-entity chunking with NLTK: tokenize -> POS-tag -> chunk entities.
# Entity labels visible in the printed tree:
#   GPE:    geo-political entity (country, city, state, ...)
#   PERSON: person

tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
print(tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [10]:
# spaCy setup: library, the displaCy visualizer, and helpers for printing results.
import spacy
from spacy import displacy
from collections import Counter
from pprint import pprint

# Load the English pipeline through its installed Python package.
# Alternative kept for reference (loads the same pipeline by name):
# nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()

In [11]:
# Named-Entity Recognition (NER)

# Parse the shared sample `text` instead of a second hard-coded copy of the
# sentence, so this cell (and `doc`, reused by the next cell) always follows
# whatever `text` is set to.
doc = nlp(text)
# One (entity text, entity label) pair per entity span found in the document.
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('a record $5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [12]:
# Per-token entity annotation for the parsed document.
# `ent_iob_` is the IOB code of the token:
#   B: token begins an entity
#   I: token is inside an entity
#   O: token is outside any entity
# `ent_type_` is the entity label, or '' for tokens outside any entity.
# (The L/U codes of the BILUO scheme are not produced by `ent_iob_`.)

pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'B', 'MONEY'),
 (record, 'I', 'MONEY'),
 ($, 'I', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [13]:
# Highlight the recognised entities inline in the sample sentence.
displacy.render(
    nlp(str(text)),
    style='ent',
    jupyter=True,
)

In [14]:
# Draw the dependency parse of the sample sentence; the 'distance' option
# controls the spacing between words in the diagram.
displacy.render(
    nlp(str(text)),
    jupyter=True,
    style='dep',
    options={'distance': 120},
)

In [15]:
# Lemmatisation
# (surface form, coarse POS tag, lemma) for every token of the sample sentence
# that is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(text))
    if not (token.is_stop or token.pos_ == 'PUNCT')
]

[('European', 'ADJ', 'european'),
 ('authorities', 'NOUN', 'authority'),
 ('fined', 'VERB', 'fine'),
 ('Google', 'PROPN', 'Google'),
 ('record', 'NOUN', 'record'),
 ('$', 'SYM', '$'),
 ('5.1', 'NUM', '5.1'),
 ('billion', 'NUM', 'billion'),
 ('Wednesday', 'PROPN', 'Wednesday'),
 ('abusing', 'VERB', 'abuse'),
 ('power', 'NOUN', 'power'),
 ('mobile', 'ADJ', 'mobile'),
 ('phone', 'NOUN', 'phone'),
 ('market', 'NOUN', 'market'),
 ('ordered', 'VERB', 'order'),
 ('company', 'NOUN', 'company'),
 ('alter', 'VERB', 'alter'),
 ('practices', 'NOUN', 'practice')]