# Introduction to Natural Language Processing

# Installing the libraries

- spaCy: https://spacy.io/

In [None]:
!pip install spacy --upgrade

In [None]:
import spacy
spacy.__version__

'3.1.1'

In [None]:
# Portuguese small pipeline.
# NOTE(review): no cell in this part of the notebook loads it — confirm it is still needed.
!python -m spacy download pt_core_news_sm

In [None]:
# French small pipeline.
# NOTE(review): no cell in this part of the notebook loads it — confirm it is still needed.
!python -m spacy download fr_core_news_sm

In [None]:
# English small pipeline; this is the one loaded with spacy.load('en_core_web_sm') below.
!python -m spacy download en_core_web_sm

# POS (part-of-speech)

- POS (part-of-speech): noun, adjective, verb
- POS tags are an important input for later steps such as finding named entities
- Tags: https://ashutoshtripathi.com/2020/04/13/parts-of-speech-tagging-and-dependency-parsing-using-spacy-nlp/

In [None]:
import en_core_web_sm

In [None]:
# Load the English pipeline once; `nlp` is reused by the cells that follow.
nlp = spacy.load('en_core_web_sm')

In [None]:
nlp

<spacy.lang.en.English at 0x7faa34052ad0>

In [None]:
document = nlp('I am learning natural language processing. The course is in London')

In [None]:
# Show every token next to its coarse-grained part-of-speech tag.
for tok in document:
  print(f'{tok.text} {tok.pos_}')

I PRON
am AUX
learning VERB
natural ADJ
language NOUN
processing NOUN
. PUNCT
The DET
course NOUN
is AUX
in ADP
London PROPN


## Legend

- lemma: "root" of the word
- pos: part-of-speech  
- tag: fine-grained tag with morphological information (e.g. present, future, past)
- dep: syntactic dependency
- shape: pattern of lowercase (x) and uppercase (X) characters
- alpha: if it consists only of alphabetic characters
- stop: if it is a stop word

In [None]:
# One row per token, columns in this order:
# text, pos, lemma, tag, dep, shape, is_alpha, is_stop
for tok in document:
  attributes = (tok.text, tok.pos_, tok.lemma_, tok.tag_, tok.dep_,
                tok.shape_, tok.is_alpha, tok.is_stop)
  print(*attributes)

I PRON I PRP nsubj X True True
am AUX be VBP aux xx True True
learning VERB learn VBG ROOT xxxx True False
natural ADJ natural JJ amod xxxx True False
language NOUN language NN compound xxxx True False
processing NOUN processing NN dobj xxxx True False
. PUNCT . . punct . False False
The DET the DT det Xxx True True
course NOUN course NN nsubj xxxx True False
is AUX be VBZ ROOT xx True True
in ADP in IN prep xx True True
London PROPN London NNP pobj Xxxxx True False


In [None]:
# Keep only the tokens tagged as proper nouns.
proper_nouns = (tok.text for tok in document if tok.pos_ == 'PROPN')
for word in proper_nouns:
  print(word)

London


In [None]:
# Keep only the tokens tagged as verbs (auxiliaries are tagged AUX, not VERB).
verbs = (tok.text for tok in document if tok.pos_ == 'VERB')
for word in verbs:
  print(word)

learning


# Lemmatization and stemming

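- Lemmatization: reduces a word to its dictionary form (lemma) using morphological analysis, e.g. "learning" → "learn", "am" → "be"
- Stemming: strips affixes with heuristic rules to extract the root (stem) of the word, which may not be a real word, e.g. "natural" → "natur"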

In [None]:
# Surface form of each token followed by its lemma.
for tok in document:
  print(f'{tok.text} {tok.lemma_}')

I I
am be
learning learn
natural natural
language language
processing processing
. .
The the
course course
is be
in in
London London


In [None]:
# Different inflections of the same verb should map to a single lemma.
doc = nlp('learn learning watch watching watched')
[token.lemma_ for token in doc]

['learn', 'learn', 'watch', 'watch', 'watch']

## Lemmatization vs. stemming

In [None]:
import nltk

In [None]:
# NLTK's Porter stemmer; `stemmer` is reused below to compare stems with spaCy's lemmas.
stemmer = nltk.stem.PorterStemmer()
stemmer.stem('learning')

'learn'

In [None]:
stemmer.stem('watching')

'watch'

In [None]:
# Side-by-side comparison per token: original text, spaCy lemma, Porter stem.
for tok in document:
  word = tok.text
  stem = stemmer.stem(word)
  print(word, tok.lemma_, stem)

I I I
am be am
learning learn learn
natural natural natur
language language languag
processing processing process
. . .
The the the
course course cours
is be is
in in in
London London london


# Named-entity recognition (NER)

- List of tags: https://towardsdatascience.com/named-entity-recognition-ner-using-spacy-nlp-part-4-28da2ece57c6

In [None]:
text = 'IBM is a US company on information technology. It is located in San Francisco and revenue in 2018 was approximately 320 billion dolars'

In [None]:
document = nlp(text)

In [None]:
# List each recognised entity span together with its entity label.
for ent in document.ents:
  print(f'{ent.text} {ent.label_}')

IBM ORG
US GPE
San Francisco GPE
2018 DATE
approximately 320 billion dolars MONEY


In [None]:
# Visualise the recognised entities; jupyter=True asks displaCy to render in the notebook output.
from spacy import displacy
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
text = 'Bill Gates was born in Seattle on 1955-10-28 and is the founder of Microsoft'

In [None]:
# Re-run the pipeline on the new sentence and list its entities with labels.
document = nlp(text)
for ent in document.ents:
  print(f'{ent.text} {ent.label_}')

Bill Gates PERSON
Seattle GPE
1955-10-28 DATE
Microsoft ORG


In [None]:
displacy.render(document, style = 'ent', jupyter=True)

In [None]:
# Keep only the entities labelled as people.
people = (ent.text for ent in document.ents if ent.label_ == 'PERSON')
for person in people:
  print(person)

Bill Gates


# Stopwords

- Words that appear very often and don't help to understand the context of the document

In [None]:
# French stop words (language code 'fr')
from spacy.lang.fr.stop_words import STOP_WORDS
print(STOP_WORDS)

{'déja', 'là', 'rend', 'déjà', 'précisement', 'té', 'exactement', 'malgré', 'celles-là', 'elle-même', 'derriere', 'sent', 'si', 'peu', 'quand', 'mais', 'pres', 'seule', 'puis', 'quelconque', 'faisaient', 'deux', 'telle', 'voila', 'chez', 'envers', 'douzième', 'delà', 'dessus', 'parler', 'antérieures', 'restant', 'tente', 'a', 'ni', 'allaient', 'certaines', 'autres', 'fait', 'cinquantaine', 'la', 'cet', "j'", 'excepté', 'anterieure', 'mes', 'va', 'quelle', 'toi-même', 'plutot', 'dès', 'ha', 'comme', 'treize', 'antérieure', 'seraient', 'des', 'certes', 'egalement', 'onzième', 'cinquième', 'celles-la', 'après', 'entre', 'aussi', 'parle', 'ou', 'hors', 'lorsque', 'stop', 'elle', 'vas', 'le', 'douze', "n'", 'abord', 'duquel', 'trente', 'notamment', 'differents', 'un', 'juste', 'pourrait', 'seront', 'selon', 'toi-meme', 'auxquels', 'dejà', 'retour', 'sa', 'outre', 'mienne', 'telles', 'tiens', 'faisant', 'elles-mêmes', 'jusqu', 'que', 'celui-là', 'leur', 'ceux', 'hé', 'tout', 'diverse', 'ton'

In [None]:
# Portuguese stop words (language code 'pt'); this import rebinds the name STOP_WORDS.
from spacy.lang.pt.stop_words import STOP_WORDS
print(STOP_WORDS)

{'mesmo', 'grupo', 'atrás', 'às', 'meio', 'saber', 'nada', 'desta', 'ambos', 'quem', 'então', 'porquanto', 'inclusive', 'todas', 'sua', 'mais', 'pouca', 'dizer', 'nove', 'vosso', 'os', 'tudo', 'tais', 'local', 'elas', 'tente', 'a', 'isto', 'qual', 'partir', 'acerca', 'devem', 'cujo', 'através', 'pelo', 'momento', 'embora', 'área', 'todo', 'todos', 'baixo', 'lhe', 'falta', 'número', 'sabe', 'zero', 'des', 'tive', 'pôde', 'certamente', 'entre', 'como', 'após', 'geral', 'estará', 'ou', 'aí', 'pois', 'favor', 'meses', 'no', 'tens', 'estiveram', 'pode', 'tivemos', 'sete', 'parece', 'apenas', 'conselho', 'ligado', 'ver', 'alguns', 'já', 'diz', 'valor', 'talvez', 'vai', 'ainda', 'poderá', 'mal', 'fui', 'ter', 'quieta', 'ali', 'não', 'sétimo', 'posição', 'tua', 'deverá', 'daquela', 'que', 'muito', 'nas', 'sob', 'nova', 'possível', 'vós', 'primeira', 'nunca', 'uns', 'me', 'aos', 'nível', 'debaixo', 'dez', 'faço', 'longe', 'seus', 'caminho', 'teus', 'esse', 'eu', 'fostes', 'novos', 'usar', 'dar'

In [None]:
# English stop words (language code 'en'). This import runs last, so the
# following cells that use STOP_WORDS operate on the English set.
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'well', 'the', 'with', 'among', 'ourselves', 'whatever', '‘ll', 'be', 'even', '’ll', 'across', 'fifty', 'call', 'eleven', 'whom', 'here', 'still', 'much', 'least', 'five', 'side', 'often', 'a', 'however', 'unless', 'thereupon', 'becomes', 'you', 'twenty', 'own', 'less', 'last', 'namely', 'will', 'how', 'toward', 'after', 'forty', 'four', 'hundred', 'until', 'could', 'anyone', 'all', 'keep', 'no', 'anywhere', 'thereafter', 'used', 'us', 'whoever', 'off', 'n‘t', 'thus', 'while', 'front', 'am', 'himself', 'already', 'someone', '‘d', 'otherwise', 'nor', 'if', 'does', 'regarding', 'top', 'me', 'wherein', 'none', 'whereupon', 'whence', 'many', 'quite', 'whenever', 'herself', 'everyone', 'fifteen', 'what', 'hereafter', "'s", 'some', 're', 'too', 'both', 'as', 'moreover', 'anyhow', 'show', 'of', 'twelve', 'itself', 'above', 'did', 'indeed', "'ve", 'several', 'made', 'to', 'our', 'doing', 'herein', 'using', 'anyway', 'on', 'down', 'more', 'nobody', 'most', 'out', 'whereas', 'elsewhere', 'they'

In [None]:
'it' in STOP_WORDS

True

In [None]:
len(STOP_WORDS)

326

In [None]:
nlp.vocab['it'].is_stop

True

In [None]:
nlp.vocab['walk'].is_stop

False

In [None]:
document = nlp('I am learning natural language processing. The course is in London')

In [None]:
# Print the tokens that are stop words. Token.is_stop exposes the stop-word
# flag of the token directly, so no separate vocabulary lookup by text is needed.
for token in document:
  if token.is_stop:
    print(token.text)

I
am
The
is
in


In [None]:
# Print the tokens that are NOT stop words. Token.is_stop exposes the stop-word
# flag of the token directly, so no separate vocabulary lookup by text is needed.
# Punctuation is not a stop word, so '.' is printed as well.
for token in document:
  if not token.is_stop:
    print(token.text)

learning
natural
language
processing
.
course
London


# Dependency parsing

- Parent-child relation

## Example 1

In [None]:
document = nlp('book a ticket from London to Paris')

In [None]:
# Pick the two city tokens by position in the sentence: index 4 is 'London', index 6 is 'Paris'.
origin = document[4]
destiny = document[6]
print(origin, destiny)

London Paris


In [None]:
list(origin.ancestors)

[from, ticket, book]

In [None]:
list(destiny.ancestors)

[to, ticket, book]

In [None]:
# Is token 0 ('book') an ancestor of token 2 ('ticket') in the dependency tree?
document[0].is_ancestor(document[2])

True

## Example 2

In [None]:
document = nlp('Book a table for the restaurant and a taxi to the hotel')

In [None]:
# Select tokens by position: the things being booked (indices 2 and 8) and the places (indices 5 and 11).
tasks = document[2], document[8]
locations = document[5], document[11]
print(tasks, locations)

(table, taxi) (restaurant, hotel)


In [None]:
# Walk up the dependency tree from each location and print every ancestor,
# starting from the closest one.
for place in locations:
  print('-----', place)
  for ancestor in place.ancestors:
    print(ancestor)

----- restaurant
for
table
Book
----- hotel
to
taxi
restaurant
for
table
Book


In [None]:
# Pair each location with the first of its ancestors that is one of the tasks;
# locations without such an ancestor are skipped.
for place in locations:
  task = next((node for node in place.ancestors if node in tasks), None)
  if task is not None:
    print('Reservation of a {} to the {}'.format(task, place))

Reservation of a table to the restaurant
Reservation of a taxi to the hotel


In [None]:
list(document[5].children)

[the, and, taxi]

## Example 3

In [None]:
from spacy import displacy

In [None]:
document = nlp('Book a table for the restaurant and a taxi to the hotel')

In [None]:
displacy.render(document, style='dep', jupyter=True, options={'distance': 90})

In [None]:
list(document[2].ancestors)

[Book]

In [None]:
list(document[2].children)

[a, for]

## Example 4

In [None]:
# Select tokens by position: the cities (indices 6 and 10) and the verbs (indices 4 and 8).
document = nlp('What places can we visit in London and stay in Paris?')
locations = document[6], document[10]
actions = document[4], document[8]
print(locations, actions)

(London, Paris) (visit, stay)


In [None]:
# Pair each location with the first of its ancestors that is one of the
# actions; locations without such an ancestor are skipped.
for place in locations:
  verb = next((node for node in place.ancestors if node in actions), None)
  if verb is not None:
    print('{} to {}'.format(place, verb))

London to visit
Paris to stay


In [None]:
displacy.render(document, style='dep', jupyter=True, options={'distance': 90})

# Similarity between words and sentences

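- spaCy uses the GloVe algorithm (Global Vectors for Word Representation)
- Note: small pipelines (names ending in `_sm`, such as `en_core_web_sm`) do not ship with word vectors, so their similarity scores are only a rough approximation; use a `_md` or `_lg` pipeline for more reliable results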
- Original paper: https://nlp.stanford.edu/pubs/glove.pdf

## Example 1

In [None]:
w1 = nlp('hello')
w2 = nlp('hi')
w3 = nlp('or')

In [None]:
w1.similarity(w2)

  """Entry point for launching an IPython kernel.


0.851778249322447

In [None]:
w2.similarity(w1)

  """Entry point for launching an IPython kernel.


0.851778249322447

In [None]:
w1.similarity(w3)

  """Entry point for launching an IPython kernel.


0.23884662756950303

In [None]:
w2.similarity(w3)

  """Entry point for launching an IPython kernel.


0.22028085855926335

In [None]:
text1 = nlp('When will the new movie be released?')
text2 = nlp('The new movie will be released next month')
text3 = nlp('What color is the car?')

In [None]:
text1.similarity(text2)

  """Entry point for launching an IPython kernel.


0.6713114919851509

In [None]:
text1.similarity(text3)

  """Entry point for launching an IPython kernel.


0.263474601483632

In [None]:
# New York
# Nw Yok

## Example 2

In [None]:
text = nlp('cat dog horse person')

In [None]:
# Similarity between every ordered pair of tokens in `text`, as a percentage.
# The loop variables are deliberately not called text1/text2: those names hold
# the Doc objects compared in the earlier sentence-similarity cells, and reusing
# them here would silently change the result of re-running those cells.
for token1 in text:
  for token2 in text:
    similarity = token1.similarity(token2) * 100
    print('{} is {}% similar to {}'.format(token1, similarity, token2))

cat is 100.0% similar to cat
cat is 57.744014263153076% similar to dog
cat is 60.72404980659485% similar to horse
cat is 22.322461009025574% similar to person
dog is 57.744014263153076% similar to cat
dog is 100.0% similar to dog
dog is 68.99393796920776% similar to horse
dog is 40.91106057167053% similar to person
horse is 60.72404980659485% similar to cat
horse is 68.99393796920776% similar to dog
horse is 100.0% similar to horse
horse is 38.49261403083801% similar to person
person is 22.322461009025574% similar to cat
person is 40.91106057167053% similar to dog
person is 38.49261403083801% similar to horse
person is 100.0% similar to person


  """


# Tokenization

In [None]:
document1 = nlp('I am learning natural language processing. The course is in London. Ph.d John is coming')

In [None]:
# Print one token per line to inspect how the tokenizer splits the text
# (punctuation handling, the 'Ph.d' abbreviation).
for token in document1:
  print(token)

I
am
learning
natural
language
processing
.
The
course
is
in
London
.
Ph.d
John
is
coming


In [None]:
# Naive alternative for comparison: splitting the raw string on every '.'
# also cuts through the abbreviation 'Ph.d'.
document2 = (
  'I am learning natural language processing. '
  'The course is in London. '
  'Ph.d John is coming'
)
document2.split('.')

['I am learning natural language processing',
 ' The course is in London',
 ' Ph',
 'd John is coming']