# The Stanford POS Tagger


Web app version: http://nlp.stanford.edu:8080/parser/ ; https://corenlp.run/

Newer version of the NLTK interface, requires running a java server locally: https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK


### Downloading the tagger and models

Download and uzip the model. You can do the same thing on your own computer to be able to use it locally.

In [None]:
%%time
!wget 'https://nlp.stanford.edu/software/stanford-tagger-4.2.0.zip'
!unzip './stanford-tagger-4.2.0.zip'

--2023-03-25 19:59:21--  https://nlp.stanford.edu/software/stanford-tagger-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-tagger-4.2.0.zip [following]
--2023-03-25 19:59:21--  https://downloads.cs.stanford.edu/nlp/software/stanford-tagger-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78034596 (74M) [application/zip]
Saving to: ‘stanford-tagger-4.2.0.zip’


2023-03-25 19:59:32 (6.96 MB/s) - ‘stanford-tagger-4.2.0.zip’ saved [78034596/78034596]

Archive:  ./stanford-tagger-4.2.0.zip
   creating: stanford-postagger-full-2020-11-17/
  inflating: stanford-postagger-full-2020

### Setting up and using the tagger with NLTK

In [None]:
model_path='./stanford-postagger-full-2020-11-17/models/english-bidirectional-distsim.tagger'
jar_tagger_path='./stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar'

In [None]:
from nltk.tag.stanford import StanfordPOSTagger # -- deprecated?


In [None]:
!pip freeze | grep nltk # needs nltk >= 3.5

nltk==3.8.1


In [None]:
tagger=StanfordPOSTagger(model_path, jar_tagger_path)


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
tagger.tag(nltk.word_tokenize("I am eating a lot of candy."))

[('I', 'PRP'),
 ('am', 'VBP'),
 ('eating', 'VBG'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('candy', 'NN'),
 ('.', '.')]

In [None]:
tagger.tag(nltk.word_tokenize("Time flies like an arrow."))

[('Time', 'NNP'),
 ('flies', 'VBZ'),
 ('like', 'IN'),
 ('an', 'DT'),
 ('arrow', 'NN'),
 ('.', '.')]

In [None]:
tagger.tag(nltk.word_tokenize("Fruit flies like a banana."))

[('Fruit', 'NNP'),
 ('flies', 'VBZ'),
 ('like', 'IN'),
 ('a', 'DT'),
 ('banana', 'NN'),
 ('.', '.')]

In [None]:
tagger.tag(nltk.word_tokenize("I don't like fruit flies like a banana."))

[('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('like', 'VB'),
 ('fruit', 'NN'),
 ('flies', 'NNS'),
 ('like', 'IN'),
 ('a', 'DT'),
 ('banana', 'NN'),
 ('.', '.')]

In [None]:
tagger.tag(nltk.word_tokenize("I am eating a lot of candy."))

[('I', 'PRP'),
 ('am', 'VBP'),
 ('eating', 'VBG'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('candy', 'NN'),
 ('.', '.')]

In [None]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
tagger.tag([st.stem(t)
      for t in nltk.word_tokenize("I am eating a lot of candy.")])

[('i', 'PRP'),
 ('am', 'VBP'),
 ('eat', 'VB'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('candi', 'NN'),
 ('.', '.')]

### Using averaged_perceptron_tagger in NLTK

In [None]:
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
nltk.pos_tag(nltk.word_tokenize("I am eating a lot of candy."))

[('I', 'PRP'),
 ('am', 'VBP'),
 ('eating', 'VBG'),
 ('a', 'DT'),
 ('lot', 'NN'),
 ('of', 'IN'),
 ('candy', 'NN'),
 ('.', '.')]

In [None]:
nltk.pos_tag(nltk.word_tokenize("Time flies like an arrow."))

[('Time', 'NNP'),
 ('flies', 'NNS'),
 ('like', 'IN'),
 ('an', 'DT'),
 ('arrow', 'NN'),
 ('.', '.')]

In [None]:
nltk.pos_tag(nltk.word_tokenize("Fruit flies like a banana."))

[('Fruit', 'NNP'),
 ('flies', 'VBZ'),
 ('like', 'IN'),
 ('a', 'DT'),
 ('banana', 'NN'),
 ('.', '.')]

Understanding the tags

In [None]:
nltk.download('tagsets')


[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [None]:
nltk.help.upenn_tagset('NNP')


NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


# Tagged corpora

In [None]:
nltk.download('brown')
nltk.corpus.brown.tagged_words()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [None]:
list(nltk.corpus.brown.tagged_sents(categories='news'))[:2]

[[('The', 'AT'),
  ('Fulton', 'NP-TL'),
  ('County', 'NN-TL'),
  ('Grand', 'JJ-TL'),
  ('Jury', 'NN-TL'),
  ('said', 'VBD'),
  ('Friday', 'NR'),
  ('an', 'AT'),
  ('investigation', 'NN'),
  ('of', 'IN'),
  ("Atlanta's", 'NP$'),
  ('recent', 'JJ'),
  ('primary', 'NN'),
  ('election', 'NN'),
  ('produced', 'VBD'),
  ('``', '``'),
  ('no', 'AT'),
  ('evidence', 'NN'),
  ("''", "''"),
  ('that', 'CS'),
  ('any', 'DTI'),
  ('irregularities', 'NNS'),
  ('took', 'VBD'),
  ('place', 'NN'),
  ('.', '.')],
 [('The', 'AT'),
  ('jury', 'NN'),
  ('further', 'RBR'),
  ('said', 'VBD'),
  ('in', 'IN'),
  ('term-end', 'NN'),
  ('presentments', 'NNS'),
  ('that', 'CS'),
  ('the', 'AT'),
  ('City', 'NN-TL'),
  ('Executive', 'JJ-TL'),
  ('Committee', 'NN-TL'),
  (',', ','),
  ('which', 'WDT'),
  ('had', 'HVD'),
  ('over-all', 'JJ'),
  ('charge', 'NN'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('election', 'NN'),
  (',', ','),
  ('``', '``'),
  ('deserves', 'VBZ'),
  ('the', 'AT'),
  ('praise', 'NN'),
  ('and', 

### Penn Treebank Corpus

References:

Paper with Penn Treebank description: https://www.researchgate.net/publication/2873803_The_Penn_Treebank_An_overview

List of explanations for tag acronyms: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html


In [None]:
nltk.download('treebank')
len(list(nltk.corpus.treebank.tagged_words()))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


100676

In [None]:
nltk.corpus.treebank.tagged_words()

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]

Other tools, POS Tagging performance & comparison: https://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)

# Syntactic Parsing

### Stanford Parser

You can try the usage below locally: (needs java)

In [None]:
!wget 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip'
!unzip 'stanford-parser-4.2.0.zip'


--2023-03-25 20:00:03--  https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-parser-4.2.0.zip [following]
--2023-03-25 20:00:03--  https://downloads.cs.stanford.edu/nlp/software/stanford-parser-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182285548 (174M) [application/zip]
Saving to: ‘stanford-parser-4.2.0.zip’


2023-03-25 20:00:34 (5.71 MB/s) - ‘stanford-parser-4.2.0.zip’ saved [182285548/182285548]

Archive:  stanford-parser-4.2.0.zip
   creating: stanford-parser-full-2020-11-17/
  inflating: stanford-parser-full-2020-11-

In [None]:
# !wget 'https://nlp.stanford.edu/software/stanford-corenlp-4.2.0-models-english.jar'

In [None]:
import os
from nltk.parse.stanford import StanfordParser


In [None]:
os.environ['JAVAHOME'] = "/usr/bin/java"
os.environ['STANFORD_PARSER'] = '/content/stanford-parser-full-2020-11-17/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '/content/stanford-parser-full-2020-11-17/stanford-parser-4.2.0-models.jar'


In [None]:

parser = StanfordParser(model_path="/content/stanford-corenlp-4.2.0-models-english.jar")
propozitii = parser.raw_parse_sents(("I like to go to school.", "The cat is running through the room.","Where are you?"))
for prop in propozitii:
    print(list(prop))

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  parser = StanfordParser(model_path="/content/stanford-corenlp-4.2.0-models-english.jar")


[main] ERROR edu.stanford.nlp.parser.lexparser.LexicalizedParser - java.io.IOException: Unable to open "/content/stanford-corenlp-4.2.0-models-english.jar" as class path, filename or URL
  edu.stanford.nlp.io.IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(IOUtils.java:501)
  edu.stanford.nlp.io.IOUtils.readStreamFromString(IOUtils.java:402)
  edu.stanford.nlp.parser.lexparser.LexicalizedParser.getParserFromSerializedFile(LexicalizedParser.java:567)
  edu.stanford.nlp.parser.lexparser.LexicalizedParser.getParserFromFile(LexicalizedParser.java:373)
  edu.stanford.nlp.parser.lexparser.LexicalizedParser.loadModel(LexicalizedParser.java:183)
  edu.stanford.nlp.parser.lexparser.LexicalizedParser.main(LexicalizedParser.java:1373)
java.io.IOException: Unable to open "/content/stanford-corenlp-4.2.0-models-english.jar" as class path, filename or URL
	at edu.stanford.nlp.io.IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(IOUtils.java:501)
	at edu.stanford.nlp.io.IOUtils.readerFromStri

OSError: ignored

### Alternatively


In [None]:
!wget 'https://nlp.stanford.edu/software/stanford-corenlp-4.5.3.zip'
!unzip 'stanford-corenlp-4.5.3.zip'

--2023-03-25 20:01:13--  https://nlp.stanford.edu/software/stanford-corenlp-4.5.3.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.5.3.zip [following]
--2023-03-25 20:01:14--  https://downloads.cs.stanford.edu/nlp/software/stanford-corenlp-4.5.3.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 505406322 (482M) [application/zip]
Saving to: ‘stanford-corenlp-4.5.3.zip’


2023-03-25 20:02:45 (5.29 MB/s) - ‘stanford-corenlp-4.5.3.zip’ saved [505406322/505406322]

Archive:  stanford-corenlp-4.5.3.zip
   creating: stanford-corenlp-4.5.3/
  inflating: stanford-corenlp-4.5.3/CoreNLP-t

In [None]:
!pip install stanfordcorenlp


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanfordcorenlp
  Downloading stanfordcorenlp-3.9.1.1-py2.py3-none-any.whl (5.7 kB)
Installing collected packages: stanfordcorenlp
Successfully installed stanfordcorenlp-3.9.1.1


### Dependency parsing

In [None]:
import stanfordcorenlp
sc = stanfordcorenlp.StanfordCoreNLP('stanford-corenlp-4.5.3')

In [None]:
text = "I eat a lot of candy."
dependencies = sc.dependency_parse(text)
dependencies

[('ROOT', 0, 2),
 ('nsubj', 2, 1),
 ('det', 4, 3),
 ('obj', 2, 4),
 ('case', 6, 5),
 ('nmod', 4, 6),
 ('punct', 2, 7)]

In [None]:
tokens = nltk.word_tokenize(text)
for (t, w1, w2) in dependencies:
  if w1 < len(tokens) and w2 < len(tokens):
    print("%s --> %s (%s)" % (
        tokens[w2-1] if w2>0 else "",
        tokens[w1-1] if w1>0 else "",
         t))


eat -->  (ROOT)
I --> eat (nsubj)
a --> lot (det)
lot --> eat (obj)
of --> candy (case)
candy --> lot (nmod)


Descriptions of dependency relations: https://universaldependencies.org/u/dep/

Demo: https://nlp.stanford.edu/software/stanford-dependencies.html


### Constituent parsing

In [None]:
parsed = sc.parse(text)
print(parsed)

(ROOT
  (S
    (NP (PRP I))
    (VP (VBP eat)
      (NP
        (NP (DT a) (NN lot))
        (PP (IN of)
          (NP (NN candy)))))
    (. .)))


## Parsing with custom grammar

https://www.nltk.org/book/ch08.html

In [None]:
import nltk
gram = nltk.CFG.fromstring("""
S -> NP VP | TO VB | VB
VP -> V NP | V NP PP | V S | V PP
PP -> P NP
V -> "caught" | "ate" | "likes" | "like" | "chase" | "go" | "fly" | "flies" | "eat" | "saw"
NP -> Det N | Det N PP | PRP
Det -> "the" | "a" | "an" | "my" | "some" | "The"
N -> "mice" | "cat" | "dog" |  "school" | "Time" | "arrow" | "fly" | "flies" | "candy" | "man" | "park"
P -> "in" | "to" | "on"
TO -> "to"
PRP -> "I"  """)


In [None]:
rdp = nltk.RecursiveDescentParser(gram)
rdp

<nltk.parse.recursivedescent.RecursiveDescentParser at 0x7fb8fb8158b0>

In [None]:
text = "I eat some candy"
for tree in rdp.parse(nltk.word_tokenize(text)):
    print(tree)

(S (NP (PRP I)) (VP (V eat) (NP (Det some) (N candy))))


Syntactic ambiguity:

In [None]:
import nltk
gram = nltk.CFG.fromstring("""
S -> NP VP | TO VB
PP -> P NP
V -> "caught" | "ate" | "likes" | "like" | "chase" | "go" | "fly" | "flies" | "eat" | "saw"
NP -> Det N | Det N PP | PRP
Det -> "the" | "a" | "an" | "my" | "some" | "The"
N -> "mice" | "cat" | "dog" |  "school" | "Time" | "arrow" | "fly" | "flies" | "candy" | "man" | "park"
P -> "in" | "to" | "on"
TO -> "to"
PRP -> "I"  """)

In [None]:
rdp = nltk.RecursiveDescentParser(gram, trace=2)
rdp

<nltk.parse.recursivedescent.RecursiveDescentParser at 0x7fb8fb44fc10>

In [None]:
text = "The dog saw a man in the park"
for tree in rdp.parse(nltk.word_tokenize(text)):
    print(tree)

Parsing 'The dog saw a man in the park'
    [ * S ]
  E [ * NP VP ]
  E [ * Det N VP ]
  E [ * 'the' N VP ]
  E [ * 'a' N VP ]
  E [ * 'an' N VP ]
  E [ * 'my' N VP ]
  E [ * 'some' N VP ]
  E [ * 'The' N VP ]
  M [ 'The' * N VP ]
  E [ 'The' * 'mice' VP ]
  E [ 'The' * 'cat' VP ]
  E [ 'The' * 'dog' VP ]
  M [ 'The' 'dog' * VP ]
  E [ 'The' * 'school' VP ]
  E [ 'The' * 'Time' VP ]
  E [ 'The' * 'arrow' VP ]
  E [ 'The' * 'fly' VP ]
  E [ 'The' * 'flies' VP ]
  E [ 'The' * 'candy' VP ]
  E [ 'The' * 'man' VP ]
  E [ 'The' * 'park' VP ]
  E [ * Det N PP VP ]
  E [ * 'the' N PP VP ]
  E [ * 'a' N PP VP ]
  E [ * 'an' N PP VP ]
  E [ * 'my' N PP VP ]
  E [ * 'some' N PP VP ]
  E [ * 'The' N PP VP ]
  M [ 'The' * N PP VP ]
  E [ 'The' * 'mice' PP VP ]
  E [ 'The' * 'cat' PP VP ]
  E [ 'The' * 'dog' PP VP ]
  M [ 'The' 'dog' * PP VP ]
  E [ 'The' 'dog' * P NP VP ]
  E [ 'The' 'dog' * 'in' NP VP ]
  E [ 'The' 'dog' * 'to' NP VP ]
  E [ 'The' 'dog' * 'on' NP VP ]
  E [ 'The' * 'school' PP VP

In [None]:
srp = nltk.ShiftReduceParser(gram, trace=2)




In [None]:
text = "I like my candy"
for tree in srp.parse(nltk.word_tokenize(text)):
    print(tree)


Parsing 'I like my candy'
    [ * I like my candy]
  S [ 'I' * like my candy]
  R [ PRP * like my candy]
  R [ NP * like my candy]
  S [ NP 'like' * my candy]
  R [ NP V * my candy]
  S [ NP V 'my' * candy]
  R [ NP V Det * candy]
  S [ NP V Det 'candy' * ]
  R [ NP V Det N * ]
  R [ NP V NP * ]


In [None]:
from nltk.draw.tree import draw_trees

# (works locally)
for tree in srp.parse(nltk.word_tokenize(text)):
    draw_trees(tree)

Parsing 'I like my candy'
    [ * I like my candy]
  S [ 'I' * like my candy]
  R [ PRP * like my candy]
  R [ NP * like my candy]
  S [ NP 'like' * my candy]
  R [ NP V * my candy]
  S [ NP V 'my' * candy]
  R [ NP V Det * candy]
  S [ NP V Det 'candy' * ]
  R [ NP V Det N * ]
  R [ NP V NP * ]


In [None]:
groucho_grammar = nltk.CFG.fromstring("""
 S -> NP VP
 PP -> P NP
 NP -> Det N | Det N PP | 'I'
 VP -> V NP | VP PP
 Det -> 'an' | 'my'
 N -> 'elephant' | 'pajamas'
 V -> 'shot'
 P -> 'in'
 """)

In [None]:
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(sent):
  print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [None]:
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ShiftReduceParser(groucho_grammar)
for tree in parser.parse(sent):
  print(tree)

# Exercitii (1p total)

1. Completati functia scrisa in laboratorul interior cu POS-tagging: in final veti avea o functie care sa primeasca un text si sa faca toata preprocesarea (tokenizare, lematizare, normalizare) si pos-tagging, si sa intoarca textul cu tags pe cuvinte.

/

Add to the function implemented for the past lab instructions for POS-tagging: in the end you should have a function which receives an input text and performs preprocessing from beginning to end (tokenization, lemmatization, normalization) as well as POS-tagging, then returns the tagged text.

In [None]:
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
blacklist_words = stopwords.words('english')

In [None]:
tagger=StanfordPOSTagger(model_path, jar_tagger_path)

In [None]:
def preprocesare(text, lowercase=True, remove_numbers=False, remove_stopwords=True, stemming=None, lemmatization=None, punctuation=True, pos_tagger=True):

    #remove text within paranthesis
    text = re.sub(r'[\(\[\{].*?[\)\]\}]', '', text)

    #tokenize words
    text = nltk.word_tokenize(text, language='english')

    #lowercase
    if lowercase:
        text = [t.lower() for t in text]

    #remove punctuation
    if punctuation:
        text = [t for t in text if t not in string.punctuation]


    #remove stopwords
    if remove_stopwords:
        text = [t for t in text if t not in blacklist_words]


    if remove_numbers:
        text = [t for t in text if re.match('^[a-z]+$', t)]

    #choose stemmer
    if stemming:
        if stemming == 'porter':
            stemmer = PorterStemmer()
        elif stemming == 'snowball':
            stemmer = SnowballStemmer('english')
        else:
            raise ValueError("Choose valid stemmer!")
        text = [stemmer.stem(t) for t in text]

    #choose lemmatizer
    if lemmatization:
        if lemmatization == 'wordnet':
            lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError("Choose valid lemmatizer!")
        text = [lemmatizer.lemmatize(t) for t in text]

    if pos_tagger:
      text =  tagger.tag(text)


    return text


2. Aplicati functia pe un fragment de cateva sute de cuvinte din stiri din ultimele cateva zile (aveti grija sa fie intr-o limba pe care functia o suporta). Afisati distributia partilor de vorbire intr-un grafic.

/

Execute your function on a piece of news from the past days of at least a few hundred words (make sure it's in a language that the function supports). Illustrate the distribution of POSs in a graph.

In [None]:
news = "SEOUL, South Korea : A young lonely zebra had a rare day out when he ran away from a zoo in Seoul and trotted around the streets of the South Korean capital, before being sedated and captured a few hours later. Sero, a 3-year-old male whose Korean name refers to his vertical stripes, escaped from the Seoul Children’s Grand Park zoo Thursday afternoon by breaking through the wooden deck around his enclosure, according to zoo officials. In the hours that followed, Sero ran amok in a nearby residential area in eastern Seoul as people looked on in shock. Social media was instantly flooded with photos and videos of the zebra bumping into traffic and galloping through narrow alleyways."

In [None]:
preprocessed_text = preprocesare(news, lemmatization= 'wordnet')
print(preprocessed_text)

[('seoul', 'NNP'), ('south', 'NNP'), ('korea', 'NNP'), ('young', 'JJ'), ('lonely', 'JJ'), ('zebra', 'NN'), ('rare', 'JJ'), ('day', 'NN'), ('ran', 'VBD'), ('away', 'RB'), ('zoo', 'NN'), ('seoul', 'NN'), ('trotted', 'VBD'), ('around', 'IN'), ('street', 'NN'), ('south', 'RB'), ('korean', 'JJ'), ('capital', 'NN'), ('sedated', 'VBD'), ('captured', 'VBN'), ('hour', 'NN'), ('later', 'RB'), ('sero', 'VBD'), ('3-year-old', 'RB'), ('male', 'JJ'), ('whose', 'WP$'), ('korean', 'JJ'), ('name', 'NN'), ('refers', 'VBZ'), ('vertical', 'JJ'), ('stripe', 'NN'), ('escaped', 'VBD'), ('seoul', 'JJ'), ('child', 'NN'), ('’', "''"), ('grand', 'JJ'), ('park', 'NN'), ('zoo', 'NN'), ('thursday', 'NNP'), ('afternoon', 'NN'), ('breaking', 'VBG'), ('wooden', 'JJ'), ('deck', 'NN'), ('around', 'IN'), ('enclosure', 'NN'), ('according', 'VBG'), ('zoo', 'NN'), ('official', 'NN'), ('hour', 'NN'), ('followed', 'VBD'), ('sero', 'NN'), ('ran', 'VBD'), ('amok', 'RB'), ('nearby', 'RB'), ('residential', 'JJ'), ('area', 'NN'), 

3. Modificati functia de mai sus astfel incat sa efectueze POS-tagging inainte sau dupa lematizare. Comparati diferentele. Afisati cuvintele (top 20) pentru care POS tag-ul identificat difera cel mai des intre cele doua versiuni.

/

Modify your function such that POS-tagging is performed before or after lemmatization. Print the words in the vocabulary (top 20) for which the identified POS tag differs most often between the two versions.

In [None]:
def preprocesare(text, lowercase=True, remove_numbers=False, remove_stopwords=True, stemming=None, punctuation=True, pos_tagger=True, pos_tag_method=None):

    #remove text within paranthesis
    text = re.sub(r'[\(\[\{].*?[\)\]\}]', '', text)

    #tokenize words
    text = nltk.word_tokenize(text, language='english')

    #lowercase
    if lowercase:
        text = [t.lower() for t in text]

    #remove punctuation
    if punctuation:
        text = [t for t in text if t not in string.punctuation]


    #remove stopwords
    if remove_stopwords:
        text = [t for t in text if t not in blacklist_words]


    if remove_numbers:
        text = [t for t in text if re.match('^[a-z]+$', t)]

    if pos_tag_method == "before":
      tagged_tokens =  tagger.tag(text)
      lemmatizer = WordNetLemmatizer()
      tagged_text = []
      for token, pos_tag in tagged_tokens:
        lemma = lemmatizer.lemmatize(token)
        tagged_text.append((lemma, pos_tag))
      return tagged_text


    elif pos_tag_method == "after":
      lemmatizer = WordNetLemmatizer()
      text = [lemmatizer.lemmatize(t) for t in text]
      text =  tagger.tag(text)
      return text

    else:
      print("Choose a valid pos tag method!")




In [None]:
preprocessed_text_before = preprocesare(news,  pos_tag_method="before")
preprocessed_text_after = preprocesare(news,  pos_tag_method="after")
print(preprocessed_text_before)
print(preprocessed_text_after)


[('seoul', 'NNP'), ('south', 'NNP'), ('korea', 'NNP'), ('young', 'JJ'), ('lonely', 'JJ'), ('zebra', 'NN'), ('rare', 'JJ'), ('day', 'NN'), ('ran', 'VBD'), ('away', 'RB'), ('zoo', 'NN'), ('seoul', 'NN'), ('trotted', 'VBD'), ('around', 'IN'), ('street', 'NNS'), ('south', 'RB'), ('korean', 'JJ'), ('capital', 'NN'), ('sedated', 'VBD'), ('captured', 'VBN'), ('hour', 'NNS'), ('later', 'RB'), ('sero', 'VBD'), ('3-year-old', 'RB'), ('male', 'JJ'), ('whose', 'WP$'), ('korean', 'JJ'), ('name', 'NN'), ('refers', 'VBZ'), ('vertical', 'JJ'), ('stripe', 'NNS'), ('escaped', 'VBD'), ('seoul', 'JJ'), ('child', 'NNS'), ('’', "''"), ('grand', 'JJ'), ('park', 'NN'), ('zoo', 'NN'), ('thursday', 'NNP'), ('afternoon', 'NN'), ('breaking', 'VBG'), ('wooden', 'JJ'), ('deck', 'NN'), ('around', 'IN'), ('enclosure', 'NN'), ('according', 'VBG'), ('zoo', 'NN'), ('official', 'NNS'), ('hour', 'NNS'), ('followed', 'VBD'), ('sero', 'NN'), ('ran', 'VBD'), ('amok', 'RB'), ('nearby', 'RB'), ('residential', 'JJ'), ('area', '

In [None]:
for before, after in zip(preprocessed_text_before, preprocessed_text_after):
  if before != after:
    print(before, after)


('street', 'NNS') ('street', 'NN')
('hour', 'NNS') ('hour', 'NN')
('stripe', 'NNS') ('stripe', 'NN')
('child', 'NNS') ('child', 'NN')
('official', 'NNS') ('official', 'NN')
('hour', 'NNS') ('hour', 'NN')
('medium', 'NNS') ('medium', 'NN')
('alleyway', 'NNS') ('alleyway', 'NN')


4. Gasiti bigramele de POS tags cele mai frecvente in textul analizat, si apoi in propozitiile tagged din Brown corpus.

/

Find the most frequent POS tag bigrams occurring in the analyzed text, then in the tagged sentences in the Brown corpus.

In [None]:
freq_dist = nltk.FreqDist((word,tag) for word, tag in preprocessed_text_before)
most_common = freq_dist.most_common(10)
print( most_common)

[(('zoo', 'NN'), 3), (('zebra', 'NN'), 2), (('ran', 'VBD'), 2), (('around', 'IN'), 2), (('korean', 'JJ'), 2), (('hour', 'NNS'), 2), (('seoul', 'JJ'), 2), (('seoul', 'NNP'), 1), (('south', 'NNP'), 1), (('korea', 'NNP'), 1)]


In [None]:
brown_tags = nltk.corpus.brown.tagged_words(tagset='brown')
freq_dist = nltk.FreqDist((word.lower(), tag) for word, tag in brown_tags)
most_common = freq_dist.most_common(10)
print( most_common)

[(('the', 'AT'), 69013), ((',', ','), 58153), (('.', '.'), 48812), (('of', 'IN'), 35028), (('and', 'CC'), 28542), (('a', 'AT'), 22943), (('in', 'IN'), 20731), (('to', 'TO'), 14917), (('to', 'IN'), 11046), (('is', 'BEZ'), 10065)]


5. Write a grammar and code to produce two trees, one for each reading of the phrase "old men and women"

In [None]:
groucho_grammar = nltk.CFG.fromstring("""
 S -> NP
 NP ->  ADJ NP | NP CC N
 NP -> N CC N | ADJ N
 N -> 'men' | 'women'
 ADJ -> 'old'
 CC -> 'and'
 """)

In [None]:
sent = ['old', 'men', 'and', 'women']
parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(sent):
  print(tree)

(S (NP (NP (ADJ old) (N men)) (CC and) (N women)))
(S (NP (ADJ old) (NP (N men) (CC and) (N women))))


# Mini-proiect / Homework (+3p)

(v PDF separat)