# Linguistic Features

In [1]:
%pip install -q nltk
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

Note: you may need to restart the kernel to use updated packages.


True

## PoS Tagging

In [2]:
corpus="""I have a dream that one day on the red hills of Georgia sons of former slaves and the sons of former slave-owners will be able to sit down together at the table of brotherhood. I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice.

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream . . . I have a dream that one day in Alabama, with its vicious racists, with its governor having his lips dripping with the words ofinterposition and nullification, one day right there in Alabama little black boys and black girls will be ableto join hands with little white boys and white girls as sisters and brothers.

I have a dream today . . ."""

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
sentences = sent_tokenize(corpus)
sentences

['I have a dream that one day on the red hills of Georgia sons of former slaves and the sons of former slave-owners will be able to sit down together at the table of brotherhood.',
 'I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice.',
 'I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.',
 'I have a dream .',
 '.',
 '.',
 'I have a dream that one day in Alabama, with its vicious racists, with its governor having his lips dripping with the words ofinterposition and nullification, one day right there in Alabama little black boys and black girls will be ableto join hands with little white boys and white girls as sisters and brothers.',
 'I have a dream today .',
 '.',
 '.']

In [5]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [6]:
for sentence in sentences:
    tokens = word_tokenize(sentence)
    filtered = [word for word in tokens if word.lower() not in stop_words]
    pos_tags = nltk.pos_tag(filtered)
    print(pos_tags)

[('dream', 'NN'), ('one', 'CD'), ('day', 'NN'), ('red', 'JJ'), ('hills', 'NNS'), ('Georgia', 'NNP'), ('sons', 'NNS'), ('former', 'JJ'), ('slaves', 'NNS'), ('sons', 'NNS'), ('former', 'JJ'), ('slave-owners', 'NNS'), ('able', 'JJ'), ('sit', 'NN'), ('together', 'RB'), ('table', 'JJ'), ('brotherhood', 'NN'), ('.', '.')]
[('dream', 'NN'), ('one', 'CD'), ('day', 'NN'), ('even', 'RB'), ('state', 'NN'), ('Mississippi', 'NNP'), (',', ','), ('state', 'NN'), ('sweltering', 'NN'), ('heat', 'NN'), ('injustice', 'NN'), (',', ','), ('sweltering', 'VBG'), ('heat', 'NN'), ('oppression', 'NN'), (',', ','), ('transformed', 'VBD'), ('oasis', 'NN'), ('freedom', 'NN'), ('justice', 'NN'), ('.', '.')]
[('dream', 'RB'), ('four', 'CD'), ('little', 'JJ'), ('children', 'NNS'), ('one', 'CD'), ('day', 'NN'), ('live', 'JJ'), ('nation', 'NN'), ('judged', 'VBD'), ('color', 'NN'), ('skin', 'NN'), ('content', 'NN'), ('character', 'NN'), ('.', '.')]
[('dream', 'NN'), ('.', '.')]
[('.', '.')]
[('.', '.')]
[('dream', 'NN')

In [7]:
"Martin Luther King Jr. has a dream".split()

['Martin', 'Luther', 'King', 'Jr.', 'has', 'a', 'dream']

In [8]:
print(nltk.pos_tag("Martin Luther King Jr. has a dream".split()))

[('Martin', 'NNP'), ('Luther', 'NNP'), ('King', 'NNP'), ('Jr.', 'NNP'), ('has', 'VBZ'), ('a', 'DT'), ('dream', 'NN')]


## Named Entity Recognition

In [9]:
ner_sentence = "My name is Miguel O'Hara. I'm this dimension's one and only Spider-Man. At least I was. But I'm not like the others. I don't always like what I have to do, but I know I have to be the one to do it. I've given up too much to stop now."

In [10]:
tokens = word_tokenize(ner_sentence)
tagged_tokens = nltk.pos_tag(tokens)
chunk_tree = nltk.ne_chunk(tagged_tokens)
entities = []
for subtree in chunk_tree:
    if hasattr(subtree, "label"):
        label = subtree.label()
        name = " ".join(token for token, _ in subtree.leaves())
        entities.append((name, label))
entities

[('Miguel', 'PERSON')]