# NLP Basics

In [26]:
import nltk
nltk.download('punkt')
nltk.download()

[nltk_data] Downloading package punkt to /home/peter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Tokenising

In [50]:
text = "I am learning natural language processing. Peter's conda is not working."

sentences = nltk.sent_tokenize(text)
print(sentences)
tokens = nltk.word_tokenize(text)
print(tokens)

normalised_tokens = [token.lower() for token in tokens]
print(normalised_tokens)

['I am learning natural language processing.', "Peter's conda is not working."]
['I', 'am', 'learning', 'natural', 'language', 'processing', '.', 'Peter', "'s", 'conda', 'is', 'not', 'working', '.']
['i', 'am', 'learning', 'natural', 'language', 'processing', '.', 'peter', "'s", 'conda', 'is', 'not', 'working', '.']


### Stemming

Stemming is the process of reducing words to a root-form, usually by removing suffixes. Stemming is done to remove the vocabulary size, as words which stem the same have very similar meanings (small semantic differences) big semantic differences. For understanding context/information retrival, stemming makes sense, for understanding the language semantics, it does not.

In [42]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
word = "fighting"
ps.stem(word)

'fight'

### POS Tagging

In [61]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('.', '.'), ('Peter', 'NNP'), ("'s", 'POS'), ('conda', 'NN'), ('is', 'VBZ'), ('not', 'RB'), ('working', 'VBG'), ('.', '.')]


### Lemmaisation

Lemmaisation reduces words to their dictionary root form. Lemmatisation requires POS-tagging to reduce to base form.

In [65]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
    
lemmatized_tokens = []
for token, tag in tagged_tokens:
    wn_tag = get_wordnet_pos(tag)
    lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=wn_tag))

print(lemmatised_tokens)

['I', 'am', 'learning', 'natural', 'language', 'processing', '.', 'Peter', "'s", 'conda', 'is', 'not', 'working', '.']


### Named Entity Recognition

In [47]:
entities = nltk.chunk.ne_chunk(tagged)
print(entities)

(S
  I/PRP
  am/VBP
  learning/VBG
  natural/JJ
  language/NN
  processing/NN
  ./.
  (PERSON Peter/NNP)
  's/POS
  conda/NN
  is/VBZ
  not/RB
  working/VBG
  ./.)


### Stop Word Filtering

In [66]:
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
normalised_tokens = set(normalised_tokens) - stopwords
print(list(normalised_tokens))

['.', 'processing', 'language', 'peter', 'learning', "'s", 'working', 'natural', 'conda']
