## Introduction to NLP

In [1]:
!pip install nltk
import nltk



In [2]:
text = "On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks. The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share."

In [3]:
nltk.download('punkt')
nltk.download( 'punkt_tab' )

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Sentence tokenisation
from nltk.tokenize import sent_tokenize
sent_tk = sent_tokenize( text )

print( "Sentence tokenising the text: \n" )
print( sent_tk )

Sentence tokenising the text: 

['On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks.', 'The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share.']


In [5]:
# word tokenisation
from nltk.tokenize import word_tokenize
word_tk = word_tokenize( text )

print( "Word tokenising the text: \n" )
print( word_tk )

Word tokenising the text: 

['On', 'Wednesday', ',', 'the', 'Association', 'for', 'Computing', 'Machinery', ',', 'the', 'world', '’', 's', 'largest', 'society', 'of', 'computing', 'professionals', ',', 'announced', 'that', 'Hinton', ',', 'LeCun', 'and', 'Bengio', 'had', 'won', 'this', 'year', '’', 's', 'Turing', 'Award', 'for', 'their', 'work', 'on', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'which', 'was', 'introduced', 'in', '1966', ',', 'is', 'often', 'called', 'the', 'Nobel', 'Prize', 'of', 'computing', ',', 'and', 'it', 'includes', 'a', '$', '1', 'million', 'prize', ',', 'which', 'the', 'three', 'scientists', 'will', 'share', '.']


In [6]:
# removing stop words (noisy text)
nltk.download( 'stopwords' )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords
sw = set( stopwords.words( "english" ) )

print( "Stop words in English are: \n" )
print( sw )

Stop words in English are: 

{'further', 'our', "should've", 'again', 'can', 'up', 'him', 'which', 'you', 'these', 'yourselves', 'doing', 'weren', 'couldn', 'shan', 'they', 'until', "we've", "she'd", 'them', "he'll", "you'd", 'y', 'been', 'their', 'most', 'over', 'she', 'was', "wouldn't", 'isn', 'this', 'will', 'theirs', 'to', "he's", 'have', 'no', 'do', 'through', 'nor', 'once', 'he', 'only', 'own', 'for', 'where', 'but', 'those', 'be', 'as', 'from', 'its', 'too', "they'll", 'we', 'in', 're', 'while', 'll', "it'd", 'itself', 'after', 'mustn', 'so', 'my', 'when', 'ma', 'any', 'each', "she'll", 'has', 'not', 'same', "don't", 'hasn', 'didn', "couldn't", "i'll", 'between', "weren't", "mightn't", 'such', 'yourself', 'had', 'by', 'about', 'why', "it'll", 'd', 'mightn', 't', 'hers', "aren't", 'that', 'here', 'being', 'with', 'or', "that'll", 'aren', 'above', 'there', "it's", 'very', 'doesn', 'if', 'wouldn', 'a', 'needn', 'myself', "didn't", 'were', "hasn't", 'below', 'out', 'did', 'shouldn',

In [11]:
# print all words in word_tk if not in sw
filtered_words = [ w for w in word_tk if not w in sw ]

print( "Filtered words are: \n")
print( filtered_words )

Filtered words are: 

['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.']


In [12]:
# Stemming
from nltk.stem import PorterStemmer
port_stem = PorterStemmer()

In [14]:
# empty list to load with stemmed words
stemmed_words = []

for w in filtered_words:
  stemmed_words.append( port_stem.stem( w ) )

print( "Filtered sentence: \n", filtered_words, "\n" )
print( "Stemmed sentence: \n", stemmed_words )

Filtered sentence: 
 ['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.'] 

Stemmed sentence: 
 ['on', 'wednesday', ',', 'associ', 'comput', 'machineri', ',', 'world', '’', 'largest', 'societi', 'comput', 'profession', ',', 'announc', 'hinton', ',', 'lecun', 'bengio', 'year', '’', 'ture', 'award', 'work', 'neural', 'network', '.', 'the', 'ture', 'award', ',', 'introduc', '1966', ',', 'often', 'call', 'nobel', 'prize', 'comput', ',', 'includ', '$', '1', 'million', 'prize', ',', 'three', 'scientist', 'share', '.']
