In [1]:
'''Importing the NLTK library'''
import nltk  

In [2]:
'''Importing the Wikipedia API for extracting Wikipedia articles'''
import wikipedia

In [13]:
'''
word_tokenize - function to tokenize a given sentence into a list of words
sent_tokenize - function to tokenize a given paragraph into a list of sentences
stopwords - corpus reader whose words() method returns the list of stopwords for a given language
PorterStemmer - algorithm to find out the root stem of a given word
string - standard-library module whose punctuation constant is used to remove punctuation from the given text
'''
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

In [31]:
def preprocess(wp):
    '''
    Tokenise and normalise the text of a Wikipedia article.

    The article text is split into sentences, ASCII punctuation is stripped
    from every sentence, each sentence is split into words, English stop
    words are dropped (case-insensitively) and the remaining words are
    reduced to their root stem with the Porter algorithm.

    Args:
        wp: Object exposing the article text through a ``content``
            attribute (a WikipediaPage object in this notebook).

    Returns:
        A list with one entry per sentence, in article order. Each entry is
        the list of stemmed, stop-word-free tokens of that sentence and may
        be empty when nothing is left after filtering.
    '''
    # Raw text of the article.
    content = wp.content

    # A single translation table deletes every ASCII punctuation character
    # in one pass per sentence instead of a per-character Python loop.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    # Stored as a set so each membership test is O(1) rather than a scan of
    # the whole stop-word list for every token.
    stop_words = set(stopwords.words("english"))
    porter = PorterStemmer()

    processed = []
    for sentence in sent_tokenize(content):
        words = word_tokenize(sentence.translate(strip_punctuation))
        # The stop-word list is lower-case, so compare on the lower-cased
        # token; otherwise capitalised stop words such as a sentence-initial
        # "The" slip through the filter and end up in the stemmed output.
        kept = [word for word in words if word.lower() not in stop_words]
        processed.append([porter.stem(word) for word in kept])

    return processed
    

In [32]:
'''Fetching the Wikipedia page object for the "Natural language processing" article; despite its name, webPageContent holds the page object (its text is read later through the content attribute), not a string'''
webPageContent = wikipedia.page("Natural_language_processing")

In [33]:
'''Processing the web page: preprocess returns one list of stemmed, stop-word-free tokens per sentence of the article, and the nested list is printed'''
processedText = preprocess(webPageContent)
print(processedText)

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.
['Natural', 'language', 'processing', 'NLP', 'is', 'a', 'subfield', 'of', 'linguistics', 'computer', 'science', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data']
['natur', 'languag', 'process', 'nlp', 'subfield', 'linguist', 'comput', 'scienc', 'artifici', 'intellig', 'concern', 'interact', 'comput', 'human', 'languag', 'particular', 'program', 'comput', 'process', 'analyz', 'larg', 'amount', 'natur', 'languag', 'data']
