In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Natural_language_processing" 
response = requests.get(url)
response.raise_for_status()  # Raise an error if the request fails

soup = BeautifulSoup(response.content, 'html.parser')

# get text from headings:
text_elements = soup.find_all(['p', 'h1', 'h2', 'h3'])
text = ""
for element in text_elements:
    text += element.get_text() + " "

In [None]:
print(len(text))

8637


In [None]:
import re
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Cleaning
clean_text = re.sub(r'[^a-zA-Z\s]', '', text)  

# Normalization
normalized_text = clean_text.lower() 

# Tokenization
tokens = nltk.word_tokenize(normalized_text)

# Stemming 
stemmer =  nltk.stem.PorterStemmer()
tokens = [stemmer.stem(word) for word in tokens]

# Lemmatization 
lemmatizer = nltk.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

# Stop Words Removal
stop_words = set(nltk.corpus.stopwords.words('english'))
filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]


[nltk_data] Downloading package punkt to /Users/noornizar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noornizar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/noornizar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
print(len(filtered_tokens))

763


In [None]:
unique_words = set(filtered_tokens)
print(unique_words) 
print(len(unique_words))

{'three', 'divis', 'languag', 'perspect', 'psycholinguist', 'follow', 'pursu', 'subdivid', 'colleg', 'possibl', 'acl', 'explicit', 'major', 'devi', 'veri', 'semant', 'note', 'conveni', 'rulebas', 'frequent', 'studi', 'extract', 'analyz', 'base', 'categor', 'inform', 'address', 'patient', 'otherwis', 'linksedit', 'karl', 'especi', 'contain', 'speak', 'content', 'idea', 'nuanc', 'introduct', 'given', 'among', 'approach', 'coauthor', 'well', 'million', 'old', 'steadi', 'sequencetosequ', 'becam', 'speech', 'probabilist', 'electron', 'import', 'though', 'realworld', 'wellsummar', 'winter', 'multimod', 'close', 'flurri', 'chine', 'turn', 'variou', 'thought', 'broadli', 'learn', 'abil', 'mikolov', 'discours', 'categori', 'time', 'tool', 'markov', 'howev', 'tag', 'test', 'best', 'appli', 'match', 'find', 'singl', 'partofspeech', 'machin', 'result', 'earliest', 'announc', 'recognit', 'document', 'articl', 'student', 'notion', 'help', 'tasksedit', 'historyedit', 'individu', 'mainstream', 'tree',