In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Natural_language_processing" 
response = requests.get(url)
response.raise_for_status()  # Raise an error if the request fails

soup = BeautifulSoup(response.content, 'html.parser')

# get text from headings:
text_elements = soup.find_all(['p', 'h1', 'h2', 'h3'])
text = ""
for element in text_elements:
    text += element.get_text() + " "

In [2]:
import re
import nltk 
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Cleaning
clean_text = re.sub(r'[^a-zA-Z\s]', '', text)  

# Normalization
normalized_text = clean_text.lower() 

# Tokenization
tokens = nltk.word_tokenize(normalized_text)

# Lemmatization 
lemmatizer = nltk.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

# Stop Words Removal
stop_words = set(nltk.corpus.stopwords.words('english'))
filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]


[nltk_data] Downloading package punkt to /Users/noornizar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/noornizar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/noornizar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
unique_words = set(filtered_tokens)
print(unique_words) 


{'increase', 'principle', 'problem', 'inherent', 'extract', 'wa', 'decision', 'handwritten', 'external', 'syntactic', 'lessening', 'modeling', 'obsolete', 'machine', 'turn', 'corpus', 'sedit', 'functional', 'development', 'serve', 'document', 'along', 'aid', 'capture', 'concerned', 'one', 'following', 'approachedit', 'artificial', 'theoretical', 'analyze', 'bengio', 'seeking', 'revived', 'became', 'text', 'field', 'acquiring', 'question', 'machinelearning', 'test', 'earliest', 'several', 'machinery', 'increasingly', 'analysisedit', 'linguistics', 'interdisciplinary', 'early', 'notion', 'cognitionedit', 'towards', 'word', 'accurately', 'patient', 'involve', 'would', 'operationalizable', 'length', 'article', 'large', 'sentencesedit', 'ended', 'major', 'time', 'organize', 'tasksedit', 'discouraged', 'processing', 'possible', 'experience', 'theory', 'strong', 'including', 'due', 'period', 'mids', 'directionsedit', 'medicine', 'context', 'language', 'million', 'alignment', 'require', 'impor