In [17]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string



### Step 1: Read the Webpage
Here, I use the requests library to obtain the HTML content of the page.
I then print the first 700 characters so we can see what raw HTML looks like.


In [None]:
# dowmloading the nltk datasets (stopwords + wordnet) just in case not already
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanketpatil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sanketpatil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sanketpatil/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Step 2: Remove HTML Tags
Here I use **BeautifulSoup** to strip away all HTML tags (like `<div>`, `<p>`, etc.) and keep only plain readable text.  
I again print the first 700 characters to compare with the raw HTML.


In [None]:
#reading the website content
url = "https://www.cnn.com/2025/06/13/style/why-luxury-brands-are-so-expensive"
response = requests.get(url)         # get request for webpage
html_content = response.text         # raw html text

In [None]:
#to print only the first 700 chars of html to check
print("RAW HTML START:\n")
print(html_content[:700])

RAW HTML START:

  <!DOCTYPE html>
<html lang="en" data-uri="cms.cnn.com/_pages/cmboyzvxs00d626qmalw1heqy@published" data-layout-uri="cms.cnn.com/_layouts/layout-with-rail/instances/style-article-feature-v1@published" >
  <head>
<link rel="dns-prefetch" href="//tpc.googlesyndication.com">

<link rel="preconnect" href="//tpc.googlesyndication.com">

<link rel="dns-prefetch" href="//pagead2.googlesyndication.com">

<link rel="preconnect" href="//pagead2.googlesyndication.com">

<link rel="dns-prefetch" href="//www.googletagservices.com">

<link rel="preconnect" href="//www.googletagservices.com">

<link rel="dns-prefetch" href="//www.google.com">

<link rel="preconnect" href="//www.google.com">

<link rel="dns


In [None]:

#removing the html tags -> using beautifulsoup
soup = BeautifulSoup(html_content, "html.parser")
text = soup.get_text()

print("\nAFTER REMOVING HTML:\n")
print(text[:700])   # print first 700 chars of clean text


AFTER REMOVING HTML:

 

































Luxury brands are more expensive than ever. They’re telling you why they’re worth it | CNN












































































CNN values your feedback




                                                        1. How relevant is this ad to you?
                                                






























                                                2. Did you encounter any technical issues?
                                        











                                                                        Video player was slow to load content
                                              


In [None]:
#Converting the text lowercase and remove punctuation
text = text.lower()   # all small letters
text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)  # remove punctuation

### Step 3: Lowercase & Remove Punctuation
To normalize the text:
- Convert everything to lowercase (so "Luxury" and "luxury" are treated the same).
- Use regex to remove punctuation marks like `. , ! ?`.


In [None]:
# Removal of stopwords
stop_words = set(stopwords.words("english"))  # list of common english words
words = text.split()  # split into tokens
filtered_words = [w for w in words if w not in stop_words]  # keep only important words

### Step 4: Remove Stopwords
Stopwords are common words (like *the, is, and, to*) that don’t add much meaning.  
I use NLTK’s built-in English stopword list and filter them out.


In [None]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
lemm_words = [lemmatizer.lemmatize(w) for w in filtered_words]

#To Print onnly first 50 words to check
print("\nFIRST 50 LEMMATIZED WORDS:\n")
print(lemm_words[:50])


FIRST 50 LEMMATIZED WORDS:

['luxury', 'brand', 'expensive', 'ever', 'they’re', 'telling', 'they’re', 'worth', 'cnn', 'cnn', 'value', 'feedback', '1', 'relevant', 'ad', '2', 'encounter', 'technical', 'issue', 'video', 'player', 'slow', 'load', 'content', 'video', 'content', 'never', 'loaded', 'ad', 'froze', 'finish', 'loading', 'video', 'content', 'start', 'ad', 'audio', 'ad', 'loud', 'issue', 'ad', 'never', 'loaded', 'ad', 'prevented', 'slowed', 'page', 'loading', 'content', 'moved']


### Step 5: Lemmatization
I use **WordNetLemmatizer** to reduce words to their dictionary base form.  
Example: *"running" → "run"*, *"cars" → "car"*.  
This makes the text cleaner for analysis.
