# Import necessary libraries

First, make sure you have the required libraries installed. You might need to install them using pip if you haven't already.

In [10]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

# Fetch and Parse HTML

Use the requests library to fetch the HTML content of a webpage and then use BeautifulSoup to parse it.

In [11]:
url = "https://en.wikipedia.org/wiki/Independence_Day_(India)"  # Replace with the URL of the webpage you want to scrape
response = requests.get(url)
html_content = response.content

soup = BeautifulSoup(html_content, 'html.parser')

# Extract Text Data

Once you have the parsed HTML, extract the relevant text data using various methods such as .find(), .find_all(), and .get_text().

In [12]:
# Example: Extracting all paragraphs
paragraphs = soup.find_all('p')

# Extracting text from each paragraph
paragraph_texts = [paragraph.get_text() for paragraph in paragraphs]

# Text Preprocessing

Text preprocessing involves various steps to clean and normalize the extracted text.

In [13]:
# Convert to lowercase
lowercase_text = [text.lower() for text in paragraph_texts]

# Remove special characters using regex
cleaned_text = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in lowercase_text]

# Tokenization
tokenized_text = [word_tokenize(text) for text in cleaned_text]

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_text = [[word for word in tokens if word not in stop_words] for tokens in tokenized_text]

# Stemming
stemmer = PorterStemmer()
stemmed_text = [[stemmer.stem(word) for word in tokens] for tokens in filtered_text]

# Further Processing

You can perform additional steps such as removing empty tokens, converting the processed text back to sentences or paragraphs, and so on, based on your requirements.

In [14]:
# Removing empty tokens
final_text = [[word for word in tokens if word.strip()] for tokens in stemmed_text]

# Converting tokens back to sentences
sentences = [' '.join(tokens) for tokens in final_text]

# Convert sentences back to paragraphs
processed_paragraphs = '\n\n'.join(sentences)

# Save Processed Text

Finally, you can save the processed text to a file for further analysis.

In [15]:
# saviing into  text file
with open('processed_text.txt', 'w', encoding='utf-8') as file:
    file.write(processed_paragraphs)

print(processed_paragraphs)




independ day celebr annual 15 august public holiday india commemor nation independ unit kingdom 15 august 1947 day provis indian independ act transfer legisl sovereignti indian constitu assembl came effect india retain king georg vi head state transit republ constitut india came effect 26 januari 1950 celebr indian republ day replac dominion prefix dominion india enact sovereign law constitut india india attain independ follow independ movement note larg nonviol resist civil disobedi led indian nation congress leadership mahatma gandhi

independ coincid partit india1 british india divid dominion india pakistan partit accompani violent riot mass casualti displac nearli 15 million peopl due religi violenc 15 august 1947 first prime minist india jawaharl nehru rais indian nation flag lahori gate red fort delhi subsequ independ day incumb prime minist customarili rais flag give address nation2 entir event broadcast doordarshan india nation broadcast usual begin shehnai music ustad bismil