In [14]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import requests

# nltk.download('punkt')
# nltk.download('stopwords')

def extract_text_from_html(html_content):
    """Return the text content of an HTML document, or None on failure.

    The markup is parsed with BeautifulSoup's built-in ``html.parser`` and
    the result of ``get_text()`` is returned unchanged.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The extracted text, or None if parsing or extraction raised; in that
        case the error is printed to stdout instead of being propagated.
    """
    try:
        return BeautifulSoup(html_content, 'html.parser').get_text()
    except Exception as err:
        # Best-effort: report the problem and let the caller test for None.
        print("An error occurred while extracting text:", str(err))
    return None

def preprocess_text(text):
    """Normalize text into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML markup, lowercase, tokenize with NLTK's
    ``word_tokenize``, keep only alphanumeric tokens, reduce them to ASCII
    letters and digits, drop English stop words, and apply Porter stemming.

    Args:
        text: Text (possibly still containing HTML markup) to preprocess.

    Returns:
        The surviving stemmed tokens joined by single spaces. Tokens that
        contain no ASCII letters or digits are dropped entirely.

    Note:
        ``word_tokenize`` and ``stopwords`` need NLTK data to be available
        (the file's commented-out ``nltk.download`` calls name ``punkt`` and
        ``stopwords``).
    """
    # Clean HTML tags
    cleaned_text = BeautifulSoup(text, "html.parser").get_text()

    # Normalize text (convert to lowercase)
    normalized_text = cleaned_text.lower()

    # Tokenize text
    tokens = word_tokenize(normalized_text)

    # Remove punctuation and non-alphanumeric characters.
    # str.isalnum() is Unicode-aware, so a token made only of non-ASCII
    # letters passes the filter and is then reduced to '' by the ASCII-only
    # substitution. Drop those empty tokens so they do not show up as
    # repeated spaces in the joined result.
    stripped_tokens = (
        re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens if token.isalnum()
    )
    tokens = [token for token in stripped_tokens if token]

    # Remove stop words (a set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # Convert tokens back to text
    return ' '.join(stemmed_tokens)

def get_unique_words(preprocessed_text):
    """Return the set of distinct whitespace-separated words in the text.

    Args:
        preprocessed_text: String whose words are separated by whitespace.

    Returns:
        A set containing each word once; empty for empty or blank input.
    """
    return set(preprocessed_text.split())

def get_words_less_than3(preprocessed_text):
    """Return the words shorter than three characters, in original order.

    Args:
        preprocessed_text: String whose words are separated by whitespace.

    Returns:
        A list of every word of length one or two, duplicates included.
    """
    return [piece for piece in preprocessed_text.split() if len(piece) <= 2]


# Example usage: fetch a page, preprocess its text and print some statistics.
url = "https://www.wikipedia.org/"
try:
    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.get(url, timeout=10)
    # Treat HTTP error statuses as failures rather than processing an error page.
    response.raise_for_status()
    html_content = response.text
except requests.RequestException as e:
    print("An error occurred while fetching", url + ":", str(e))
    html_content = None

# Skip extraction when the download failed; the else branch reports it.
text_content = extract_text_from_html(html_content) if html_content is not None else None
if text_content:
    preprocessed_text = preprocess_text(text_content)

    print("preprocessed_text:", preprocessed_text)
    unique_words = get_unique_words(preprocessed_text)
    print("Unique words:", unique_words)

    print("words less than 3",get_words_less_than3(preprocessed_text))

else:
    print("Failed to extract text from", url)


preprocessed_text: wikipedia wikipedia free encyclopedia english articl  1 967 deutsch artikel articl italiano voci  portugu artigo search wikipedia afrikaan polski asturianu catal cymraeg dansk deutsch eesti  english esperanto euskara galego hrvatski bahasa indonesia italiano ladin latina lietuvi magyar  bahasa melayu bahaso minangkabau nederland norsk norsk nynorsk zbekcha portugu simpl english sinugboanong binisaya srpski srpskohrvatski suomi svenska vi winaray  search read wikipedia languag articl polski deutsch english italiano  nederland portugu sinugboanong binisaya svenska vi winaray  articl afrikaan asturianu catal cymraeg dansk eesti  esperanto euskara galego hrvatski bahasa indonesia ladin latina lietuvi magyar bahasa melayu bahaso minangkabau zbekcha simpl english srpski srpskohrvatski suomi articl bahsa alemannisch bahasa hulontalo basa bali bahasa banjar basa banyumasan bikol central boarisch bosanski brezhoneg bizaad emigli fiji hindi frysk gaeilg g idhlig hausa hornjose