In [60]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [61]:
# Download NLTK resources
nltk.download('punkt')  # Download the Punkt tokenizer models
nltk.download('stopwords')  # Download the stopwords corpus
nltk.download('wordnet')  # Download the WordNet lexical database

[nltk_data] Downloading package punkt to /root/nltk_data...

[nltk_data]   Package punkt is already up-to-date!

[nltk_data] Downloading package stopwords to /root/nltk_data...

[nltk_data]   Package stopwords is already up-to-date!

[nltk_data] Downloading package wordnet to /root/nltk_data...

[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
# HTML code
html_code = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Artificial Intelligence</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f0f0f0;
            text-align: center;
        }
        header {
            background-color: #333;
            color: #fff;
            padding: 20px 0;
        }
        h1 {
            margin: 0;
            font-size: 36px;
        }
        p {
            font-size: 18px;
            line-height: 1.6;
            margin-top: 20px;
        }
        img {
            max-width: 100%;
            height: auto;
            margin-top: 20px;
        }
    </style>
</head>
<body>
    <header>
        <h1>Artificial Intelligence</h1>
    </header>
    <main>
        <p>Artificial Intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning (the acquisition of information and rules for using the information), reasoning (using rules to reach approximate or definite conclusions), and self-correction.</p>
        <p>AI is becoming increasingly prevalent in our daily lives, from virtual assistants like Siri and Alexa to recommendation systems on platforms like Netflix and Amazon. It is also used in fields such as healthcare, finance, transportation, and education.</p>
        <p>There are various subfields of AI, including machine learning, natural language processing, computer vision, robotics, and expert systems. Researchers and engineers continue to make advancements in AI technology, pushing the boundaries of what machines can accomplish.</p>
    </main>
</body>
</html>

"""

In [63]:
from bs4 import BeautifulSoup

# Function to preprocess and normalize text
def preprocess_and_normalize(text):
    # Remove HTML tags
    soup = BeautifulSoup(text, 'html.parser')  # Parse the HTML content
    clean_content = soup.get_text()  # Extract text from HTML

    # Remove CSS styles
    clean_content = re.sub(r'<style.*?</style>', '', clean_content, flags=re.DOTALL)  # Remove style tags and their content
    clean_content = re.sub(r'<link.*?>', '', clean_content, flags=re.DOTALL)  # Remove link tags

    # Convert text to lowercase
    text = clean_content.lower()  # Convert text to lowercase
    # Remove punctuation using regular expression
    text = re.sub(r'[^\w\s]', '', text)  # Remove non-alphanumeric characters
    # Tokenize the text into words
    tokens = word_tokenize(text)  # Tokenize the text into words
    # Remove stopwords (common words that don't contribute much to the meaning)
    stop_words = set(stopwords.words('english'))  # Get English stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Filter out stopwords
    # Lemmatize words to their base form
    lemmatizer = WordNetLemmatizer()  # Initialize WordNet lemmatizer
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]  # Lemmatize tokens
    # Join the lemmatized tokens back into a single string
    return ' '.join(lemmatized_tokens)  # Return preprocessed text as a single string


In [64]:
# Print raw HTML text
print("Raw HTML text:")
print(html_code)

Raw HTML text:



<!DOCTYPE html>

<html lang="en">

<head>

    <meta charset="UTF-8">

    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <title>Artificial Intelligence</title>

    <style>

        body {

            font-family: Arial, sans-serif;

            margin: 0;

            padding: 0;

            background-color: #f0f0f0;

            text-align: center;

        }

        header {

            background-color: #333;

            color: #fff;

            padding: 20px 0;

        }

        h1 {

            margin: 0;

            font-size: 36px;

        }

        p {

            font-size: 18px;

            line-height: 1.6;

            margin-top: 20px;

        }

        img {

            max-width: 100%;

            height: auto;

            margin-top: 20px;

        }

    </style>

</head>

<body>

    <header>

        <h1>Artificial Intelligence</h1>

    </header>

    <main>

        <p>Artificial Intelligence (AI)

In [65]:
# Preprocess and normalize HTML code
preprocessed_html = preprocess_and_normalize(html_code)

In [66]:
# Print preprocessed HTML text
print("\nPreprocessed HTML text:")
print(preprocessed_html)



Preprocessed HTML text:

artificial intelligence artificial intelligence artificial intelligence ai simulation human intelligence process machine especially computer system process include learning acquisition information rule using information reasoning using rule reach approximate definite conclusion selfcorrection ai becoming increasingly prevalent daily life virtual assistant like siri alexa recommendation system platform like netflix amazon also used field healthcare finance transportation education various subfields ai including machine learning natural language processing computer vision robotics expert system researcher engineer continue make advancement ai technology pushing boundary machine accomplish


In [67]:

# Tokenize the preprocessed text into words
words = word_tokenize(preprocessed_html)

In [68]:
# Remove duplicates by converting the list to a set
unique_words = set(words)

In [69]:
# Print unique words
print("\nUnique words:")
for word in unique_words:
    print(word)



Unique words:

artificial

include

recommendation

life

accomplish

transportation

using

machine

acquisition

also

boundary

virtual

information

advancement

prevalent

like

including

netflix

engineer

healthcare

siri

system

becoming

make

human

increasingly

daily

used

natural

ai

assistant

platform

reach

vision

learning

reasoning

subfields

finance

technology

process

pushing

robotics

conclusion

especially

processing

field

alexa

approximate

various

selfcorrection

rule

amazon

simulation

language

continue

researcher

definite

computer

education

intelligence

expert
