In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from bs4 import BeautifulSoup

In [2]:
# Download the NLTK resources used by the tokenizer, stop-word filter,
# lemmatizer and POS tagger below.
# NLTK >= 3.9 looks the tokenizer and tagger models up under new names
# ('punkt_tab', 'averaged_perceptron_tagger_eng'), so both the legacy and the
# current identifiers are requested; nltk.download() simply reports (and
# returns False for) any identifier it cannot fetch.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

# Initialize the Porter Stemmer and WordNet Lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens
stop_words = set(stopwords.words('english'))

# Sample corpus: ten sentences describing data science
corpus = [
    "Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data.",
    "It integrates various domains such as statistics, machine learning, data mining, and big data analytics.",
    "Data science is essential for making informed decisions in business, healthcare, finance, and many other industries.",
    "It involves data collection, data cleaning, data analysis, and data visualization.",
    "Data scientists use programming languages like Python and R to perform data analysis.",
    "They also employ tools such as Hadoop, Spark, and SQL for handling large datasets.",
    "The ultimate goal of data science is to uncover hidden patterns, correlations, and trends.",
    "By doing so, it helps organizations improve their operations and strategies.",
    "Data science is a rapidly growing field, with increasing demand for skilled professionals.",
    "It is transforming the way we understand and interact with data, leading to more accurate predictions and better decision-making."
]

# Display the corpus
print(corpus)

# Function to preprocess text
def preprocess_text(text):
    """Clean one document and derive several token-level views of it.

    Steps, in order: strip HTML markup, lowercase, delete every character that
    is neither a word character nor whitespace, delete digit runs, tokenize,
    drop tokens found in the module-level ``stop_words`` set, then stem and
    lemmatize the surviving tokens (each derived independently from the
    filtered tokens, not from one another).

    Args:
        text: Raw document string; may contain HTML markup.

    Returns:
        dict with the keys:
            'original': the text after HTML removal only (case, punctuation
                and digits intact).
            'normalized': the lowercased text with punctuation and digits
                removed, i.e. the exact string that was tokenized.
            'tokens': all tokens of the normalized text.
            'filtered_tokens': tokens that are not stop words.
            'stemmed_tokens': stem of each filtered token.
            'lemmatized_tokens': lemma of each filtered token, using the POS
                returned by ``get_wordnet_pos``.
    """
    # Remove HTML tags. This intermediate result is kept so that 'original'
    # really is the HTML-stripped text rather than the fully normalized one.
    html_free_text = BeautifulSoup(text, "html.parser").get_text()

    # Convert to lowercase
    normalized_text = html_free_text.lower()
    # Remove special characters and punctuation
    normalized_text = re.sub(r'[^\w\s]', '', normalized_text)
    # Remove numbers
    normalized_text = re.sub(r'\d+', '', normalized_text)

    # Tokenize
    tokens = word_tokenize(normalized_text)
    # Remove stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in filtered_tokens]
    return {
        'original': html_free_text,
        'normalized': normalized_text,
        'tokens': tokens,
        'filtered_tokens': filtered_tokens,
        'stemmed_tokens': stemmed_tokens,
        'lemmatized_tokens': lemmatized_tokens
    }

# Function to get the part of speech tag for lemmatization
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own (as a one-token sequence) with
    ``nltk.pos_tag`` and only the first letter of the resulting tag is
    inspected: J -> adjective, V -> verb, R -> adverb. Every other tag,
    including N, resolves to the noun constant.
    """
    tagged = nltk.pos_tag([word])
    tag_initial = tagged[0][1][0].upper()

    if tag_initial == "J":
        return wordnet.ADJ
    if tag_initial == "V":
        return wordnet.VERB
    if tag_initial == "R":
        return wordnet.ADV
    # "N" and any unrecognized tag both map to noun
    return wordnet.NOUN

# Process each document in the corpus
for document in corpus:
    result = preprocess_text(document)
    print("Original Document:", document)
    # Each remaining line is "<label> <value stored under that result key>"
    for label, key in (
        ("Text After HTML Removal:", 'original'),
        ("Tokens:", 'tokens'),
        ("Filtered Tokens:", 'filtered_tokens'),
        ("Stemmed Tokens:", 'stemmed_tokens'),
        ("Lemmatized Tokens:", 'lemmatized_tokens'),
    ):
        print(label, result[key])
    # Separator between documents
    print("-" * 50)

[nltk_data] Downloading package punkt to /Users/binodrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data.', 'It integrates various domains such as statistics, machine learning, data mining, and big data analytics.', 'Data science is essential for making informed decisions in business, healthcare, finance, and many other industries.', 'It involves data collection, data cleaning, data analysis, and data visualization.', 'Data scientists use programming languages like Python and R to perform data analysis.', 'They also employ tools such as Hadoop, Spark, and SQL for handling large datasets.', 'The ultimate goal of data science is to uncover hidden patterns, correlations, and trends.', 'By doing so, it helps organizations improve their operations and strategies.', 'Data science is a rapidly growing field, with increasing demand for skilled professionals.', 'It is transforming the way we understand and interact wit

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample statements about data science
data = {
    "Statement": [
        "Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data.",
        "It integrates various domains such as statistics, machine learning, data mining, and big data analytics.",
        "Data science is essential for making informed decisions in business, healthcare, finance, and many other industries.",
        "It involves data collection, data cleaning, data analysis, and data visualization.",
        "Data scientists use programming languages like Python and R to perform data analysis.",
        "They also employ tools such as Hadoop, Spark, and SQL for handling large datasets.",
        "The ultimate goal of data science is to uncover hidden patterns, correlations, and trends.",
        "By doing so, it helps organizations improve their operations and strategies.",
        "Data science is a rapidly growing field, with increasing demand for skilled professionals.",
        "It is transforming the way we understand and interact with data, leading to more accurate predictions and better decision-making.",
    ]
}

# Build the DataFrame, then lowercase the statement column
df = pd.DataFrame(data)
df['Statement'] = df['Statement'].str.lower()

# The three encoders: binary presence ("one-hot"), raw counts, and TF-IDF weights
one_hot_encoder = CountVectorizer(binary=True)
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Fit each encoder on the statements and keep the encoded matrix
one_hot_encoded = one_hot_encoder.fit_transform(df['Statement'])
bow_encoded = count_vectorizer.fit_transform(df['Statement'])
tfidf_encoded = tfidf_vectorizer.fit_transform(df['Statement'])

# Output results: the statements first, then matrix + vocabulary per encoder
print("Original Statement:\n", df['Statement'])
for heading, vectorizer, matrix in (
    ("\nOne-Hot Encoded:\n", one_hot_encoder, one_hot_encoded),
    ("\nBag of Words:\n", count_vectorizer, bow_encoded),
    ("\nTF-IDF:\n", tfidf_vectorizer, tfidf_encoded),
):
    print(heading, matrix.toarray())
    print("Vocabulary:", vectorizer.get_feature_names_out())

Original Statement:
 0    data science is an interdisciplinary field tha...
1    it integrates various domains such as statisti...
2    data science is essential for making informed ...
3    it involves data collection, data cleaning, da...
4    data scientists use programming languages like...
5    they also employ tools such as hadoop, spark, ...
6    the ultimate goal of data science is to uncove...
7    by doing so, it helps organizations improve th...
8    data science is a rapidly growing field, with ...
9    it is transforming the way we understand and i...
Name: Statement, dtype: object

One-Hot Encoded:
 [[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [1 0 0 ... 1 1 1]]
Vocabulary: ['accurate' 'algorithms' 'also' 'an' 'analysis' 'analytics' 'and' 'as'
 'better' 'big' 'business' 'by' 'cleaning' 'collection' 'correlations'
 'data' 'datasets' 'decision' 'decisions' 'demand' 'doing' 'domains'
 'employ' 'essential' 'extract' 'fie

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample statements about data science
data = {
    "Statement": [
        "Data science is an interdisciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data.",
        "It integrates various domains such as statistics, machine learning, data mining, and big data analytics.",
        "Data science is essential for making informed decisions in business, healthcare, finance, and many other industries.",
        "It involves data collection, data cleaning, data analysis, and data visualization.",
        "Data scientists use programming languages like Python and R to perform data analysis.",
        "They also employ tools such as Hadoop, Spark, and SQL for handling large datasets.",
        "The ultimate goal of data science is to uncover hidden patterns, correlations, and trends.",
        "By doing so, it helps organizations improve their operations and strategies.",
        "Data science is a rapidly growing field, with increasing demand for skilled professionals.",
        "It is transforming the way we understand and interact with data, leading to more accurate predictions and better decision-making.",
    ]
}

# Build the DataFrame, then lowercase the statement column
df = pd.DataFrame(data)
df['Statement'] = df['Statement'].str.lower()


def _encode_to_frame(vectorizer, texts):
    """Fit `vectorizer` on `texts`; return the encoded matrix and a DataFrame of it.

    The DataFrame holds the dense form of the matrix, one column per
    vocabulary term reported by the fitted vectorizer.
    """
    matrix = vectorizer.fit_transform(texts)
    frame = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
    return matrix, frame


# One-Hot Encoding (binary term presence)
one_hot_encoder = CountVectorizer(binary=True)
one_hot_encoded, one_hot_df = _encode_to_frame(one_hot_encoder, df['Statement'])

# Bag of Words (raw term counts)
count_vectorizer = CountVectorizer()
bow_encoded, bow_df = _encode_to_frame(count_vectorizer, df['Statement'])

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_encoded, tfidf_df = _encode_to_frame(tfidf_vectorizer, df['Statement'])

# Output results: each heading followed by the object it describes
for heading, table in (
    ("Original Statement:\n", df['Statement']),
    ("\nOne-Hot Encoded:\n", one_hot_df),
    ("\nBag of Words:\n", bow_df),
    ("\nTF-IDF:\n", tfidf_df),
):
    print(heading, table)


Original Statement:
 0    data science is an interdisciplinary field tha...
1    it integrates various domains such as statisti...
2    data science is essential for making informed ...
3    it involves data collection, data cleaning, da...
4    data scientists use programming languages like...
5    they also employ tools such as hadoop, spark, ...
6    the ultimate goal of data science is to uncove...
7    by doing so, it helps organizations improve th...
8    data science is a rapidly growing field, with ...
9    it is transforming the way we understand and i...
Name: Statement, dtype: object

One-Hot Encoded:
    accurate  algorithms  also  an  analysis  analytics  and  as  better  big  \
0         0           1     0   1         0          0    1   0       0    0   
1         0           0     0   0         0          1    1   1       0    1   
2         0           0     0   0         0          0    1   0       0    0   
3         0           0     0   0         1          0    1

In [5]:
from IPython.display import display

# Show each table with rich display, preceded by its plain-text heading
for heading, table in (
    ("Original Statement:\n", df['Statement']),
    ("\nOne-Hot Encoded:\n", one_hot_df),
    ("\nBag of Words:\n", bow_df),
    ("\nTF-IDF:\n", tfidf_df),
):
    print(heading)
    display(table)


Original Statement:



0    data science is an interdisciplinary field tha...
1    it integrates various domains such as statisti...
2    data science is essential for making informed ...
3    it involves data collection, data cleaning, da...
4    data scientists use programming languages like...
5    they also employ tools such as hadoop, spark, ...
6    the ultimate goal of data science is to uncove...
7    by doing so, it helps organizations improve th...
8    data science is a rapidly growing field, with ...
9    it is transforming the way we understand and i...
Name: Statement, dtype: object


One-Hot Encoded:



Unnamed: 0,accurate,algorithms,also,an,analysis,analytics,and,as,better,big,...,uncover,understand,unstructured,use,uses,various,visualization,way,we,with
0,0,1,0,1,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,1,0,0,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,1,1,1



Bag of Words:



Unnamed: 0,accurate,algorithms,also,an,analysis,analytics,and,as,better,big,...,uncover,understand,unstructured,use,uses,various,visualization,way,we,with
0,0,1,0,1,0,0,3,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,1,0,0,0,0,0,2,0,1,0,...,0,1,0,0,0,0,0,1,1,1



TF-IDF:



Unnamed: 0,accurate,algorithms,also,an,analysis,analytics,and,as,better,big,...,uncover,understand,unstructured,use,uses,various,visualization,way,we,with
0,0.0,0.228135,0.0,0.228135,0.0,0.0,0.277156,0.0,0.0,0.0,...,0.0,0.0,0.228135,0.0,0.228135,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.290683,0.117715,0.247107,0.0,0.290683,...,0.0,0.0,0.0,0.0,0.0,0.290683,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.114877,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.291978,0.0,0.13909,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.343466,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.281615,0.0,0.134153,0.0,0.0,0.0,...,0.0,0.0,0.0,0.331277,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.286741,0.0,0.0,0.0,0.116118,0.243756,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.126114,0.0,0.0,0.0,...,0.311426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.130691,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283595
9,0.25609,0.0,0.0,0.0,0.0,0.0,0.207412,0.0,0.25609,0.0,...,0.0,0.25609,0.0,0.0,0.0,0.0,0.0,0.25609,0.25609,0.2177
