In [None]:

!pip install nltk scikit-learn

import nltk

# Fetch the NLTK resources this cell relies on: the Punkt tokenizer data
# (both package names) and the stop-word lists.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# Toy dataset: five short movie reviews, one string per document.
corpus = [
    "I loved the movie! It was fantastic and thrilling.",
    "The film was terrible, I hated every moment of it.",
    "What a great experience, the actors did a wonderful job.",
    "I would not recommend this movie to anyone.",
    "An excellent and inspiring story, truly enjoyed it!",
]

# Show the raw documents with 1-based numbering.
print("Original Dataset (Corpus):")
for position, review in enumerate(corpus, start=1):
    print(f"{position}. {review}")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))


def preprocess(text, keep_words=()):
    """Lowercase, tokenize and filter *text* into a space-joined string.

    A token survives only when it is purely alphabetic (which discards
    punctuation and numbers) and is not an English stop word.

    Args:
        text: Raw document string.
        keep_words: Optional word, or iterable of words, to retain even if
            they are stop words. The stop-word filter removes negations
            such as "not", so "I would not recommend this movie" loses its
            negation by default; pass ``keep_words=("not",)`` to preserve
            it. The default keeps nothing extra, so existing callers get
            exactly the same output as before.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    if isinstance(keep_words, str):
        # A bare string would otherwise be iterated character by character.
        keep_words = (keep_words,)
    # Tokens are lowercased below, so compare against lowercased exemptions.
    exempt = {word.lower() for word in keep_words}

    tokens = word_tokenize(text.lower())
    tokens = [
        w for w in tokens
        if w.isalpha() and (w in exempt or w not in stop_words)
    ]
    return " ".join(tokens)

# Clean every document with preprocess(), keeping the corpus order.
processed_corpus = list(map(preprocess, corpus))

# Show the cleaned documents with 1-based numbering.
print("\nPreprocessed Corpus:")
for position, cleaned in enumerate(processed_corpus, start=1):
    print(f"{position}. {cleaned}")

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words: fit a CountVectorizer on the cleaned documents and build the
# document-term matrix in one step.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(processed_corpus)

# Pull out the pieces to display: the fitted feature names and the matrix
# converted to a plain array for printing.
bow_vocabulary = bow_vectorizer.get_feature_names_out()
bow_dense = bow_matrix.toarray()

print("\nBag-of-Words Vocabulary:")
print(bow_vocabulary)

print("\nBag-of-Words Representation (Document-Term Matrix):")
print(bow_dense)

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: fit a TfidfVectorizer on the same cleaned documents and build the
# document-term matrix in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

# Pull out the pieces to display: the fitted feature names and the matrix
# converted to a plain array for printing.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
tfidf_dense = tfidf_matrix.toarray()

print("\nTF-IDF Vocabulary:")
print(tfidf_vocabulary)

print("\nTF-IDF Representation (Document-Term Matrix):")
print(tfidf_dense)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Dataset (Corpus):
1. I loved the movie! It was fantastic and thrilling.
2. The film was terrible, I hated every moment of it.
3. What a great experience, the actors did a wonderful job.
4. I would not recommend this movie to anyone.
5. An excellent and inspiring story, truly enjoyed it!

Preprocessed Corpus:
1. loved movie fantastic thrilling
2. film terrible hated every moment
3. great experience actors wonderful job
4. would recommend movie anyone
5. excellent inspiring story truly enjoyed

Bag-of-Words Vocabulary:
['actors' 'anyone' 'enjoyed' 'every' 'excellent' 'experience' 'fantastic'
 'film' 'great' 'hated' 'inspiring' 'job' 'loved' 'moment' 'movie'
 'recommend' 'story' 'terrible' 'thrilling' 'truly' 'wonderful' 'would']

Bag-of-Words Representation (Document-Term Matrix):
[[0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0]
 [1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1]
 [0 0 1 0 1 0

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
