In [None]:
# Intro to NLP – Sentiment Analysis using Movie Reviews
import random

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.corpus import movie_reviews

# Make sure the movie_reviews corpus is available locally before reading it.
nltk.download('movie_reviews')

# Feature set: one (word-presence dict, category) pair per review.
# Each dict maps every distinct word of the review to True; the pairs are
# produced category by category, in the order the corpus reader lists them.
features = [
    (dict.fromkeys(movie_reviews.words(review_id), True), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

# Split data
# `features` is grouped by category (it is built with an outer loop over the
# categories), so slicing it directly would leave the held-out tail made up of
# a single class and the accuracy below would not measure anything useful.
# Shuffle a copy with a fixed seed first so both splits mix the classes and
# the result is reproducible on every run.
SEED = 42
TRAIN_SIZE = 1500

shuffled_features = list(features)  # copy: keep `features` in corpus order
random.Random(SEED).shuffle(shuffled_features)
train, test = shuffled_features[:TRAIN_SIZE], shuffled_features[TRAIN_SIZE:]

# Show sample input features
# Only a short preview is printed; a single review holds hundreds of words and
# dumping the whole dict buries the rest of the cell output.
PREVIEW_SIZE = 20
sample_features, sample_label = train[0]
print("Sample Input Features (first review):")
print(dict(list(sample_features.items())[:PREVIEW_SIZE]))
print(f"... ({len(sample_features)} features in total)")
print("Actual Label:", sample_label)

# Train classifier
classifier = NaiveBayesClassifier.train(train)

# Test accuracy
print("Model Accuracy:", accuracy(classifier, test))



# Predict sentiment for a custom review
custom_review = "The movie was interesting and i enjoyed watchign it."

# The training features are lower-case corpus tokens with punctuation split
# off as separate tokens. A plain str.split() would yield "The" and "it.",
# which never match a training feature and are therefore ignored by the
# classifier. Lower-case the text and split words from punctuation so the
# review is tokenised like the training data.
# NOTE(review): assumes movie_reviews.words() uses NLTK's default
# WordPunctTokenizer - confirm against the corpus reader.
custom_tokens = nltk.wordpunct_tokenize(custom_review.lower())
custom_features = dict.fromkeys(custom_tokens, True)

prediction = classifier.classify(custom_features)
print("-" * 60)
print("Custom Review:", custom_review)
print("Predicted Sentiment:", prediction)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Sample Input Features (first review):
{'plot': True, ':': True, 'two': True, 'teen': True, 'couples': True, 'go': True, 'to': True, 'a': True, 'church': True, 'party': True, ',': True, 'drink': True, 'and': True, 'then': True, 'drive': True, '.': True, 'they': True, 'get': True, 'into': True, 'an': True, 'accident': True, 'one': True, 'of': True, 'the': True, 'guys': True, 'dies': True, 'but': True, 'his': True, 'girlfriend': True, 'continues': True, 'see': True, 'him': True, 'in': True, 'her': True, 'life': True, 'has': True, 'nightmares': True, 'what': True, "'": True, 's': True, 'deal': True, '?': True, 'watch': True, 'movie': True, '"': True, 'sorta': True, 'find': True, 'out': True, 'critique': True, 'mind': True, '-': True, 'fuck': True, 'for': True, 'generation': True, 'that': True, 'touches': True, 'on': True, 'very': True, 'cool': True, 'idea': True, 'presents': True, 'it': True, 'bad': True, 'package': True, 'which': True, 'is': True, 'makes': True, 'this': True, 'review': Tr

In [None]:
# text summarisation
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# sumy's Tokenizer("english") relies on NLTK's Punkt sentence-tokenizer data.
# The rest of the notebook only downloads it in a later cell, so fetch it here
# to keep this cell runnable on a fresh kernel / fresh machine.
# "punkt_tab" is the resource name used by newer NLTK releases; requesting
# both covers either version (an already-present package is not re-fetched).
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

# Sample passage to summarise.
text = """
Natural Language Processing (NLP) is a subfield of Artificial Intelligence that
focuses on enabling computers to understand, interpret, and generate human language.
It combines linguistics, computer science, and machine learning techniques.
NLP is widely used in real-world applications such as chatbots, virtual assistants,
search engines, sentiment analysis, and language translation systems.
With the rapid growth of data, NLP plays an important role in extracting meaningful
information from large volumes of text efficiently.
"""

# Parse the raw string into a sumy document, then rank sentences with LSA.
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LsaSummarizer()

# Number of sentences to keep in the summary.
SUMMARY_SENTENCES = 2

for sentence in summarizer(parser.document, SUMMARY_SENTENCES):
    print(sentence)


It combines linguistics, computer science, and machine learning techniques.
NLP is widely used in real-world applications such as chatbots, virtual assistants, search engines, sentiment analysis, and language translation systems.


In [None]:
#2. Text Data Basics

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Tokenizer data for sent_tokenize / word_tokenize.
# Older NLTK releases load the pickled "punkt" models, while NLTK >= 3.8.2
# looks for "punkt_tab" instead; request both so the cell works on either.
nltk.download('punkt')
nltk.download('punkt_tab')

def text_info(text):
    """Return basic structural statistics for a piece of text.

    Parameters
    ----------
    text : str
        The raw text to analyse.

    Returns
    -------
    tuple
        ``(sentences, tokens, token_count, sentence_count, paragraph_count)``
        where ``sentences`` is the result of ``sent_tokenize(text)``,
        ``tokens`` is the result of ``word_tokenize(text)``, the two counts
        are their lengths, and ``paragraph_count`` is the number of non-blank
        newline-delimited lines in ``text``.
    """
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    # A paragraph is a newline-delimited line with visible content. Counting
    # raw "\n" characters reported 1 paragraph for empty text and counted
    # blank separator lines and trailing newlines as extra paragraphs.
    paragraphs = sum(1 for line in text.split("\n") if line.strip())
    return sentences, tokens, len(tokens), len(sentences), paragraphs


# Three-line sample passage used to exercise text_info().
text = """Natural Language Processing is a branch of Artificial Intelligence.
It helps computers understand human language.
NLP is used in chatbots, search engines, and voice assistants."""

sentences, tokens, token_count, sentence_count, para_count = text_info(text)

# Report every statistic as a "label value" line, in a fixed order.
report = (
    ("Sentences:", sentences),
    ("Tokens:", tokens),
    ("Token count:", token_count),
    ("Sentence count:", sentence_count),
    ("Paragraph count:", para_count),
)
for label, value in report:
    print(label, value)



Sentences: ['Natural Language Processing is a branch of Artificial Intelligence.', 'It helps computers understand human language.', 'NLP is used in chatbots, search engines, and voice assistants.']
Tokens: ['Natural', 'Language', 'Processing', 'is', 'a', 'branch', 'of', 'Artificial', 'Intelligence', '.', 'It', 'helps', 'computers', 'understand', 'human', 'language', '.', 'NLP', 'is', 'used', 'in', 'chatbots', ',', 'search', 'engines', ',', 'and', 'voice', 'assistants', '.']
Token count: 30
Sentence count: 3
Paragraph count: 3


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import nltk, re, emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# NLTK data used by this cell: tokenizer models, stop-word list, WordNet.
# Older NLTK releases load the pickled "punkt" models, while NLTK >= 3.8.2
# looks for "punkt_tab" instead; request both so word_tokenize works on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text with mixed case, punctuation, a mention, hashtags, e-mail
# addresses, digits and emoji, so that each cleaning step below has something
# to act on. The emoji are written as real Unicode characters: a mis-decoded
# byte sequence would consist of ordinary letters/symbols and the emoji
# removal step would have nothing to remove.
text = """
Natural Language Processing (NLP) is AMAZING!!!
Researchers at OpenAI are studying NLP techniques @OpenAI #AI #NLP.
Emails like contact@nlp.org and admin@ai.com are often used.
In 2024, NLP systems are running very fast 😊😊!!!
"""

print("\nOriginal Text:\n", text)

# A: Tokenization of the raw, uncleaned text with NLTK's word_tokenize
nltk_tokens = word_tokenize(text)
print("\nNLTK Tokens:\n", nltk_tokens)

# E: Lowercase and normalize
text = text.lower()
text = re.sub(r"\s+", " ", text)
print("\nNormalized Text:\n", text)

# F (part 1): Extract e-mail addresses
# This has to happen before special characters are stripped: the cleaning
# regex below deletes "@word" and every "." so no address survives it, and
# searching afterwards always produced an empty list. The addresses are also
# removed from the text so their fragments (e.g. "contactorg") do not leak
# into the token list.
EMAIL_PATTERN = r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+"
emails = re.findall(EMAIL_PATTERN, text)
text = re.sub(EMAIL_PATTERN, "", text)

# D: Remove special characters and emojis
# Emoji go first; run after the punctuation regex this step had nothing left
# to do because [^\w\s] already swallows emoji code points.
text = emoji.replace_emoji(text, replace="")
text = re.sub(r"[@#]\w+|[^\w\s]", "", text)
print("\nCleaned Text:\n", text)

# F (part 2): Regex cleaning - drop digits, then collapse leftover whitespace
text = re.sub(r"\d+", "", text)
text = re.sub(r"\s+", " ", text)

print("\nExtracted Emails:", emails)
print("\nText after Regex:\n", text)

# Tokenization of the cleaned text
tokens = word_tokenize(text)

# B: Stopwords removal - keep only tokens absent from NLTK's English list
stop_words = set(stopwords.words("english"))
filtered_tokens = [token for token in tokens if token not in stop_words]

print("\nBefore Stopwords:\n", tokens)
print("\nAfter Stopwords:\n", filtered_tokens)

# C: Stemming and Lemmatization - both applied to the filtered tokens
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_words = list(map(stemmer.stem, filtered_tokens))
print("\nStemmed Words:\n", stemmed_words)

lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))
print("\nLemmatized Words:\n", lemmatized_words)



Original Text:
 
Natural Language Processing (NLP) is AMAZING!!!
Researchers at OpenAI are studying NLP techniques @OpenAI #AI #NLP.
Emails like contact@nlp.org and admin@ai.com are often used.
In 2024, NLP systems are running very fast ðŸ˜ŠðŸ˜Š!!!


NLTK Tokens:
 ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'AMAZING', '!', '!', '!', 'Researchers', 'at', 'OpenAI', 'are', 'studying', 'NLP', 'techniques', '@', 'OpenAI', '#', 'AI', '#', 'NLP', '.', 'Emails', 'like', 'contact', '@', 'nlp.org', 'and', 'admin', '@', 'ai.com', 'are', 'often', 'used', '.', 'In', '2024', ',', 'NLP', 'systems', 'are', 'running', 'very', 'fast', 'ðŸ˜ŠðŸ˜Š', '!', '!', '!']

Normalized Text:
  natural language processing (nlp) is amazing!!! researchers at openai are studying nlp techniques @openai #ai #nlp. emails like contact@nlp.org and admin@ai.com are often used. in 2024, nlp systems are running very fast ðŸ˜ŠðŸ˜Š!!! 

Cleaned Text:
  natural language processing nlp is amazing researchers at op

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# 4. Bag-of-Words & TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Tiny corpus shared by both vectorizers.
sentences = [
    "Natural language processing is interesting",
    "NLP is very useful for text analysis",
    "Text analysis is an important part of NLP"
]

# Bag of Words: raw term counts per sentence
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(sentences)

# TF-IDF: term weights per sentence
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Reporting only reads the fitted objects above.
print("Sentences:")
print("\n".join(f"- {sentence}" for sentence in sentences))

# Each section is printed as a heading line followed by its payload.
sections = (
    ("\nBoW Vocabulary:", bow_vectorizer.vocabulary_),
    ("\nBoW Matrix:", bow_matrix.toarray()),
    ("\nTF-IDF Matrix:", tfidf_matrix.toarray()),
)
for heading, payload in sections:
    print(heading)
    print(payload)


Sentences:
- Natural language processing is interesting
- NLP is very useful for text analysis
- Text analysis is an important part of NLP

BoW Vocabulary:
{'natural': 7, 'language': 6, 'processing': 11, 'is': 5, 'interesting': 4, 'nlp': 8, 'very': 14, 'useful': 13, 'for': 2, 'text': 12, 'analysis': 1, 'an': 0, 'important': 3, 'part': 10, 'of': 9}

BoW Matrix:
[[0 0 0 0 1 1 1 1 0 0 0 1 0 0 0]
 [0 1 1 0 0 1 0 0 1 0 0 0 1 1 1]
 [1 1 0 1 0 1 0 0 1 1 1 0 1 0 0]]

TF-IDF Matrix:
[[0.         0.         0.         0.         0.47952794 0.28321692
  0.47952794 0.47952794 0.         0.         0.         0.47952794
  0.         0.         0.        ]
 [0.         0.33729513 0.44350256 0.         0.         0.26193976
  0.         0.         0.33729513 0.         0.         0.
  0.33729513 0.44350256 0.44350256]
 [0.40541935 0.30833187 0.         0.40541935 0.         0.2394472
  0.         0.         0.30833187 0.40541935 0.40541935 0.
  0.30833187 0.         0.        ]]
