In [41]:
!pip install pandas numpy scikit-learn nltk



**Sentiment Analysis**

In [75]:
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [76]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [77]:
df = pd.read_csv('moviereviews.tsv',sep='\t')

In [78]:
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [79]:
print(df["review"].isnull().sum())

35


In [80]:
df = df.dropna(subset=["review"])

In [81]:
df = df.drop_duplicates()

In [82]:
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [83]:
stop_words = set(stopwords.words("english"))

In [84]:
def preprocess(text):
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    return " ".join(tokens)

In [85]:
df["clean_text"] = df["review"].apply(preprocess)

In [86]:
df["label"] = df["label"].map({
    "neg": 0,
    "pos": 1
})

In [87]:
df

Unnamed: 0,label,review,clean_text
0,0,how do films like mouse hunt get into theatres...,films like mouse hunt get theatres isnt law so...
1,0,some talented actresses are blessed with a dem...,talented actresses blessed demonstrated wide a...
2,1,this has been an extraordinary year for austra...,extraordinary year australian films shine scoo...
3,1,according to hollywood movies made in last few...,according hollywood movies made last decades l...
4,0,my first press screening of 1998 and already i...,first press screening 1998 already ive gotten ...
...,...,...,...
1995,1,"i like movies with albert brooks , and i reall...",like movies albert brooks really like movies d...
1996,1,it might surprise some to know that joel and e...,might surprise know joel ethan coen brought un...
1997,1,the verdict : spine-chilling drama from horror...,verdict spinechilling drama horror maestro ste...
1998,1,i want to correct what i wrote in a former ret...,want correct wrote former retrospective david ...


In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [89]:
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]
print(X.shape)

(1940, 46361)


In [91]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [92]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

In [93]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8324742268041238
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       184
           1       0.84      0.84      0.84       204

    accuracy                           0.83       388
   macro avg       0.83      0.83      0.83       388
weighted avg       0.83      0.83      0.83       388



In [94]:
def predict_sentiment(text):
    clean = preprocess(text)
    vector = vectorizer.transform([clean])
    prediction = model.predict(vector)[0]

    if prediction == 1:
        return "Positive 😊"
    else:
        return "Negative 😡"

print(predict_sentiment("I really love this service"))
print(predict_sentiment("This is horrible"))

Positive 😊
Negative 😡


**summarization**

In [95]:
#! pip install sumy

Collecting sumy
  Downloading sumy-0.12.0-py3-none-any.whl.metadata (8.3 kB)
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt-ng>=0.6.1 (from sumy)
  Downloading docopt_ng-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting lxml-html-clean (from sumy)
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-26.2.16-py3-none-any.whl.metadata (12 kB)
Collecting docopt<0.7,>=0.6.1 (from breadability>=0.1.20->sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading sumy-0.12.0-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading docopt_ng-0.9.0-py3-none-any.whl (16 kB)
Downloading pycountry-26.2.16-py3-none-any.whl (8.0 MB)
[2K   [90m━━

In [96]:
import pandas as pd
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [97]:
def summarize_text(text, sentence_count=2):

    if not isinstance(text, str) or len(text.split()) < 20:
        return text

    # Step 1: Parse text
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    # Step 2: Initialize summarizer
    summarizer = TextRankSummarizer()

    # Step 3: Generate summary (select top 3 sentences)
    summary = summarizer(parser.document, sentence_count)

    # Join summary sentences into one string
    return " ".join([str(sentence) for sentence in summary])

In [98]:
df["summary_review"] = df["review"].apply(lambda x: summarize_text(x, 2))

In [99]:
df

Unnamed: 0,label,review,clean_text,summary_review
0,0,how do films like mouse hunt get into theatres...,films like mouse hunt get theatres isnt law so...,mouse hunt takes the bare threads of a plot an...
1,0,some talented actresses are blessed with a dem...,talented actresses blessed demonstrated wide a...,"gloria , directed by respected director sidney..."
2,1,this has been an extraordinary year for austra...,extraordinary year australian films shine scoo...,"to that we can add the gritty "" life "" ( the a..."
3,1,according to hollywood movies made in last few...,according hollywood movies made last decades l...,"after the premiere , fellini rejected any clai..."
4,0,my first press screening of 1998 and already i...,first press screening 1998 already ive gotten ...,"using a ramp , he shoots straight out of the t..."
...,...,...,...,...
1995,1,"i like movies with albert brooks , and i reall...",like movies albert brooks really like movies d...,he comes to the conclusion that since his prob...
1996,1,it might surprise some to know that joel and e...,might surprise know joel ethan coen brought un...,the coens seem much more interested in the plo...
1997,1,the verdict : spine-chilling drama from horror...,verdict spinechilling drama horror maestro ste...,both the script and kathy bates' beautifully m...
1998,1,i want to correct what i wrote in a former ret...,want correct wrote former retrospective david ...,but the british high command sends a few soldi...


In [100]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [101]:
text = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It enables computers to understand, interpret, and generate human language.

NLP is widely used in chatbots, sentiment analysis, machine translation,
text summarization, and many other real-world applications.
"""

**Extract List of Sentences**

In [102]:
sentences = sent_tokenize(text)

print("List of Sentences:\n")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {sentence}")

List of Sentences:

1. 
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
2. It enables computers to understand, interpret, and generate human language.
3. NLP is widely used in chatbots, sentiment analysis, machine translation,
text summarization, and many other real-world applications.


**Extract List of Tokens (Words)**

In [103]:
tokens = word_tokenize(text)

# Remove punctuation tokens
tokens = [word for word in tokens if word.isalnum()]

print("List of Tokens:\n")
print(tokens)

List of Tokens:

['Natural', 'Language', 'Processing', 'NLP', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', 'It', 'enables', 'computers', 'to', 'understand', 'interpret', 'and', 'generate', 'human', 'language', 'NLP', 'is', 'widely', 'used', 'in', 'chatbots', 'sentiment', 'analysis', 'machine', 'translation', 'text', 'summarization', 'and', 'many', 'other', 'applications']


**Count Tokens, Sentences, Paragraphs**

In [104]:
# Count Sentences
sentence_count = len(sentences)

# Count Tokens
token_count = len(tokens)

paragraphs = [p for p in text.split("\n") if p.strip() != ""]
paragraph_count = len(paragraphs)

print("Sentence Count:", sentence_count)
print("Token Count:", token_count)
print("Paragraph Count:", paragraph_count)

Sentence Count: 3
Token Count: 37
Paragraph Count: 4


### 3. Text Preprocessing

In [105]:
!pip install nltk spacy emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [114]:
import nltk
import spacy
import emoji
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [115]:
!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


**A. TOKENIZATION**

In [116]:
text = "Natural Language Processing is amazing! It helps build chatbots."

In [117]:
nltk_tokens = word_tokenize(text)
print("NLTK Tokens:")
print(nltk_tokens)

NLTK Tokens:
['Natural', 'Language', 'Processing', 'is', 'amazing', '!', 'It', 'helps', 'build', 'chatbots', '.']


In [118]:
doc = nlp(text)
spacy_tokens = [token.text for token in doc]

print("spaCy Tokens:")
print(spacy_tokens)

spaCy Tokens:
['Natural', 'Language', 'Processing', 'is', 'amazing', '!', 'It', 'helps', 'build', 'chatbots', '.']


**B. STOPWORDS REMOVAL**

In [119]:
stop_words = set(stopwords.words("english"))

tokens = word_tokenize(text)

filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print("Before Stopword Removal:")
print(tokens)

print("\nAfter Stopword Removal:")
print(filtered_tokens)

Before Stopword Removal:
['Natural', 'Language', 'Processing', 'is', 'amazing', '!', 'It', 'helps', 'build', 'chatbots', '.']

After Stopword Removal:
['Natural', 'Language', 'Processing', 'amazing', '!', 'helps', 'build', 'chatbots', '.']


**C. LEMMATIZATION & STEMMING**

In [120]:
chapter = "Studies studying studied runs running easily fair fairness"

tokens = word_tokenize(chapter)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("Word | Stemmed | Lemmatized")
print("--------------------------------")

for word in tokens:
    stem = stemmer.stem(word)
    lemma = lemmatizer.lemmatize(word)
    print(f"{word} | {stem} | {lemma}")

Word | Stemmed | Lemmatized
--------------------------------
Studies | studi | Studies
studying | studi | studying
studied | studi | studied
runs | run | run
running | run | running
easily | easili | easily
fair | fair | fair
fairness | fair | fairness


**D. Handling punctuation, special characters, emojis**

In [121]:
dirty_text = "Hello!!! This is amazing 😍😍 #NLP @user123 Visit now!!!"

In [122]:
# Remove emojis
clean_text = emoji.replace_emoji(dirty_text, replace='')

# Remove hashtags, mentions, punctuation
clean_text = re.sub(r"[@#]\w+", "", clean_text)
clean_text = re.sub(r"[^\w\s]", "", clean_text)

In [123]:
print("Original Text:")
print(dirty_text)

print("\nCleaned Text:")
print(clean_text)

Original Text:
Hello!!! This is amazing 😍😍 #NLP @user123 Visit now!!!

Cleaned Text:
Hello This is amazing    Visit now


**E. Lowercasing & Normalization**

In [124]:
mixed_text = "  NLP   Is   VERY   Powerful!!!   "

# Lowercase
normalized_text = mixed_text.lower()

# Remove extra spaces
normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

print("Original Text:")
print(mixed_text)

print("\nNormalized Text:")
print(normalized_text)

Original Text:
  NLP   Is   VERY   Powerful!!!   

Normalized Text:
nlp is very powerful!!!


**F. REGEX FOR TEXT CLEANING**

In [125]:
paragraph = """
Contact us at support@example.com or sales123@company.org.
Call 9876543210 for details.
"""

emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]+", paragraph)

print("Extracted Emails:")
print(emails)

Extracted Emails:
['support@example.com', 'sales123@company.org']


In [126]:
no_numbers = re.sub(r"\d+", "", paragraph)

print("Text Without Numbers:")
print(no_numbers)

Text Without Numbers:

Contact us at support@example.com or sales@company.org.
Call  for details.



In [127]:
messy_text = "This    is     NLP     text."

clean_spacing = re.sub(r"\s+", " ", messy_text)

print("After Removing Extra Spaces:")
print(clean_spacing)

After Removing Extra Spaces:
This is NLP text.


### 4. Bag-of-Words (BoW), TF-IDF

In [128]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [129]:
sentences = [
    "I love NLP",
    "I love machine learning",
    "NLP is part of machine learning",
    "I enjoy learning new things"
]

**Bag-of-Words (BoW)**

In [130]:
count_vectorizer = CountVectorizer()

bow_matrix = count_vectorizer.fit_transform(sentences)

# Convert to DataFrame for better visualization
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out()
)

print("Bag-of-Words Matrix:")
bow_df

Bag-of-Words Matrix:


Unnamed: 0,enjoy,is,learning,love,machine,new,nlp,of,part,things
0,0,0,0,1,0,0,1,0,0,0
1,0,0,1,1,1,0,0,0,0,0
2,0,1,1,0,1,0,1,1,1,0
3,1,0,1,0,0,1,0,0,0,1


**TF-IDF**

In [131]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

print("TF-IDF Matrix:")
tfidf_df

TF-IDF Matrix:


Unnamed: 0,enjoy,is,learning,love,machine,new,nlp,of,part,things
0,0.0,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0
1,0.0,0.0,0.496816,0.613667,0.613667,0.0,0.0,0.0,0.0,0.0
2,0.0,0.463709,0.29598,0.0,0.365594,0.0,0.365594,0.463709,0.463709,0.0
3,0.541736,0.0,0.345783,0.0,0.0,0.541736,0.0,0.0,0.0,0.541736


**Compare Word Importance**

In [132]:
print("BoW counts for 'learning':")
print(bow_df["learning"])

print("\nTF-IDF scores for 'learning':")
print(tfidf_df["learning"])

BoW counts for 'learning':
0    0
1    1
2    1
3    1
Name: learning, dtype: int64

TF-IDF scores for 'learning':
0    0.000000
1    0.496816
2    0.295980
3    0.345783
Name: learning, dtype: float64
