In [15]:
!pip install pandas numpy nltk scikit-learn PyPDF2




In [16]:
import pandas as pd
import numpy as np
import nltk
import re
import PyPDF2

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [17]:

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:

from google.colab import files
uploaded = files.upload()


Saving NLP PDF.pdf to NLP PDF (1).pdf


In [19]:
sentences = [s.strip() for s in sent_tokenize(text) if len(s.strip()) > 40]
df = pd.DataFrame(sentences, columns=["review"])
df.head()


Unnamed: 0,review
0,ByCharlottePerkinsGilman\nIt isveryseldomthat ...
1,"A colonialmansion, ahereditaryestate, I\nwould..."
2,StillI willproudlydeclarethat thereis\nsomethi...
3,"Else, whyshouldit be\nlet socheaply?Andwhyhave..."
4,"Hehasnopatiencewithfaith, an\nintensehorrorof ..."


In [20]:

def clean_text(raw_text):
    cleaned = re.sub(r'<.*?>', '', raw_text)
    cleaned = cleaned.lower()
    return cleaned


In [21]:

df["review"] = df["review"].apply(clean_text)
df.head()


Unnamed: 0,review
0,bycharlotteperkinsgilman\nit isveryseldomthat ...
1,"a colonialmansion, ahereditaryestate, i\nwould..."
2,stilli willproudlydeclarethat thereis\nsomethi...
3,"else, whyshouldit be\nlet socheaply?andwhyhave..."
4,"hehasnopatiencewithfaith, an\nintensehorrorof ..."


In [22]:

sw_list = stopwords.words("english")

df["review"] = df["review"].apply(
    lambda x: " ".join([word for word in x.split() if word not in sw_list])
)

df.head()


Unnamed: 0,review
0,bycharlotteperkinsgilman isveryseldomthat mere...
1,"colonialmansion, ahereditaryestate, wouldsayah..."
2,stilli willproudlydeclarethat thereis somethin...
3,"else, whyshouldit let socheaply?andwhyhavestoo..."
4,"hehasnopatiencewithfaith, intensehorrorof supe..."


In [23]:

df["sentiment"] = ["positive" if i % 2 == 0 else "negative" for i in range(len(df))]
df.head()


Unnamed: 0,review,sentiment
0,bycharlotteperkinsgilman isveryseldomthat mere...,positive
1,"colonialmansion, ahereditaryestate, wouldsayah...",negative
2,stilli willproudlydeclarethat thereis somethin...,positive
3,"else, whyshouldit let socheaply?andwhyhavestoo...",negative
4,"hehasnopatiencewithfaith, intensehorrorof supe...",positive


In [24]:

df["sentiment"] = ["positive" if i % 2 == 0 else "negative" for i in range(len(df))]
df.head()


Unnamed: 0,review,sentiment
0,bycharlotteperkinsgilman isveryseldomthat mere...,positive
1,"colonialmansion, ahereditaryestate, wouldsayah...",negative
2,stilli willproudlydeclarethat thereis somethin...,positive
3,"else, whyshouldit let socheaply?andwhyhavestoo...",negative
4,"hehasnopatiencewithfaith, intensehorrorof supe...",positive


In [25]:

encoder = LabelEncoder()
y = encoder.fit_transform(df["sentiment"])
x = df["review"]


In [26]:

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


In [27]:

vectorizer = TfidfVectorizer(max_features=5000)
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)


In [28]:

model = MultinomialNB()
model.fit(x_train_vec, y_train)


In [30]:
y_pred = model.predict(x_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.25      1.00      0.40         1
           1       1.00      0.25      0.40         4

    accuracy                           0.40         5
   macro avg       0.62      0.62      0.40         5
weighted avg       0.85      0.40      0.40         5



In [31]:
sample_reviews = [
    "I feel calm and happy staying in this place",
    "The environment makes me anxious and uncomfortable"
]

sample_vec = vectorizer.transform(sample_reviews)
model.predict(sample_vec)


array([0, 0])