In [3]:
import pandas as pd
import string
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [4]:
# Step 2: Download NLTK Data
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tridi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv('sentiment.csv')
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [8]:
df['sentiment'] = encoder.fit_transform(df['sentiment'])

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
def clean_text(text):
    
    text = text.lower()
    words = text.split()
    
    
    cleaned = []
    for word in words:
        word = word.strip(''.join(punctuations))  
        if word.isalpha() and word not in stop_words:
            cleaned.append(word)
    
    return ' '.join(cleaned)


df['clean_text'] = df['review'].apply(clean_text)
df[['review', 'clean_text', 'sentiment']].head()


Unnamed: 0,review,clean_text,sentiment
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching oz episode ho...,1
1,A wonderful little production. <br /><br />The...,wonderful little production br br filming tech...,1
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,basically family little boy jake thinks zombie...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter love time money visually stunning film ...,1


In [11]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = df['sentiment']


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [14]:
y_pred = model.predict(X_test)

print("Accuracy ", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix \n", confusion_matrix(y_test, y_pred))
print("\nClassification Report \n", classification_report(y_test, y_pred))


Accuracy  0.858

Confusion Matrix 
 [[4335  626]
 [ 794 4245]]

Classification Report 
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [15]:
def predict_sentiment(text):
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    result = model.predict(vector)[0]
    return "Positive" if result == 1 else "Negative"

predict_sentiment("bad product")


'Negative'

In [16]:
import pickle

with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print(" Model and vectorizer saved")

 Model and vectorizer saved
