In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import requests
import io
import zipfile

# Download the stopwords
nltk.download('stopwords')

# Load the dataset from the UCI repository
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'
# Bound the request so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of handing an error page to ZipFile (which
# would otherwise surface as a confusing BadZipFile).
r = requests.get(url, timeout=30)
r.raise_for_status()

# The archive is held in memory; read the tab-separated Yelp file (no header
# row) and close both the archive and the member handle once it is parsed.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    with z.open('sentiment labelled sentences/yelp_labelled.txt') as yelp_file:
        df = pd.read_csv(yelp_file, sep='\t', header=None, names=['sentence', 'sentiment'])

# Display the first few rows
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashanthbandari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                            sentence  sentiment
0                           Wow... Loved this place.          1
1                                 Crust is not good.          0
2          Not tasty and the texture was just nasty.          0
3  Stopped by during the late May bank holiday of...          1
4  The selection on the menu was great and so wer...          1


In [3]:
# Data Preprocessing
# Normalise every sentence: drop incomplete rows, lowercase, strip punctuation,
# then remove English stopwords.
stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list.
stop_set = set(stop)

# NOTE(review): NLTK's English stopword list includes negations such as "not"
# and "no", so "Crust is not good." is reduced to "crust good". Confirm that
# discarding these polarity cues is intended for a sentiment task.
df = df.dropna()
df = df.assign(
    sentence=(
        df['sentence']
        .str.lower()
        # regex=True is required: pandas >= 2.0 treats the pattern as a literal
        # string by default, which would leave all punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
        .apply(lambda text: ' '.join(word for word in text.split() if word not in stop_set))
    )
)


In [4]:
# Split the data
# Features are the cleaned sentences and the target is the 0/1 sentiment label.
# 30% of the rows are held out; the fixed seed keeps the split reproducible.
X, y = df['sentence'], df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [5]:
# Feature Extraction
# The vocabulary and IDF weights are learned from the training sentences only;
# the test sentences are projected with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)


In [6]:
# Model Training
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB().fit(X=X_train_vec, y=y_train)
model


In [7]:
# Predictions
# Predict a sentiment label for every held-out sentence.
y_pred = model.predict(X=X_test_vec)


In [8]:
# Evaluation
# Report overall accuracy as a percentage, followed by the per-class
# precision / recall / F1 table for the held-out predictions.
accuracy = accuracy_score(y_test, y_pred)
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    classification_report(y_test, y_pred),
    sep='\n',
)


Accuracy: 77.33%
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       152
           1       0.76      0.78      0.77       148

    accuracy                           0.77       300
   macro avg       0.77      0.77      0.77       300
weighted avg       0.77      0.77      0.77       300

