In [8]:
pip install pandas numpy scikit-learn nltk

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 4.4 MB/s eta 0:00:01
[?25hCollecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.8 MB/s eta 0:00:01
Collecting scipy>=1.5.0
  Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[K     |████████████████████████████████| 301 kB 7.8 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 2.2 MB/s eta 0:00:011
Collecting regex>=2021.8.3
  Downloading regex-2024.11.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (785 kB)
[K     |█████████████████████████

In [1]:
import pandas as pd
import numpy as np
import re


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /home/admin1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
df = pd.read_csv("Downloads/data.csv")

In [9]:
df = df[['review', 'rating']]
df.dropna(inplace=True)

print(df.head())

                                              review  rating
0  It was nice produt. I like it's design a lot. ...       5
1  awesome sound....very pretty to see this nd th...       5
2  awesome sound quality. pros 7-8 hrs of battery...       4
3  I think it is such a good product not only as ...       5
4  awesome bass sound quality very good bettary l...       5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


In [10]:
def rating_to_sentiment(rating):
    if rating >= 4:
        return "positive"
    elif rating <= 2:
        return "negative"
    else:
        return "neutral"

df['sentiment'] = df['rating'].apply(rating_to_sentiment)


In [12]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['clean_review'] = df['review'].apply(clean_text)


In [13]:
X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [14]:
vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [15]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [16]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.8687374749498998

Classification Report:

              precision    recall  f1-score   support

    negative       0.85      0.51      0.63       200
     neutral       0.36      0.10      0.16       156
    positive       0.88      0.99      0.93      1640

    accuracy                           0.87      1996
   macro avg       0.70      0.53      0.57      1996
weighted avg       0.84      0.87      0.84      1996



In [17]:
def predict_sentiment(review):
    review = clean_text(review)
    review_tfidf = vectorizer.transform([review])
    return model.predict(review_tfidf)[0]

print(predict_sentiment("The product quality is amazing and delivery was fast"))
print(predict_sentiment("Very bad experience, product stopped working"))


positive
negative
