In [1]:
import pandas as pd
import numpy as np

import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Toy corpus: each entry pairs a review text with its sentiment label.
labelled_reviews = [
    ('I love this product', 'positive'),
    ('This is the worst purchase I have made', 'negative'),
    ('Amazing quality and great service', 'positive'),
    ('Very bad experience', 'negative'),
    ('I am extremely happy with this', 'positive'),
    ('Totally disappointed', 'negative'),
    ('Excellent product, worth the money', 'positive'),
    ('Not good at all', 'negative'),
    ('I would recommend this to everyone', 'positive'),
    ('Waste of money', 'negative'),
]

# Split the pairs back into the two columns the DataFrame is built from.
data = {
    'review': [review_text for review_text, _ in labelled_reviews],
    'sentiment': [review_label for _, review_label in labelled_reviews],
}

df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,I love this product,positive
1,This is the worst purchase I have made,negative
2,Amazing quality and great service,positive
3,Very bad experience,negative
4,I am extremely happy with this,positive
5,Totally disappointed,negative
6,"Excellent product, worth the money",positive
7,Not good at all,negative
8,I would recommend this to everyone,positive
9,Waste of money,negative


In [3]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10 non-null     object
 1   sentiment  10 non-null     object
dtypes: object(2)
memory usage: 292.0+ bytes


In [4]:
def clean_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase the text, delete every run of digits, delete
    all ASCII punctuation (``string.punctuation``), then collapse each run of
    whitespace to a single space and trim both ends.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned string; empty if nothing but digits, punctuation and
        whitespace was present.
    """
    text = text.lower()                       # convert to lowercase
    text = re.sub(r'\d+', '', text)           # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    # Deleting digits/punctuation can leave doubled interior spaces
    # ("paid 100 dollars" -> "paid  dollars"); split/join collapses every
    # whitespace run and also trims the ends.
    text = ' '.join(text.split())
    return text

In [5]:
# Store a normalized copy of each review alongside the raw text.
df['cleaned_review'] = df['review'].map(clean_text)
df

Unnamed: 0,review,sentiment,cleaned_review
0,I love this product,positive,i love this product
1,This is the worst purchase I have made,negative,this is the worst purchase i have made
2,Amazing quality and great service,positive,amazing quality and great service
3,Very bad experience,negative,very bad experience
4,I am extremely happy with this,positive,i am extremely happy with this
5,Totally disappointed,negative,totally disappointed
6,"Excellent product, worth the money",positive,excellent product worth the money
7,Not good at all,negative,not good at all
8,I would recommend this to everyone,positive,i would recommend this to everyone
9,Waste of money,negative,waste of money


In [6]:
# Encode the string labels as integers for the classifier.
label_to_id = {'positive': 1, 'negative': 0}

# Guard against re-running this cell: once the column already holds 0/1,
# mapping it a second time would silently turn every label into NaN.
if not df['sentiment'].isin(list(label_to_id.values())).all():
    encoded = df['sentiment'].map(label_to_id)
    # Series.map leaves NaN for any value missing from the dict; fail loudly
    # instead of feeding NaN labels to the model.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), 'sentiment'].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels: {unknown}")
    df['sentiment'] = encoded
df

Unnamed: 0,review,sentiment,cleaned_review
0,I love this product,1,i love this product
1,This is the worst purchase I have made,0,this is the worst purchase i have made
2,Amazing quality and great service,1,amazing quality and great service
3,Very bad experience,0,very bad experience
4,I am extremely happy with this,1,i am extremely happy with this
5,Totally disappointed,0,totally disappointed
6,"Excellent product, worth the money",1,excellent product worth the money
7,Not good at all,0,not good at all
8,I would recommend this to everyone,1,i would recommend this to everyone
9,Waste of money,0,waste of money


In [7]:
# Features are the cleaned review texts; targets are the sentiment labels.
X, y = df['cleaned_review'], df['sentiment']

In [8]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# scikit-learn's built-in English stop list includes negations such as "not",
# "no" and "never". Removing them makes "not good" vectorize the same as
# "good", which is wrong for sentiment, so negations are kept as features.
# The apostrophe-free forms ("cant", ...) match what clean_text produces.
negation_words = {
    'no', 'nor', 'not', 'never', 'neither',
    'cannot', 'cant', 'couldnt', 'hasnt',
}
sentiment_stop_words = sorted(
    set(TfidfVectorizer(stop_words='english').get_stop_words()) - negation_words
)
tfidf = TfidfVectorizer(stop_words=sentiment_stop_words)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them unchanged for the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [10]:
# Logistic regression with scikit-learn's default settings, trained on the
# TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [11]:
# Predict labels (0 = negative, 1 = positive) for the held-out reviews.
y_pred = model.predict(X_test_tfidf)

In [12]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5


In [13]:
# Rows are the true labels and columns the predicted labels, ordered 0 then 1.
confusion_matrix(y_test, y_pred)

array([[1, 0],
       [1, 0]], dtype=int64)

In [15]:
# zero_division=0 reports 0.0 for a class that was never predicted.
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [17]:
sample_review = ["The product quality is amazing"]

# Push the sample through the same clean -> vectorize -> predict path used
# for the training reviews.
sample_clean = [clean_text(sample_review[0])]
sample_tfidf = tfidf.transform(sample_clean)
prediction = model.predict(sample_tfidf)

# Label 1 is positive; anything else is reported as negative.
sentiment_label = "Positive" if prediction[0] == 1 else "Negative"
print(f"Sentiment: {sentiment_label} ")

Sentiment: Positive 


In [21]:
sample_review = ["Not good Product"]

# Clean the text, vectorize it with the fitted TF-IDF vocabulary, then classify.
sample_clean = [clean_text(sample_review[0])]
sample_tfidf = tfidf.transform(sample_clean)
prediction = model.predict(sample_tfidf)

# Label 1 is positive; anything else is reported as negative.
sentiment_name = "Positive" if prediction[0] == 1 else "Negative"
print(f"Sentiment: {sentiment_name} ")


Sentiment: Negative 
