### Problem Statement
You are given a small dataset of product reviews labeled as positive (1) or negative (0).
Your task is to build a text classification pipeline using TF-IDF vectorization and Logistic Regression.

In [2]:
import pandas as pd

df = pd.read_csv("reviews_dataset.csv")
df.head()

Unnamed: 0,review,label
0,"This product is amazing, I love it!",1
1,Worst purchase I have ever made.,0
2,"Absolutely fantastic quality, highly recommend.",1
3,"Terrible, broke after one use.",0
4,Good value for money.,1


In [3]:
# check for unique label
df['label'].nunique()


2

In [4]:
print(df.isnull().sum())

review    0
label     0
dtype: int64


In [5]:
df['label'].value_counts()

label
1    5
0    5
Name: count, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

# lets split the data into training and test set
X, y = df['review'], df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, stratify = y)

# encode the reviews
from sklearn.feature_extraction.text import TfidfVectorizer

tfid = TfidfVectorizer(stop_words='english')

X_train_tfidf = tfid.fit_transform(X_train)
X_test_tfidf = tfid.transform(X_test)

print("X_train", X_train)
print("X Train Tfidf :", X_train_tfidf)

X_train 2    Absolutely fantastic quality, highly recommend.
9                     Very disappointing experience.
3                     Terrible, broke after one use.
6             Excellent performance and easy to use.
4                              Good value for money.
0                This product is amazing, I love it!
5                       I hate this, waste of money.
Name: review, dtype: object
X Train Tfidf : <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (7, 21)>
  Coords	Values
  (0, 0)	0.4472135954999579
  (0, 7)	0.4472135954999579
  (0, 15)	0.4472135954999579
  (0, 10)	0.4472135954999579
  (0, 16)	0.4472135954999579
  (1, 3)	0.7071067811865476
  (1, 6)	0.7071067811865476
  (2, 17)	0.6098192948782316
  (2, 2)	0.6098192948782316
  (2, 18)	0.5062023856012858
  (3, 18)	0.43218152024617124
  (3, 5)	0.5206467559864713
  (3, 13)	0.5206467559864713
  (3, 4)	0.5206467559864713
  (4, 8)	0.6098192948782316
  (4, 19)	0.6098192948782316
  (4

In [9]:
# logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


# Evalutae
y_predicted = model.predict(X_test_tfidf)
print("accuracy: ",accuracy_score(y_test, y_predicted))
print("Classification report: ", classification_report(y_true=y_test, y_pred=y_predicted))

accuracy:  0.3333333333333333
Classification report:                precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
