In [2]:
import os

os.listdir()


['.ipynb_checkpoints',
 'Sentiment analysis.ipynb',
 'submission.csv',
 'test_Sentiment Analysis.csv',
 'train_Sentiment Analysis.csv']

In [3]:
import pandas as pd

test = pd.read_csv("test_Sentiment Analysis.csv")   # change to correct name
train = pd.read_csv("train_Sentiment Analysis.csv")     # change to correct name

train.head()


Unnamed: 0,ID,Review_Title,Review,Rating
0,0,good product,fine at this price\r\n,1
1,1,Worth the money,Best quality materials,1
2,2,Perfect product!,Good product,1
3,3,Mind-blowing purchase,Excellent,1
4,4,Mind-blowing purchase,Nice,1


In [4]:
test.head()

Unnamed: 0,ID,Review_Title,Review
0,0,Super!,Good cooling low noise. Better Daily change th...
1,1,Brilliant,Awesome
2,2,Good quality product,Good looking and quality
3,4,Perfect\n,Sound Quality awesome\n
4,5,Classy product,Very nice


In [5]:
# Combine title + review into one text column
train["text"] = train["Review_Title"].astype(str) + " " + train["Review"].astype(str)


In [6]:
test["text"] = test["Review_Title"].astype(str) + " " + test["Review"].astype(str)

In [7]:
# Check the new column
train[["Review_Title", "Review", "text"]].head()

Unnamed: 0,Review_Title,Review,text
0,good product,fine at this price\r\n,good product fine at this price\r\n
1,Worth the money,Best quality materials,Worth the money Best quality materials
2,Perfect product!,Good product,Perfect product! Good product
3,Mind-blowing purchase,Excellent,Mind-blowing purchase Excellent
4,Mind-blowing purchase,Nice,Mind-blowing purchase Nice


We will apply very simple cleaning:

lowercase

remove extra spaces

(We will not remove stopwords yet — simple models often do better without heavy cleaning.)

In [8]:
import re

def clean_text(t):
    t = t.lower()                     # lowercase
    t = re.sub(r'\s+', ' ', t)        # remove extra spaces
    return t.strip()

#train["clean_text"] = train["text"].apply(clean_text)
#test["clean_text"] = test["text"].apply(clean_text)

# See sample
#train["clean_text"].head(10)


In [9]:
test["clean_text"] = test["text"].apply(clean_text)

In [10]:
train["clean_text"] = train["text"].apply(clean_text)

In [11]:
train["clean_text"].head()

0           good product fine at this price
1    worth the money best quality materials
2             perfect product! good product
3           mind-blowing purchase excellent
4                mind-blowing purchase nice
Name: clean_text, dtype: object

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=50000,      # limit vocabulary size
    ngram_range=(1, 2)       # unigrams + bigrams
)


In [13]:
# Fit on train, transform both train & test
X_train = tfidf.fit_transform(train["clean_text"])
X_test = tfidf.transform(test["clean_text"])


In [14]:

train.head()


Unnamed: 0,ID,Review_Title,Review,Rating,text,clean_text
0,0,good product,fine at this price\r\n,1,good product fine at this price\r\n,good product fine at this price
1,1,Worth the money,Best quality materials,1,Worth the money Best quality materials,worth the money best quality materials
2,2,Perfect product!,Good product,1,Perfect product! Good product,perfect product! good product
3,3,Mind-blowing purchase,Excellent,1,Mind-blowing purchase Excellent,mind-blowing purchase excellent
4,4,Mind-blowing purchase,Nice,1,Mind-blowing purchase Nice,mind-blowing purchase nice


In [15]:
train.columns

Index(['ID', 'Review_Title', 'Review', 'Rating', 'text', 'clean_text'], dtype='object')

In a sentiment analysis training dataset, there must be a target like:

Rating

Sentiment

Label

Polarity

target

class

In [16]:
X = train["clean_text"]
y = train["Rating"]

X[:5], y[:5]


(0           good product fine at this price
 1    worth the money best quality materials
 2             perfect product! good product
 3           mind-blowing purchase excellent
 4                mind-blowing purchase nice
 Name: clean_text, dtype: object,
 0    1
 1    1
 2    1
 3    1
 4    1
 Name: Rating, dtype: int64)

In [17]:
train["Rating"].value_counts()


Rating
1    39074
0     5724
Name: count, dtype: int64

Logistic Regression

(Works extremely well for TF–IDF text classification)

And because your data is imbalanced
(39,000 positives vs 5,700 negatives),

In [18]:
class_weight = 'balanced'


In [19]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced', max_iter=200)
model.fit(X_train, y)


In [20]:
from sklearn.metrics import accuracy_score

train_preds = model.predict(X_train)
accuracy = accuracy_score(y, train_preds)
accuracy


0.9877003437653467

Predict on test data

In [21]:
test_predictions = model.predict(X_test)
test_predictions[:10]


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [22]:
test["Predicted_Rating"] = test_predictions


In [23]:
test[["ID", "Predicted_Rating"]].to_csv("submission.csv", index=False)


Checking accuracy for test data

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split into train and validation sets (20% validation)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Train the model again
model = LogisticRegression(class_weight='balanced', max_iter=200)
model.fit(X_train_split, y_train_split)

# Predict on validation set
val_preds = model.predict(X_val_split)

# Print accuracy
print("Validation Accuracy:", accuracy_score(y_val_split, val_preds))

# Detailed metrics
print("\nClassification Report:")
print(classification_report(y_val_split, val_preds))


Validation Accuracy: 0.9800223214285714

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      1145
           1       1.00      0.98      0.99      7815

    accuracy                           0.98      8960
   macro avg       0.94      0.98      0.96      8960
weighted avg       0.98      0.98      0.98      8960



In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [26]:
# Model
nb_model = MultinomialNB()


In [27]:
# Train
nb_model.fit(X_train, y)


In [30]:
# Accuracy
nb_accuracy = accuracy_score(y, train_preds)
print("Naive Bayes Accuracy:", nb_accuracy)

# Classification Report
print(classification_report(y, train_preds))

Naive Bayes Accuracy: 0.9877003437653467
              precision    recall  f1-score   support

           0       0.92      0.99      0.95      5724
           1       1.00      0.99      0.99     39074

    accuracy                           0.99     44798
   macro avg       0.96      0.99      0.97     44798
weighted avg       0.99      0.99      0.99     44798

