In [1]:
import pandas as pd

df = pd.read_csv('C:/Users/night/Downloads/Processed_Reviews.csv')


In [11]:
# Make a copy of just the columns you want to work with
df_clean = df[['lemmatized']].copy()

# Auto-labeling function
positive_keywords = ['good', 'great', 'excellent', 'amazing', 'love', 'perfect', 'nice', 'awesome', 'worth']
negative_keywords = ['bad', 'poor', 'terrible', 'worst', 'hate', 'broken', 'slow', 'disappoint', 'waste']

def auto_label(text):
    text = text.lower()
    if any(word in text for word in positive_keywords):
        return 1
    elif any(word in text for word in negative_keywords):
        return 0
    else:
        return None  # cannot decide

# Apply label
df_clean['label'] = df_clean['lemmatized'].apply(auto_label)

# Drop unlabelled rows (where label is None)
df_clean = df_clean.dropna(subset=['label'])

# Convert label to integer
df_clean['label'] = df_clean['label'].astype(int)

# Optional: check results
print(df_clean['label'].value_counts())
df_clean.head()


label
1    11
Name: count, dtype: int64


Unnamed: 0,lemmatized,label
0,product arrive time packaging great quality am...,1
1,product amaze love,1
2,buy phone hz display totally worth,1
3,wow product awesome bit expensive,1
4,laptop work perfectly fine,1


In [13]:
# Select only the 'lemmatized' and 'label' columns
df = df[['lemmatized', 'label']]

# Convert labels to numeric if not already
df['label'] = df['label'].astype(int)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['lemmatized'])
y = df['label']


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)


In [25]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming model is your trained model and X_test, y_test are your test data
y_pred = model.predict(X_test)

# Specify the labels (adjust according to your dataset's labels)
labels = [0, 1]  # Replace with the actual labels you're working with

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=labels))


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Confusion Matrix:
 [[0 0]
 [0 3]]


In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()

# Check the unique labels in y_test and y_pred
print("Unique labels in y_test:", set(y_test))
print("Unique labels in y_pred:", set(y_pred))

# Ensure that y_test and y_pred contain both classes
if len(set(y_test)) > 1 and len(set(y_pred)) > 1:
    # Generate confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(cm, classes=['Negative', 'Positive'])
else:
    print("Warning: There is only one unique label in y_test or y_pred.")


Unique labels in y_test: {1}
Unique labels in y_pred: {1}
