In [1]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Read the plain text dataset
with open("customer_reviews.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Split the dataset into training (90%) and testing (10%)
train_lines, test_lines = train_test_split(lines, test_size=0.1, random_state=42)

# Initialize empty lists to store labels and reviews
X_train_labels = []
y_train_labels = []
train_reviews = []
X_test_labels = []
y_test_labels = []
test_reviews = []

# Process the lines and split into labels and reviews
for line in train_lines:
    flag = 1
    if '__label__2' in line:
        flag = 1
        review = line.strip().split("__label__2 ")[1]
    else:
        flag = 0
        review = line.strip().split("__label__1 ")[1]
    X_train_labels.append(flag)
    train_reviews.append(review)

for line in test_lines:
    flag = 1
    if '__label__2' in line:
        flag = 1
        review = line.strip().split("__label__2 ")[1]
    else:
        flag = 0
        review = line.strip().split("__label__1 ")[1]
    X_test_labels.append(flag)
    test_reviews.append(review)

# Part 1: Basic Text Matching with Keywords
positive_keywords = ["great", "love", "amazing", "excellent"]
negative_keywords = ["bad", "terrible", "awful", "horrible"]

def classify_with_keywords(text):
    positive_count = sum(text.count(keyword) for keyword in positive_keywords)
    negative_count = sum(text.count(keyword) for keyword in negative_keywords)
    
    if positive_count > negative_count:
        return 1
    elif negative_count > positive_count:
        return 0
    else:
        return 0

y_train_labels = [classify_with_keywords(review) for review in train_reviews]

y_test_labels = [classify_with_keywords(review) for review in test_reviews]

train_accuracy = accuracy_score(X_train_labels, y_train_labels)
print(f"Test Accuracy using Basic Text Matching with Keywords: {train_accuracy:.2f}")

test_accuracy = accuracy_score(X_test_labels, y_test_labels)
print(f"Test Accuracy using Basic Text Matching with Keywords: {test_accuracy:.2f}")




Test Accuracy using Basic Text Matching with Keywords: 0.64
Test Accuracy using Basic Text Matching with Keywords: 0.65


In [2]:
# Part 2: Bag of Words (BoW) Technique
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

if not vectorizer.vocabulary_:
    print("Empty vocabulary after preprocessing. Check your dataset and preprocessing steps.")
else:
    classifier = MultinomialNB()
    classifier.fit(X_train, X_train_labels)

    predicted_labels_bow = classifier.predict(X_test)

    accuracy_bow = accuracy_score(X_test_labels, predicted_labels_bow)
    print(f"Accuracy using Bag of Words (BoW) Technique: {accuracy_bow:.2f}")

Accuracy using Bag of Words (BoW) Technique: 0.85


In [None]:
from sklearn.svm import SVC  # Import the Support Vector Machine classifier

# Part 2: Bag of Words (BoW) Technique with SVM
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

if not vectorizer.vocabulary_:
    print("Empty vocabulary after preprocessing. Check your dataset and preprocessing steps.")
else:
    classifier = SVC(kernel='rbf')  # Use a linear kernel for SVM
    classifier.fit(X_train, X_train_labels)

    predicted_labels_bow = classifier.predict(X_test)

    accuracy_bow = accuracy_score(X_test_labels, predicted_labels_bow)
    print(f"Accuracy using Bag of Words (BoW) Technique with SVM: {accuracy_bow:.2f}")


In [2]:
from sklearn.ensemble import RandomForestClassifier

# Part 2: Bag of Words (BoW) Technique with Random Forest
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

if not vectorizer.vocabulary_:
    print("Empty vocabulary after preprocessing. Check your dataset and preprocessing steps.")
else:
    classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)  # Initialize Random Forest classifier
    classifier.fit(X_train, X_train_labels)  # Train the classifier

    predicted_labels_bow = classifier.predict(X_test)  # Make predictions

    accuracy_bow = accuracy_score(X_test_labels, predicted_labels_bow)  # Calculate accuracy
    print(f"Accuracy using Bag of Words (BoW) Technique with Random Forest: {accuracy_bow:.2f}")


Accuracy using Bag of Words (BoW) Technique with Random Forest: 0.79


In [4]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression

# Part 2: Bag of Words (BoW) Technique with Logistic Regression
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

if not vectorizer.vocabulary_:
    print("Empty vocabulary after preprocessing. Check your dataset and preprocessing steps.")
else:
    classifier = LogisticRegression(random_state=42, max_iter=1000)  # Initialize Logistic Regression classifier
    classifier.fit(X_train, X_train_labels)  # Train the classifier

    predicted_labels_bow = classifier.predict(X_test)  # Make predictions

    accuracy_bow = accuracy_score(X_test_labels, predicted_labels_bow)  # Calculate accuracy
    print(f"Accuracy using Bag of Words (BoW) Technique with Logistic Regression: {accuracy_bow:.2f}")


Accuracy using Bag of Words (BoW) Technique with Logistic Regression: 0.90
