In [67]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    return text

# Read in the text files
positive_reviews = open("positive-reviews.txt").readlines()
negative_reviews = open("negative-reviews.txt").readlines()

# Preprocess the text
positive_reviews = [preprocess_text(review) for review in positive_reviews]
negative_reviews = [preprocess_text(review) for review in negative_reviews]

# Assign labels to the reviews
positive_reviews = [(review, 1) for review in positive_reviews]
negative_reviews = [(review, 0) for review in negative_reviews]

# Combine the two lists into a single list of reviews
reviews = positive_reviews + negative_reviews

# Split the data into a training set and a test set
reviews_df = pd.DataFrame(reviews, columns=["review", "label"])
train_df, test_df = train_test_split(reviews_df, test_size=0.2, random_state=42)

# Vectorize the text using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["review"])
print(X_train.shape[1])
X_test = vectorizer.transform(test_df["review"])
y_train = train_df["label"]
y_test = test_df["label"]

# Train a logistic regression classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
from sklearn.metrics import accuracy_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

8831
Accuracy: 93.71%
Accuracy: 0.9371048615652933
Confusion Matrix:
 [[4380  227]
 [ 350 4217]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.95      0.94      4607
           1       0.95      0.92      0.94      4567

    accuracy                           0.94      9174
   macro avg       0.94      0.94      0.94      9174
weighted avg       0.94      0.94      0.94      9174



In [69]:
import pickle

with open('classifier2.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [70]:
clf_loaded = pickle.load(open("classifier2.pkl", "rb"))

In [81]:
sentence = "the product has no abnormal. It works great"

# Assume that `clf_loaded` is the loaded classifier
# and `sentence` is the new sentence whose sentiment you want to predict

# Preprocess the sentence
sentence = preprocess_text(sentence)

# Vectorize the sentence
X_new = vectorizer.transform([sentence])

# Make a prediction
y_pred = clf_loaded.predict(X_new)

# Print the prediction
if y_pred[0] == 1:
    print("The sentiment of the sentence is positive.")
else:
    print("The sentiment of the sentence is negative.")



The sentiment of the sentence is positive.
