In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the review dataset from a CSV file located relative to the working directory.
textdata = pd.read_csv('reviews.csv')

In [3]:
# Preview the first rows of the loaded frame.
textdata.head()

Unnamed: 0,Review,Sentiment
0,This product exceeded my expectations! It's hi...,Positive
1,"The product was decent. It worked fine, but it...",Neutral
2,I had a terrible experience with this company....,Negative
3,It's an okay product. Nothing to write home ab...,Neutral
4,Disappointed with the product. It didn't meet ...,Negative


In [4]:
# (rows, columns) of the loaded frame.
textdata.shape

(386, 2)

In [5]:
# Select the text and label columns by name rather than by position, so the
# selection stays correct even if the CSV gains or loses a column (for
# example a saved index column).
data = textdata['Review']
target = textdata['Sentiment']
# Class balance check: number of rows per sentiment label.
target.value_counts()

Sentiment
Positive    129
Negative    129
Neutral     128
Name: count, dtype: int64

In [6]:
# Hold out part of the data for final evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies; random_state pins the
# shuffle so the split is reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(data, target, random_state = 42)

In [7]:
# Bag-of-words features: the vocabulary is learned from the training texts
# only (terms must occur in at least 5 training documents), then the same
# fitted vectorizer encodes both splits.
vect = CountVectorizer(min_df=5)
vect.fit(X_train_text)
X_train, X_test = (
    vect.transform(texts) for texts in (X_train_text, X_test_text)
)

In [8]:
# Show the sparse training matrix summary (dtype, stored elements, shape) as a string.
repr(X_train)

"<Compressed Sparse Row sparse matrix of dtype 'int64'\n\twith 2799 stored elements and shape (289, 92)>"

In [9]:
# Feature names (vocabulary terms) exposed by the fitted vectorizer.
feature_names = vect.get_feature_names_out()

In [10]:
# Peek at the first ten feature names.
feature_names[:10]

array(['about', 'absolutely', 'advertised', 'again', 'all', 'an', 'and',
       'anything', 'are', 'as'], dtype=object)

In [11]:
# Baseline: 5-fold cross-validated accuracy of a default LogisticRegression
# on the bag-of-words training matrix.
# NOTE(review): vect was fitted on all of X_train_text before this call, so
# each validation fold's documents already contributed to the vocabulary;
# wrapping the vectorizer and classifier in a Pipeline would avoid that.
# The effect is presumably small here - confirm if the scores matter.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=5,
)
print("Mean cross-validation accuracy : {:.3f}".format(scores.mean()))
print(scores)

Mean cross-validation accuracy : 0.976
[0.98275862 0.98275862 0.94827586 1.         0.96491228]


In [12]:
# Tune the regularisation strength C of LogisticRegression with a 5-fold
# grid search over the training matrix.
param_grid = {'C': [0.0001, 0.01, 0.1, 1, 10]}
# fit() returns the search object itself, so construction and fitting can be chained.
grid = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score : {:.3f}".format(grid.best_score_))
print("Best parameters : ", grid.best_params_)

Best cross-validation score : 0.976
Best parameters :  {'C': 1}


In [13]:
# Fit the final model on the full training split using the hyperparameters
# selected by the grid search, rather than hard-coding the value that the
# search happened to print; this keeps the cell in sync if the grid or the
# data changes.
lr = LogisticRegression(**grid.best_params_)
lr.fit(X_train, y_train)

In [14]:
# Accuracy on the data the model was fitted on versus the held-out split.
print("Training Accuracy : ", lr.score(X_train, y_train))
print("Test Accuracy : ", lr.score(X_test, y_test))

Traing Accuracy :  0.9965397923875432
Test Accuracy :  0.979381443298969


In [15]:
# Sanity check on an out-of-domain input: a prize-claim promotional message
# rather than a product review. The classifier can only choose among the
# sentiment labels it was trained on, so the printed label should not be
# read as a meaningful sentiment.
review = ["WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.			"]
review_text = pd.Series(review)
# Keep the raw text in `review`; the encoded matrix gets its own name so the
# variable does not change type part-way through the cell.
review_features = vect.transform(review_text)
result = lr.predict(review_features)
print(result)

['Positive']


In [16]:
def try_review(review : str):
    """Print the sentiment label predicted for a piece of review text.

    The text is wrapped in a pandas Series, encoded with the notebook-level
    fitted vectorizer ``vect`` and classified with the notebook-level fitted
    model ``lr``. The predicted label(s) are printed separated by spaces;
    nothing is returned.
    """
    features = vect.transform(pd.Series(review))
    print(*lr.predict(features))

In [17]:
# Try the helper on a short mixed-sentiment phrase.
try_review('satisfactory but not that good')

Neutral
