In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold,train_test_split,cross_val_score,cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score,make_scorer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal

import time
from tabulate import tabulate

In [None]:
file_path='/content/drive/MyDrive/cleaned_office_and sentiment.csv'
df=pd.read_csv(file_path)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,unixReviewTime,price,title,main_category,sub_category,date,year,Helpful,not_Helpful,review,cleaned_text,polarity,sentiment
0,A3BBNK2R5TUYGV,0113000316,b,"[0, 0]",5.0,1359417600,23.242188,123GetInk -14-pack 5-black 3-cyan 3-magenta 3-...,Office Products,Office & School Supplies,2013-01-29,2013,0,0,even though they were refurbished the colors w...,even though refurbished color vivid cartridge ...,0.291667,Neutral
1,A5J78T14FJ5DU,0113000316,N. Sommers,"[0, 0]",3.0,1318723200,23.242188,123GetInk -14-pack 5-black 3-cyan 3-magenta 3-...,Office Products,Office & School Supplies,2011-10-16,2011,0,0,"A good deal and I can make them work ok, BUT I...",good deal make work ok live dry climate tend d...,0.106881,Neutral
2,A2P462UH5L6T57,043928631X,D. Morrissey,"[0, 0]",5.0,1356912000,23.242188,Harry Potter Lenticular Hologram Bookmark - Ha...,Office Products,Office & School Supplies,2012-12-31,2012,0,0,Nice added stocking stuffer. Beautiful additi...,nice added stocking stuffer beautiful addition...,0.766667,Positive
3,AI7SARYVM8FGA,0439394058,A. Manternach,"[5, 5]",4.0,1212624000,11.64,Scholastic SC939405 All-In-One Schoolhouse Cal...,Office Products,Office & School Supplies,2008-06-05,2008,5,5,"Nice size, colorful simple pictures and a nice...",nice size colorful simple picture nice font wi...,0.5,Positive
4,A1BUVOGGFTGMBN,0439394058,ANON,"[5, 5]",2.0,1389744000,11.64,Scholastic SC939405 All-In-One Schoolhouse Cal...,Office Products,Office & School Supplies,2014-01-15,2014,5,5,This is not coated in plastic as the descripti...,coated plastic description indicates thin card...,-0.2125,Negative


In [None]:
df.dropna(inplace=True)

In [None]:
df.isna().sum()

reviewerID        0
asin              0
reviewerName      0
helpful           0
overall           0
unixReviewTime    0
price             0
title             0
main_category     0
sub_category      0
date              0
year              0
Helpful           0
not_Helpful       0
review            0
cleaned_text      0
polarity          0
sentiment         0
dtype: int64

In [None]:
df.sentiment.value_counts()

Neutral     558722
Positive    530901
Negative    141422
Name: sentiment, dtype: int64

In [None]:
# as this data is biased we can do a resampling
from sklearn.utils import resample
# Separate classes
neutral = df[df['sentiment'] == 'Neutral']
positive = df[df['sentiment'] == 'Positive']
negative = df[df['sentiment'] == 'Negative']



# Upsample the minority class to balance the dataset
upsampled_negative = resample(negative, replace=True, n_samples=len(neutral), random_state=42)

# Combine the upsampled class with the majority classes
balanced_df = pd.concat([neutral, positive, upsampled_negative])

# Shuffle the balanced dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the class distribution
print(balanced_df['sentiment'].value_counts())

Negative    558722
Neutral     558722
Positive    530901
Name: sentiment, dtype: int64


In [None]:
# determining the x and y for the model building
x=balanced_df['cleaned_text']
y=balanced_df['sentiment']

trainx,testx,trainy,testy=train_test_split(x,y,test_size=0.3,random_state=43)

In [None]:
len(trainx),len(testx),len(trainy),len(testy)

(1153841, 494504, 1153841, 494504)

In [None]:
# Create the pipeline.
model1 = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(),
    ),
    (
        "clf",
        MultinomialNB(),
    ),
])

# Fit the pipeline to the training data.
model1.fit(trainx, trainy)

# Make predictions.
start_time = time.time()
predictions = model1.predict(testx)
predict_time = time.time() - start_time

# Calculate the confusion matrix and classification report.
cm1 = confusion_matrix(testy, predictions)
cr1 = classification_report(testy, predictions)

# Calculate accuracy and F1-score.
accuracy = accuracy_score(testy, predictions)
f1 = f1_score(testy, predictions, average='weighted')



In [None]:
# Print the confusion matrix and classification report.
print(cm1)
print(cr1,'\n')


print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Time taken for predicting: {predict_time:.4f} seconds")

[[128375  36563   2558]
 [ 33770 112219  21522]
 [  6117  49554 103826]]
              precision    recall  f1-score   support

    Negative       0.76      0.77      0.76    167496
     Neutral       0.57      0.67      0.61    167511
    Positive       0.81      0.65      0.72    159497

    accuracy                           0.70    494504
   macro avg       0.71      0.70      0.70    494504
weighted avg       0.71      0.70      0.70    494504
 

Accuracy: 0.6965
F1-Score: 0.6999
Time taken for predicting: 24.7480 seconds


## model 2

In [None]:
# Create the pipeline.
model2 = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(),
    ),
    (
        "clf",
        LogisticRegression(multi_class='ovr'),
    ),
])

# Fit the pipeline to the training data.
model2.fit(trainx, trainy)

# Make predictions.
start_time = time.time()
predictions2 = model2.predict(testx)
predict_time = time.time() - start_time

# Calculate the confusion matrix and classification report.
cm2 = confusion_matrix(testy, predictions2)
cr2 = classification_report(testy, predictions2)

# Calculate accuracy and F1-score.
accuracy2 = accuracy_score(testy, predictions2)
f12 = f1_score(testy, predictions2, average='weighted')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Print the confusion matrix and classification report.
print(cm2)
print(cr2,'\n')


print(f"Accuracy: {accuracy2:.4f}")
print(f"F1-Score: {f12:.4f}")
print(f"Time taken for predicting: {predict_time:.4f} seconds")

[[163012   4354    130]
 [ 15921 138114  13476]
 [    79   7438 151980]]
              precision    recall  f1-score   support

    Negative       0.91      0.97      0.94    167496
     Neutral       0.92      0.82      0.87    167511
    Positive       0.92      0.95      0.94    159497

    accuracy                           0.92    494504
   macro avg       0.92      0.92      0.92    494504
weighted avg       0.92      0.92      0.92    494504
 

Accuracy: 0.9163
F1-Score: 0.9151
Time taken for predicting: 24.4305 seconds


### Model 3

In [None]:

# Create the pipeline.
model3 = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(),
    ),
    (
        "clf",
        LinearSVC(),
    ),
])

# Fit the pipeline to the training data.
model3.fit(trainx, trainy)

# Make predictions.
start_time = time.time()
predictions3 = model3.predict(testx)
predict_time = time.time() - start_time

# confusion matrix and classification report.
cm3 = confusion_matrix(testy, predictions3)
cr3 = classification_report(testy, predictions3)

#accuracy and F1-score.
accuracy3 = accuracy_score(testy, predictions3)
f13 = f1_score(testy, predictions3, average='weighted')


In [None]:
# Print the confusion matrix and classification report.
print(cm3)
print(cr3,'\n')


print(f"Accuracy: {accuracy3:.4f}")
print(f"F1-Score: {f13:.4f}")
print(f"Time taken for predicting: {predict_time:.4f} seconds")

[[165056   2370     70]
 [ 10501 145211  11799]
 [    50   6294 153153]]
              precision    recall  f1-score   support

    Negative       0.94      0.99      0.96    167496
     Neutral       0.94      0.87      0.90    167511
    Positive       0.93      0.96      0.94    159497

    accuracy                           0.94    494504
   macro avg       0.94      0.94      0.94    494504
weighted avg       0.94      0.94      0.94    494504
 

Accuracy: 0.9371
F1-Score: 0.9364
Time taken for predicting: 26.1170 seconds


## Hyperparameter tuning on model3

In [None]:
# Create the pipeline.
model4 = Pipeline([
    (
        "tfidf",
        TfidfVectorizer(),
    ),
    (
        "clf",
        LinearSVC(),
    ),
])

# Define hyperparameters and their distributions for random search
param_dist = {
    "clf__C": reciprocal(0.01, 100),  # Reciprocal distribution for C
    "clf__penalty": ['l1', 'l2'],     # Regularization penalty
}


scorer = make_scorer(f1_score, average='weighted')

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(model4, param_distributions=param_dist, n_iter=10, scoring=scorer, cv=2, verbose=2, n_jobs=-1)
random_search.fit(trainx, trainy)

# Get the best model and its hyperparameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Make predictions with the best model
predictions4 = best_model.predict(testx)

# confusion matrix and classification report.
cm4 = confusion_matrix(testy, predictions4)
cr4 = classification_report(testy, predictions4)
# Calculate F1-score of the best model on the test data


In [None]:
print(random_search.best_estimator_)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', LinearSVC(C=4.098458048409581))])


In [None]:
#accuracy and F1-score.
accuracy4 = accuracy_score(testy, predictions4)
best_f1_score = f1_score(testy, predictions4, average='weighted')
print(cm4)
print(cr4  ,'\n')
print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy4:.4f}")
print(f"Best F1-Score on Test Data: {best_f1_score:.4f}")

[[165373   2078     45]
 [  8913 147021  11577]
 [    45   6690 152762]]
              precision    recall  f1-score   support

    Negative       0.95      0.99      0.97    167496
     Neutral       0.94      0.88      0.91    167511
    Positive       0.93      0.96      0.94    159497

    accuracy                           0.94    494504
   macro avg       0.94      0.94      0.94    494504
weighted avg       0.94      0.94      0.94    494504
 

Best Hyperparameters: {'clf__C': 4.098458048409581, 'clf__penalty': 'l2'}
Accuracy: 0.9407
Best F1-Score on Test Data: 0.9401


In [None]:
# Print the confusion matrix and classification report.
print(cm4)
print(cr4,'\n')

print("Best Hyperparameters:", best_params)
print(f"Best F1-Score on Test Data: {best_f1_score:.4f}")

In [None]:
# Prepare the results for tabulation
results = [
    ["Metric", "MultinomialNB-TFIDF", "Logi-TFIDF", "LinearSVC-TFIDF","LinearSVC-TFIDF Hyper-tuned"],
    ["Confusion Matrix", cm1, cm2, cm3,cm4],
    ["Accuracy", accuracy, accuracy2, accuracy3,accuracy4],
    ["F1-Score", f1, f12, f13,best_f1_score]
    ]

In [None]:
print(tabulate(results, headers="firstrow", tablefmt="grid"))

+------------------+--------------------------+--------------------------+--------------------------+-------------------------------+
| Metric           | MultinomialNB-TFIDF      | Logi-TFIDF               | LinearSVC-TFIDF          | LinearSVC-TFIDF Hyper-tuned   |
| Confusion Matrix | [[128375  36563   2558]  | [[163012   4354    130]  | [[165056   2370     70]  | [[165373   2078     45]       |
|                  |  [ 33770 112219  21522]  |  [ 15921 138114  13476]  |  [ 10501 145211  11799]  |  [  8913 147021  11577]       |
|                  |  [  6117  49554 103826]] |  [    79   7438 151980]] |  [    50   6294 153153]] |  [    45   6690 152762]]      |
+------------------+--------------------------+--------------------------+--------------------------+-------------------------------+
| Accuracy         | 0.6964958827431122       | 0.9162837914354586       | 0.9371410544707424       | 0.9406516428582984            |
+------------------+--------------------------+---------------

  or (len(row) >= 2 and row[1] == SEPARATING_LINE)


In [3]:
import joblib
save_path = "/content/drive/MyDrive/best_model_amz.pkl"

In [None]:


joblib.dump(best_model, save_path)

['/content/drive/MyDrive/best_model_amz.pkl']

In [None]:
testx[:1]

110834    mouse pad oversized match size advertised disa...
Name: cleaned_text, dtype: object

In [4]:
loaded_model = joblib.load(save_path)

In [5]:
# Define the new review sentences
new_review_sentences = [
    "This mouse pad is oversized and matches the size as advertised. Very satisfied with the purchase.",
    "The quality of the product doesn't match the description. Disappointed with the purchase.",
    "The product is functional. It neither exceeded nor fell short of expectations."

]

# Make predictions using the loaded model
predictions = loaded_model.predict(new_review_sentences)

predictions

array(['Positive', 'Negative', 'Neutral'], dtype=object)

In [None]:
predictions_array = np.array(predictions)

# Map numerical predictions back to sentiment labels
sentiments = ['Negative', 'Neutral', 'Positive']
predicted_sentiments = np.vectorize(lambda x: sentiments.index(x))(predictions_array)

# Print the predicted sentiments for the new review sentences
for sentence, sentiment in zip(new_review_sentences, predicted_sentiments):
    print(f"Review: {sentence}")
    print(f"Predicted Sentiment: {sentiments[sentiment]}")
    print()

Review: This mouse pad is oversized and matches the size as advertised. Very satisfied with the purchase.
Predicted Sentiment: Positive

Review: The quality of the product doesn't match the description. Disappointed with the purchase.
Predicted Sentiment: Negative

Review: The product is functional. It neither exceeded nor fell short of expectations.
Predicted Sentiment: Neutral



In [7]:
import sys
print(sys.version)

3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
