In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from transformers import BertTokenizer
import torch

# Load the dataset
df = pd.read_csv('https://www.cs.fsu.edu/~liux/courses/deepRL/assignments/amazon_reviews.csv')

df = df.dropna()

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['overall'], test_size=0.2, random_state=42)

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the input texts
train_input_ids = []
train_attention_masks = []

for text in X_train:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    train_input_ids.append(encoded_dict['input_ids'])
    train_attention_masks.append(encoded_dict['attention_mask'])

test_input_ids = []
test_attention_masks = []
for text in X_test:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])

# Combine the input_ids and attention_masks into a single feature vector
# Combine the input_ids and attention_masks into a single feature vector
X_train_vectorized = torch.cat((torch.stack(train_input_ids), torch.stack(train_attention_masks)), dim=1).view(len(X_train), -1)
X_test_vectorized = torch.cat((torch.stack(test_input_ids), torch.stack(test_attention_masks)), dim=1).view(len(X_test), -1)

  from .autonotebook import tqdm as notebook_tqdm
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Initialize the Logistic Regression
clf = LogisticRegression(random_state=42,C=0.1)
clf.fit(X_train_vectorized, y_train)

test_accuracy = clf.score(X_test_vectorized, y_test)
print(f"Test accuracy: {test_accuracy:.2f}")

Test accuracy: 0.76


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# Define a function to preprocess the input text
def preprocess_text(text):
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids = encoded_dict['input_ids'].view(1, -1)
    attention_masks = encoded_dict['attention_mask'].view(1, -1)
    return torch.cat((input_ids, attention_masks), dim=1)

# Load the trained Logistic Regression model
clf = LogisticRegression(class_weight='balanced', random_state=42)
clf.fit(X_train_vectorized, y_train)

# Example new review
new_review = X_test.iloc[100]

# Preprocess the new review
new_review_vectorized = preprocess_text(new_review)

# Make the prediction
predicted_label = clf.predict(new_review_vectorized)[0]
print(new_review)
print(f"Predicted sentiment: {predicted_label}")
print(f"Actual Value: {y_test.iloc[95]}")

I want to upgrade the memory to my Samsung Note II. It is great to be able to instantly double my storage capacity. I am waiting for the 128 GB to down in price.
Predicted sentiment: 4.0
Actual Value: 5.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
