## Model

In [None]:
# logistic
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X, y = df['content_cleaned'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (at most 5000 terms): fitted on the training split only,
# then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the classifier, then persist the model and the vectorizer to disk.
clf = LogisticRegression().fit(X_train_tfidf, y_train)
dump(clf, 'logistic_regression_model.joblib')
dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

# Predict on the held-out split and summarise the results.
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Last expression of the cell: the notebook displays this tuple.
accuracy, classification_rep

In [None]:
# transformer
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
# Load the pretrained BERT classifier (2 output labels) and its tokenizer,
# and place the model on the GPU when one is available.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Data preprocessing: tokenize the first 100,000 rows, truncated to at most
# 256 tokens and padded so the batch can be returned as PyTorch tensors.
sub_df = df.head(100000)
inputs = tokenizer.batch_encode_plus(
    list(sub_df['content_cleaned']),
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors='pt',
)
input_ids = inputs["input_ids"].to(device)
attention_masks = inputs["attention_mask"].to(device)
# Make the label dtype explicit (int64 class indices) instead of relying on
# whatever dtype pandas inferred for the 'sentiment' column.
labels = torch.tensor(sub_df['sentiment'].values, dtype=torch.long).to(device)

# Construct the PyTorch dataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80/20 train/validation split. The generator is seeded so the validation
# set, and therefore the reported metrics, are reproducible across runs.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
# fine tune
from tqdm import tqdm
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# NOTE: loss_fn is not used by the loop below, which takes the loss from the
# model output (labels are passed to the forward call). The name is kept
# defined in case other cells reference it.
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(3):  # 3 rounds
    model.train()
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch}"):
        # Use batch-local names: unpacking into input_ids / attention_masks /
        # labels would overwrite the full-dataset tensors of the same name.
        batch_input_ids, batch_attention_mask, batch_labels = batch
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()

# evaluation
from sklearn.metrics import precision_recall_fscore_support

model.eval()
all_preds, all_labels = [], []
with torch.no_grad():  # inference only: no gradients needed
    for batch in val_loader:
        # Batch-local names so the full-dataset tensors are not overwritten.
        batch_input_ids, batch_attention_mask, batch_labels = batch
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # Collect one CPU tensor per batch. Extending the list with a tensor
        # would split it into thousands of 0-dim tensors that then have to be
        # converted back one element at a time.
        all_preds.append(preds.cpu())
        all_labels.append(batch_labels.cpu())

# Join the per-batch results into single 1-D tensors.
all_preds_tensor = torch.cat(all_preds)
all_labels_tensor = torch.cat(all_labels)

accuracy = (all_preds_tensor == all_labels_tensor).float().mean()
print(f"Validation Accuracy: {accuracy:.4f}")

# Convert to NumPy arrays for scikit-learn.
all_preds_array = all_preds_tensor.numpy()
all_labels_array = all_labels_tensor.numpy()

# Weighted-average precision, recall and F1 over the validation set.
precision, recall, f1, _ = precision_recall_fscore_support(all_labels_array, all_preds_array, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


## ChatGPT API

In [None]:
import os

import openai

# Read the key from the OPENAI_API_KEY environment variable instead of
# hard-coding it, so no secret ends up in the notebook. Falls back to the
# previous empty-string value when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def get_chatgpt_response(comment, product_url, model="gpt-4-1106-preview", max_tokens=60):
    """Request an upbeat, competitor-style reply to a customer comment.

    Args:
        comment: Customer text; interpolated into the instruction prompt.
        product_url: Link appended to the reply on its own trailing line.
        model: Chat model name passed to the API.
        max_tokens: Passed through as the API's ``max_tokens`` setting.

    Returns:
        The stripped reply followed by a "For more information" line that
        contains ``product_url``. If the API call raises
        ``openai.error.OpenAIError`` the error is printed and an empty
        string is returned.
    """
    # The pieces below concatenate to exactly one instruction string.
    prompt = (
        f"A customer has a concern about the product: '{comment}'. "
        "Imagine that you are a competitor of that product and sell the benefits of your own product based on the claim. "
        "Don't include sorry and thank you's. "
        "Answer in a positive sunny way! "
        "don't apologize, and present the benefits of the product"
        "(For example, it's cost-effective, very good quality and looks great. "
        "Respond to reviews based on their claims). "
        "End the response with a link to the product website for additional support."
    )
    # The whole instruction is sent as a single system message.
    messages = [{"role": "system", "content": prompt}]

    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
        )
        first_choice = completion['choices'][0]
        reply = first_choice['message']['content'].strip()
        return f"{reply}\nFor more information, please visit our website: {product_url}"
    except openai.error.OpenAIError as err:
        # Best effort: report the failure and hand back an empty reply.
        print(f"An error occurred: {err}")
        return ""

# Generate one reply for each of the first 1000 rows whose sentiment label is 0.
product_url_example = "http://www.example.com"
n_df = df[df['sentiment'] == 0]
sub_df = n_df.iloc[0:1000]
responses = [
    get_chatgpt_response(comment, product_url_example)
    for comment in sub_df['content']
]
    

In [None]:
# Visualization
import plotly.graph_objects as go

# Header row: two centred column titles.
table_header = {
    'values': ['Comment', 'Response'],
    'line_color': 'darkslategray',
    'fill_color': 'lightskyblue',
    'align': 'center',
}

# Body: the original comments in the first column, the generated replies in
# the second.
table_cells = {
    'values': [sub_df['content'], responses],
    'line_color': 'darkslategray',
    'fill_color': 'lightcyan',
    'align': 'left',
    'font_size': 12,
    'height': 30,
}

comparison_table = go.Table(header=table_header, cells=table_cells)
fig = go.Figure(data=[comparison_table])

fig.update_layout(width=700, height=400, title_text="Interactive Comments and Responses Table")
fig.show()

In [None]:
# Evaluation
from joblib import load

# Restore the persisted logistic regression model and TF-IDF vectorizer.
# NOTE: joblib.load unpickles the file contents, so only load artifacts from
# a trusted source.
MODEL_PATH = 'logistic_regression_model.joblib'
VECTORIZER_PATH = 'tfidf_vectorizer.joblib'
clf = load(MODEL_PATH)
tfidf_vectorizer = load(VECTORIZER_PATH)

# Function to calculate sentiment score
def get_sentiment_score(text):
    """Return the class label the loaded classifier predicts for ``text``.

    The text is vectorized with the loaded TF-IDF vectorizer as a one-item
    batch, and the single prediction is unwrapped from the result.
    """
    predictions = clf.predict(tfidf_vectorizer.transform([text]))
    return predictions[0]


# Calculate sentiment scores.
# get_chatgpt_response() returns "" when the API call fails. Those
# placeholders are not real replies, so they are skipped rather than scored;
# otherwise every failed request would skew the average.
sentiment_scores = [
    get_sentiment_score(response)
    for response in responses
    if response
]


# Each score is a predicted class label, so this is the mean predicted label
# over the scored replies (0 when there is nothing to score).
average_sentiment_score = sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0


# Print the average score
print("Average Sentiment Score:", average_sentiment_score)