In [None]:
!pip install transformers==4.24.0
!pip install simpletransformers==0.63.11
!pip install scikit-learn
!pip install tensorflow
!pip install dataset

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from simpletransformers.classification import ClassificationModel, MultiLabelClassificationModel
import tensorflow as tf

In [None]:
import sys, os, re, json
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from collections import defaultdict, Counter
from typing import List, Tuple, Dict
import torch
import transformers
from transformers import BertTokenizer

In [None]:
# Show (as the cell output) whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')
# Dataset file paths. dir_dataset is empty, so both paths are bare file names
# that resolve against whatever the current working directory is at read time.
dir_dataset = ""
file_business = os.path.join(dir_dataset, "yelp_academic_dataset_business.json")
file_review = os.path.join(dir_dataset, "yelp_academic_dataset_review.json")

In [None]:
# Switch the working directory to the Yelp_Data folder on the mounted Drive,
# so that relative file names used afterwards resolve there.
%cd /content/drive/MyDrive/Yelp_Data

In [None]:
def json_df(file_name, chunksize=1000, max_chunks=None):
  """Read a JSON-lines file into one DataFrame, chunk by chunk.

  Args:
    file_name: Path of the JSON-lines file to load (one JSON object per line).
    chunksize: Number of lines parsed per chunk.
    max_chunks: If given, stop after this many chunks. Handy for trying the
      notebook on a small slice of the data; None (default) reads everything.

  Returns:
    A DataFrame with all chunks concatenated in file order, or an empty
    DataFrame when no chunk was read.
  """
  from itertools import islice

  recorder = []

  # Use the reader as a context manager so the underlying file handle is
  # released even if parsing fails part-way through.
  with pd.read_json(file_name, lines=True, chunksize=chunksize) as reader:
    for counter, chunk in enumerate(islice(reader, max_chunks), start=1):
      recorder.append(chunk)
      # Progress marker every 1000 chunks.
      if counter % 1000 == 0:
        print(counter)

  # pd.concat rejects an empty list, so fall back to an empty frame.
  df_loaded = pd.concat(recorder) if recorder else pd.DataFrame()
  print("df_review made")
  return df_loaded

In [None]:
from sklearn.utils import resample

def undersample(df, group_size=200000):
  """Draw an equally sized random subset from every "stars" class.

  For each distinct value of df["stars"] (visited in value_counts() order),
  group_size rows are drawn without replacement with a fixed random_state.
  The per-class samples are then concatenated and shuffled, again with a
  fixed random_state, and the shuffled frame is returned.
  """
  star_values = df["stars"].value_counts().index
  per_class = [
      resample(df[df["stars"] == star],
               replace=False,
               n_samples=group_size,
               random_state=0)
      for star in star_values
  ]
  balanced = pd.concat(per_class)
  return balanced.sample(frac=1, random_state=0)

In [None]:
# Load the review file into df_all; the %time magic reports how long the call took.
%time df_all = json_df(file_review)

In [None]:
# Balance the classes: undersample a copy of the full frame to 250000 rows
# per "stars" value, leaving df_all itself untouched.
df_review = undersample(df_all.copy(), 250000)

In [None]:
# Class id used for training = stars - 1.
# (The confusion-matrix cell labels the classes 1..5, so presumably stars
# runs 1-5 and labels 0-4 — confirm against the data.)
df_review['labels'] = df_review['stars'] - 1

In [None]:
# Disabled alternative: binary labels (1 when stars > 3, else 0).
#df_review['labels'] = df_review['stars'].apply(lambda x: 1 if x > 3 else 0)
# Keep only the review text and its class id.
df_review = df_review[['text', 'labels']]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 500000 rows as the test set, then take another 500000 rows from the
# remainder as a dev set; whatever is left stays in df_train. Both test_size
# values are absolute row counts, not fractions.
# NOTE(review): df_dev does not appear to be used in the cells visible here — confirm.
df_train, df_test = train_test_split(df_review, test_size=500000,random_state=42, shuffle=True)
df_train, df_dev = train_test_split(df_train, test_size=500000, random_state=42, shuffle=True)

In [None]:
def tokenize(df):
    """Tokenize the 'text' entries of a batch.

    Each text is truncated and padded to a fixed length of 128 tokens.
    Relies on the notebook-level `tokenizer` existing by the time this
    function is first called.
    """
    texts = df['text']
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Runtime switches, set before the model and Trainer are created.
os.environ["WANDB_DISABLED"] = "true"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# NOTE(review): an empty CUDA_VISIBLE_DEVICES normally hides every GPU, which
# would push training onto the CPU — confirm this is intended and not a
# debugging leftover.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

unique_labels = df_train['labels'].unique()

# Load the tokenizer and the model.
# The labels are zero-based class ids (built as stars - 1), so the classifier
# needs exactly max(label) + 1 outputs. The previous len(unique_labels) + 1
# added an extra output that no training example can target, and a prediction
# of that extra class would not fit the 5x5 confusion matrix built later.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=int(df_train['labels'].max()) + 1,
)


# Load your dataframe into a Hugging Face dataset
from datasets import Dataset

df_new = df_train[['text', 'labels']] # training
#df_new = df_new[:1000]

dataset = Dataset.from_pandas(df_new)

# Tokenize the dataset (all rows in one batch) and expose only the model
# inputs and labels as torch tensors.
dataset = dataset.map(tokenize, batched=True, batch_size=len(dataset))
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split the dataset into a training and validation set. The split is seeded
# so that a re-run trains and validates on the same rows.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='epoch',  # evaluate at the end of every epoch
    save_steps = 10000,
    eval_steps = 10000,  # only consulted when evaluation_strategy='steps'
    logging_dir='./logs',
)


# Create the Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test']
)

trainer.train()


In [None]:
from datasets import Dataset

# Small evaluation sample: the slice df_review[1000:1010] as a Hugging Face Dataset.
test_eval = Dataset.from_pandas(df_review[1000:1010])

# Tokenize every row in a single batch, then expose only the model inputs
# and labels as torch tensors.
test_eval = test_eval.map(tokenize, batch_size=len(test_eval), batched=True)
test_eval.set_format(columns=['input_ids', 'attention_mask', 'labels'], type='torch')

In [None]:
# Build the tokenized test set from the held-out split.
df_new = df_test[['text', 'labels']]
df_test_dataset = Dataset.from_pandas(df_new)
# By this point `dataset` is the train/test DatasetDict produced by
# train_test_split, so len(dataset) is the number of splits (2), not a row
# count, and the old batch_size=len(dataset) tokenized two rows per call.
# tokenize() pads every row to a fixed length, so the batch size does not
# change the encoded output; a moderate fixed size keeps memory bounded.
df_test_dataset = df_test_dataset.map(tokenize, batched=True, batch_size=1000)

# Expose only the model inputs and labels, as torch tensors.
df_test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Run the trained model over the tokenized test set. The result's
# .predictions, .label_ids and .metrics attributes are read in the next cell.
prediction_output = trainer.predict(df_test_dataset)

In [None]:
# Raw model outputs (one row of per-class scores per test example).
pred = prediction_output.predictions

# label_ids holds the ground-truth labels of the evaluated dataset, not the
# model's answers; using it as y_pred makes every metric compare the truth
# with itself. The predicted class is the arg-max over each row of scores.
y_pred = np.argmax(pred, axis=-1)
pred_metrics = prediction_output.metrics

In [None]:
# Ground-truth labels of the test split, plus the two tags used by the report
# cell for its heading and for the saved figure's file name.
y = df_test['labels']
label = 'testing'
model_name = 'bert sentiment-only'

In [None]:
# Text summary: raw predictions, split name, accuracy, then the per-class report.
print(y_pred, label + ' Set', sep='\n')
print("Accuracy:", accuracy_score(y, y_pred), end='\n\n')
print(classification_report(y, y_pred, digits=4))

# Confusion matrix normalised over the true labels (rows), with rows and
# columns labelled 1..5. For a two-class run, label them range(1, 3) instead.
df_cm = pd.DataFrame(
    data=confusion_matrix(y, y_pred, normalize='true'),
    index=range(1, 6),
    columns=range(1, 6),
)

# Heat-map of the matrix, written to "<model_name>_<label>.eps" and displayed.
plt.figure(figsize=(6, 4))
ax = sn.heatmap(df_cm, annot=True, cmap=plt.cm.Blues, square=True)
ax.set(xlabel='Predicted label', ylabel='True label')
plt.savefig("_".join([model_name, label.lower()]) + ".eps")
plt.show()
print()