In [5]:
!pip install --upgrade transformers accelerate datasets
!pip install s3fs

Collecting s3fs
  Downloading s3fs-2024.10.0-py3-none-any.whl.metadata (1.7 kB)
Collecting fsspec==2024.10.0.* (from s3fs)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Downloading s3fs-2024.10.0-py3-none-any.whl (29 kB)
Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec, s3fs
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.6.0
    Uninstalling fsspec-2023.6.0:
      Successfully uninstalled fsspec-2023.6.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-scheduler 2.9.0 requires fsspec==2023.6.0, but you have fsspec 2024.10.0 which is incompatible.
datasets 3.1.0 requires fsspec[http]<=2024.9.0,>=2023.1.0, but you have fsspec 2024.10.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2024.10.0 s3fs-2024.10.0


In [6]:
import os
import io
import boto3
import torch
import sagemaker
import numpy as np
import pandas as pd
import datetime as dt
from io import BytesIO
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [2]:
s3 = boto3.client('s3')

bucket_name = "sagemaker-studio-619071335465-8h7owh9eftx"
main_text_dir = 'training/text reviews/'


response = s3.list_objects_v2(Bucket=bucket_name, Prefix=main_text_dir)

csv_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]

if len(csv_files) == 1:
    csv_file_key = csv_files[0]
    print(f"Found CSV file: {csv_file_key}")
else:
    raise ValueError(f"Expected exactly one CSV file, but found {len(csv_files)}")

Found CSV file: training/text reviews/FL_Reviews_Edited.csv


In [7]:
chunk_size = 10000

s3_uri = f"s3://{bucket_name}/{csv_file_key}"

chunk_list = []

for chunk in pd.read_csv(s3_uri, chunksize=chunk_size):
    chunk_list.append(chunk)

df_reviews = pd.concat(chunk_list, ignore_index=True)
df_reviews.shape

(792133, 24)

In [8]:
df_sample = df_reviews[df_reviews['stars_reviews'].isin([1,5])]
df_sample.shape

(474385, 24)

In [9]:
df_sample.columns

Index(['business_id', 'name', 'address', 'city_original', 'state',
       'postal_code', 'latitude', 'longitude', 'stars_business',
       'review_count', 'is_open', 'attributes', 'categories', 'hours',
       'review_id', 'user_id', 'stars_reviews', 'useful', 'funny', 'cool',
       'text', 'date', 'zip_code', 'city_updated'],
      dtype='object')

In [10]:
df_sample['binary_labels'] = df_sample['stars_reviews'].map({1: 0, 5: 1})
df_sample[['text', 'binary_labels']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['binary_labels'] = df_sample['stars_reviews'].map({1: 0, 5: 1})


Unnamed: 0,text,binary_labels
0,I eat pho about 4 times a week and from a spec...,1
1,I've been in Wesley Chapel area for about 2 ye...,1
4,How can an order of pho take an 1 hour and 15 ...,0
5,I ordered the beef and meatball pho & two egg ...,1
7,"I have traveled to Vietnam several times, love...",1
...,...,...
792127,Colony Grill occupies one of the larger eatery...,1
792129,This place is great! The space is big with a n...,1
792130,So first looking at the menu for this place yo...,1
792131,I lived in CT for 10 years and CT style pizza ...,1


In [11]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sample['text'].tolist(), df_sample['binary_labels'].tolist(), test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})

In [12]:
df_sample['stars_reviews'].value_counts()

stars_reviews
5    375217
1     99168
Name: count, dtype: int64

In [13]:
distilbert_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(distilbert_model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [14]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/379508 [00:00<?, ? examples/s]

Map:   0%|          | 0/94877 [00:00<?, ? examples/s]

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {
        'eval_accuracy': accuracy
    }

best_params = {'learning_rate': 1.0080248949472861e-05, 'batch_size': 64, 'num_epochs': 5}

best_learning_rate = best_params['learning_rate']
best_batch_size = best_params['batch_size']
best_num_epochs = best_params['num_epochs']

for k, v in best_params.items():
    print(f"{k}: {v}")

learning_rate: 1.0080248949472861e-05
batch_size: 64
num_epochs: 5


In [16]:
model = AutoModelForSequenceClassification.from_pretrained(distilbert_model_name, num_labels=2)

s3_output_dir = "s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/"


training_args = TrainingArguments(
    output_dir=s3_output_dir,
    evaluation_strategy="epoch",
    learning_rate=best_learning_rate,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_num_epochs,
    weight_decay=0.01,
    logging_dir=f"{s3_output_dir}/logs",
    logging_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

eval_results = trainer.evaluate()

final_accuracy = eval_results["eval_accuracy"]
print(f"Final evaluation accuracy: {final_accuracy}")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.016469,0.994846
2,0.020300,0.01765,0.995373
3,0.020300,0.020138,0.995257
4,0.006900,0.023811,0.995162
5,0.006900,0.026761,0.995141


Final evaluation accuracy: 0.995141077394943
