In [1]:
!pip install --upgrade transformers accelerate datasets



In [2]:
import os
import io
import boto3
import torch
import sagemaker
import numpy as np
import pandas as pd
import datetime as dt
from io import BytesIO
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


2024-11-24 22:40:36.779003: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-24 22:40:36.799236: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-24 22:40:36.805589: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 22:40:36.820558: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
s3 = boto3.client('s3')

bucket_name = "sagemaker-studio-619071335465-8h7owh9eftx"
main_text_dir = 'training/text reviews/'


response = s3.list_objects_v2(Bucket=bucket_name, Prefix=main_text_dir)

csv_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')]

if len(csv_files) == 1:
    csv_file_key = csv_files[0]
    print(f"Found CSV file: {csv_file_key}")
else:
    raise ValueError(f"Expected exactly one CSV file, but found {len(csv_files)}")

Found CSV file: training/text reviews/FL_Reviews_Edited.csv


In [4]:
chunk_size = 10000

s3_uri = f"s3://{bucket_name}/{csv_file_key}"

chunk_list = []

for chunk in pd.read_csv(s3_uri, chunksize=chunk_size):
    chunk_list.append(chunk)

df_reviews = pd.concat(chunk_list, ignore_index=True)
df_reviews.shape


(792133, 24)

In [5]:
df_sample = df_reviews[df_reviews['stars_reviews'].isin([1,5])]
# df_sample = df_sample.sample(frac=0.0025, random_state=42)
df_sample.shape

(474385, 24)

In [6]:
df_sample['binary_labels'] = df_sample['stars_reviews'].map({1: 0, 5: 1})
df_sample.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['binary_labels'] = df_sample['stars_reviews'].map({1: 0, 5: 1})


Index(['business_id', 'name', 'address', 'city_original', 'state',
       'postal_code', 'latitude', 'longitude', 'stars_business',
       'review_count', 'is_open', 'attributes', 'categories', 'hours',
       'review_id', 'user_id', 'stars_reviews', 'useful', 'funny', 'cool',
       'text', 'date', 'zip_code', 'city_updated', 'binary_labels'],
      dtype='object')

In [7]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sample['text'].tolist(), df_sample['binary_labels'].tolist(), test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})

In [8]:
df_sample['stars_reviews'].value_counts()

stars_reviews
5    375217
1     99168
Name: count, dtype: int64

In [9]:
distilbert_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(distilbert_model_name)

In [10]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/379508 [00:00<?, ? examples/s]

Map:   0%|          | 0/94877 [00:00<?, ? examples/s]

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {
        'eval_accuracy': accuracy
    }

best_params = {'learning_rate': 1.0080248949472861e-05, 'batch_size': 64, 'num_epochs': 5}

best_learning_rate = best_params['learning_rate']
best_batch_size = best_params['batch_size']
best_num_epochs = best_params['num_epochs']

for k, v in best_params.items():
    print(f"{k}: {v}")

learning_rate: 1.0080248949472861e-05
batch_size: 64
num_epochs: 5


In [12]:
s3_client = boto3.client('s3')
bucket_name = "sagemaker-studio-619071335465-8h7owh9eftx"
s3_file_key = 'training/outputs/cumulative_reviews.csv'

model = AutoModelForSequenceClassification.from_pretrained(distilbert_model_name, num_labels=2)

def append_to_s3(df, bucket_name, file_key):
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        existing_df = pd.read_csv(io.BytesIO(obj["Body"].read()))
        combined_df = pd.concat([existing_df, df], ignore_index=True)
    except s3_client.exceptions.NoSuchKey:
        combined_df = df

    csv_buffer = io.StringIO()
    combined_df.to_csv(csv_buffer, index=False)
    s3_client.put_object(Bucket=bucket_name, Key=file_key, Body=csv_buffer.getvalue())
    print(f"Updated cumulative file to s3://{bucket_name}/{file_key}")


def get_last_saved_row(bucket_name, file_key):
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
        saved_df = pd.read_csv(io.BytesIO(obj["Body"].read()))
        return len(saved_df)
    except s3_client.exceptions.NoSuchKey:
        return 0


start_row = get_last_saved_row(bucket_name, s3_file_key)
print(f"Resuming processing from row {start_row}.")


chunk_size = 1000
intermittent_data = []
processed_dataset = train_dataset[start_row:]

for i, (review, label) in enumerate(zip(processed_dataset["text"], processed_dataset["label"]), start=start_row+1):
    inputs = tokenizer(review, return_tensors='pt', truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=-1).item()

    intermittent_data.append({
        "review": review,
        "true_label": label,
        "predicted_label": predicted_label
    })

    if i % chunk_size == 0:
        print(f"Processed {i} rows so far.")
        intermittent_df = pd.DataFrame(intermittent_data)
        append_to_s3(intermittent_df, bucket_name, s3_file_key)
        intermittent_data = []


if intermittent_data:
    print(f"Saving final {len(intermittent_data)} rows.")
    intermittent_df = pd.DataFrame(intermittent_data)
    append_to_s3(intermittent_df, bucket_name, s3_file_key)
    

s3_output_dir = "s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/"

training_args = TrainingArguments(
    output_dir=s3_output_dir,
    evaluation_strategy="epoch",
    learning_rate=best_learning_rate,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_num_epochs,
    weight_decay=0.01,
    logging_dir=f"{s3_output_dir}/logs",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

eval_results = trainer.evaluate()

final_accuracy = eval_results["eval_accuracy"]
print(f"Final evaluation accuracy: {final_accuracy}")

Resuming processing from row 0.
Processed 1000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/cumulative_reviews.csv
Processed 2000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/cumulative_reviews.csv
Processed 3000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/cumulative_reviews.csv
Processed 4000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/cumulative_reviews.csv
Processed 5000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/cumulative_reviews.csv
Processed 6000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outputs/cumulative_reviews.csv
Processed 7000 rows so far.
Updated cumulative file to s3://sagemaker-studio-619071335465-8h7owh9eftx/training/outpu



Epoch,Training Loss,Validation Loss


SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })