In [1]:
import os
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from pathlib import Path

# Fetch the NLTK stopword corpus so the lookup below can find it
nltk.download('stopwords')

# Build the English stopword set once; clean_text() reads this module-level name
stop_words = {*stopwords.words('english')}



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# -------------------------------
# 1. Data Loading and Preprocessing
# -------------------------------

# Location of the raw reviews CSV (adjust when the file lives elsewhere)
data_path = Path("Restaurant_Reviews.csv")

# Read the CSV; a missing file is re-raised with a message that names the path
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please check the path.")

# Columns to discard, applied only when they are actually present in the file
unwanted_columns = ["7514"]  # Ensure this is a real column

# One pipeline, evaluated top to bottom:
#   1. drop the unwanted columns that exist
#   2. drop exact duplicate rows
#   3. drop rows with a missing Review or Rating
#   4. map the literal "Like" rating to "5", then coerce Rating to numeric
#      (anything unparseable becomes NaN)
#   5. drop rows whose Rating could not be parsed
#   6. derive the sentiment label: Rating <= 3.0 is 'Negative', otherwise 'Positive'
df = (
    df.drop(columns=[name for name in unwanted_columns if name in df.columns])
    .drop_duplicates()
    .dropna(subset=["Review", "Rating"])
    .assign(
        Rating=lambda frame: pd.to_numeric(
            frame['Rating'].replace({"Like": "5"}).astype(str),
            errors='coerce',
        )
    )
    .dropna(subset=["Rating"])
    .assign(
        label=lambda frame: np.where(frame['Rating'] <= 3.0, 'Negative', 'Positive')
    )
)

# Define function to clean text
def clean_text(text, stopword_set=None):
    """Normalize review text into a space-separated string of lowercase tokens.

    Every character other than an ASCII letter or an apostrophe is replaced by
    a space, the result is lowercased and split on whitespace, apostrophes at
    the edges of each token are stripped, and tokens that are empty or found
    in the stopword set are dropped.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    if stopword_set is None:
        stopword_set = stop_words

    tokens = re.sub(r"[^a-zA-Z']", ' ', str(text)).lower().split()
    # Strip quoting apostrophes ("'great'" -> "great") so quoted words are still
    # matched against the stopword set; inner apostrophes ("don't") are kept.
    stripped = (token.strip("'") for token in tokens)
    # `if token` drops tokens that consisted only of apostrophes.
    return ' '.join(
        token for token in stripped if token and token not in stopword_set
    )

# Build the cleaned-text column from the raw reviews
df['clean_text'] = df['Review'].astype(str).map(clean_text)

# Preview the first rows of the processed frame
print(df.head())

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
train_data, test_data = train_test_split(df, random_state=123, test_size=0.2)

        Restaurant              Reviewer  \
0  Beyond Flavours     Rusha Chakraborty   
1  Beyond Flavours  Anusha Tirumalaneedi   
2  Beyond Flavours       Ashok Shekhawat   
3  Beyond Flavours        Swapnil Sarkar   
4  Beyond Flavours                Dileep   

                                              Review  Rating  \
0  The ambience was good, food was quite good . h...     5.0   
1  Ambience is too good for a pleasant evening. S...     5.0   
2  A must try.. great food great ambience. Thnx f...     5.0   
3  Soumen das and Arun was a great guy. Only beca...     5.0   
4  Food is good.we ordered Kodi drumsticks and ba...     5.0   

                  Metadata             Time  Pictures     label  \
0   1 Review , 2 Followers  5/25/2019 15:54         0  Positive   
1  3 Reviews , 2 Followers  5/25/2019 14:20         0  Positive   
2  2 Reviews , 3 Followers  5/24/2019 22:54         0  Positive   
3    1 Review , 1 Follower  5/24/2019 22:11         0  Positive   
4  3 Reviews , 

In [3]:
# -------------------------------
# 2. Convert Data to AWS BlazingText Format
# -------------------------------

def create_bt_file(data, filename):
    """Write labelled samples to ``filename``, one ``__label__`` line per row.

    Each row of ``data`` becomes a line of the form
    ``__label__<lowercased label> <text>``.

    Args:
        data: DataFrame providing ``label`` and ``clean_text`` columns.
        filename: Path of the text file to create (overwritten if it exists).
    """
    # Explicit encoding and newline keep the output identical across platforms
    # instead of depending on the locale default.
    with open(filename, 'w', encoding='utf-8', newline='\n') as f:
        # Iterating the two columns directly avoids the per-row Series that
        # iterrows() builds.
        for label, text in zip(data['label'], data['clean_text']):
            # Collapse all whitespace (including embedded newlines) so each
            # sample occupies exactly one line and keeps its label prefix.
            single_line = ' '.join(str(text).split())
            f.write(f"__label__{str(label).lower()} {single_line}\n")

# Output file names for the two splits
train_filename = 'train.txt'
test_filename = 'test.txt'

# Write each split to its file via create_bt_file
for split_frame, split_path in ((train_data, train_filename), (test_data, test_filename)):
    create_bt_file(split_frame, split_path)

In [7]:
# -------------------------------
# 3. AWS BlazingText Training with SageMaker
# -------------------------------

# Set up SageMaker session and role
session = sagemaker.Session()

try:
    role = sagemaker.get_execution_role(sagemaker_session=session)
except ValueError:
    # get_execution_role raises ValueError when the current identity is not a role.
    # Fall back to the caller's ARN using the session that already exists.
    # NOTE(review): that ARN may not be usable as a training execution role --
    # confirm, or supply an explicit role ARN here.
    role = session.get_caller_identity_arn()

bucket = session.default_bucket()  # Use default S3 bucket

# Upload data to S3
prefix = 'blazingtext-sentiment'
train_input_path = session.upload_data(train_filename, bucket=bucket, key_prefix=f"{prefix}/train")
test_input_path = session.upload_data(test_filename, bucket=bucket, key_prefix=f"{prefix}/test")

# Get BlazingText container image.
# The region comes from the SageMaker session so the image, bucket and training
# job all agree; image_uris.retrieve is the SDK v2 replacement for get_image_uri.
region = session.boto_region_name
bt_container = sagemaker.image_uris.retrieve('blazingtext', region, version='1')

# Set up BlazingText estimator
bt = sagemaker.estimator.Estimator(
    image_uri=bt_container,
    role=role,
    instance_count=1,
    instance_type='ml.c5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=session
)

# Configure hyperparameters
bt.set_hyperparameters(
    mode='supervised',
    epochs=10,
    learning_rate=0.05,
    vector_dim=100,
    min_count=2,
    word_ngrams=2
)

# Define data channels
train_channel = sagemaker.inputs.TrainingInput(train_input_path, content_type='text/plain')
test_channel = sagemaker.inputs.TrainingInput(test_input_path, content_type='text/plain')

# Train model.
# The held-out split is passed under the channel name 'validation', which is
# the name the BlazingText documentation gives for evaluation data.
bt.fit({'train': train_channel, 'validation': test_channel})

2025-02-21 18:06:13 Starting - Starting the training job...
..25-02-21 18:06:28 Starting - Preparing the instances for training.
..25-02-21 18:06:58 Downloading - Downloading input data.
2025-02-21 18:07:29 Training - Training image download completed. Training in progress.[34mArguments: train[0m
  self.stdout = io.open(c2pread, 'rb', bufsize)[0m
[34m[02/21/2025 18:07:37 INFO 140269303150400] nvidia-smi took: 0.025302648544311523 secs to identify 0 gpus[0m
[34m[02/21/2025 18:07:37 INFO 140269303150400] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[02/21/2025 18:07:37 INFO 140269303150400] Processing /opt/ml/input/data/train/train.txt . File size: 1.5446062088012695 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  6997[0m
[34m##### Alpha: 0.0337  Progress: 32.68%  Million Words/sec: 7.69 #####[0m
[34m##### Alpha: 0.0179  Progress: 64.27%  Million Words/sec: 7.57 #####[0m
[34m##### A

In [10]:
# -------------------------------
# 4. Model Deployment and Prediction
# -------------------------------

# Deploy trained model.
# JSON (de)serializers make the predictor send `application/json` requests and
# return the parsed JSON response.
# NOTE(review): confirm 'ml.t3.medium' is an accepted *hosting* instance type in
# this account/region; if deployment is rejected, try e.g. 'ml.m5.large'.
# NOTE(review): the endpoint keeps running (and billing) until it is removed with
# predictor.delete_endpoint() -- nothing visible in this cell does that.
predictor = bt.deploy(
    initial_instance_count=1,
    instance_type='ml.t3.medium',
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

def predict_sentiment(text):
    """Predict sentiment for one review using the deployed SageMaker endpoint.

    The text is passed through ``clean_text`` first so the model sees input
    prepared the same way as the lines written to the training file.

    Args:
        text: Raw review text.

    Returns:
        The endpoint's JSON response, deserialized into Python objects.
    """
    # BlazingText hosting expects a JSON body with an "instances" list of sentences.
    payload = {"instances": [clean_text(text)]}
    return predictor.predict(payload)

# Test sample reviews
print(predict_sentiment("The food was great and the service was excellent."))
print(predict_sentiment("I did not enjoy the meal; it was disappointing."))