In [2]:
!pip install sagemaker

import sagemaker

import boto3
import pandas as pd
# Shared SageMaker handles used by the later cells.
sagemaker_session = sagemaker.Session()
# IAM role passed to the estimator and the model below.
# NOTE(review): get_execution_role() presumably needs to run inside a SageMaker-managed
# environment (notebook instance / Studio) — confirm for other environments.
role = sagemaker.get_execution_role()
# Default S3 bucket of the session; the processed training data is stored here.
bucket = sagemaker_session.default_bucket()
# Key prefix under which this demo's objects are placed in the bucket.
prefix = "nlp-model-demo"



In [3]:
# Load the raw reviews and keep only the columns needed for sentiment labelling.
df = pd.read_csv("/content/Reviews.csv")
df = df[["Text", "Score"]].dropna()

# Binary label: scores above 3 are positive (1), everything else is negative (0).
# (The original used the undefined name `o` here, which raised NameError.)
df["Sentiment"] = (df["Score"] > 3).astype(int)
df = df[["Text", "Sentiment"]]
df.to_csv("processed_reviews.csv", index=False)

# A boto3 ServiceResource has no upload_file(); uploads go through a Bucket object,
# which also makes the destination bucket explicit.
s3 = boto3.resource("s3")
s3_key = f"{prefix}/data/processed_reviews.csv"
s3.Bucket(bucket).upload_file("processed_reviews.csv", s3_key)

# The S3 URI is derived from the same key so it cannot drift from the upload target.
s3_train_data = f"s3://{bucket}/{s3_key}"
print("Data uploaded to:", s3_train_data)

In [4]:
%%writefile train.py
import argparse
import os
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def train():
    """Fit a TF-IDF + logistic-regression sentiment pipeline and save it with joblib.

    The training directory comes from ``--train_data`` and falls back to the
    ``SM_CHANNEL_TRAIN`` environment variable. It must contain
    ``processed_reviews.csv`` with ``Text`` and ``Sentiment`` columns.

    The fitted pipeline is written to ``model.joblib`` inside the directory named
    by ``SM_MODEL_DIR`` (default ``/opt/ml/model``).

    Exits with an argparse usage error when no training directory is available.
    """
    parser = argparse.ArgumentParser()
    # .get() instead of [...] so the script can still start (and honour an explicit
    # --train_data) when SM_CHANNEL_TRAIN is not set, e.g. in a local run.
    parser.add_argument("--train_data", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    # Ignore arguments this script does not know about (e.g. extra hyperparameters)
    # instead of aborting the job.
    args, _ = parser.parse_known_args()
    if not args.train_data:
        parser.error("--train_data is required when SM_CHANNEL_TRAIN is not set")

    train_data_path = os.path.join(args.train_data, "processed_reviews.csv")
    df = pd.read_csv(train_data_path)
    # Empty review texts are read back from CSV as NaN, which TfidfVectorizer
    # cannot process; drop incomplete rows before fitting.
    df = df.dropna(subset=["Text", "Sentiment"])

    X = df["Text"].astype(str)
    y = df["Sentiment"]

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(stop_words="english")),
        ("clf", LogisticRegression())
    ])

    pipeline.fit(X, y)

    # Prefer the model directory announced via SM_MODEL_DIR; keep the previous
    # hard-coded path as the fallback.
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "model.joblib")
    joblib.dump(pipeline, model_path)
    print("Model saved at", model_path)


if __name__ == "__main__":
    train()


Writing train.py


In [5]:
from sagemaker.sklearn.estimator import SKLearn

# Settings for the training job that runs train.py.
estimator_kwargs = {
    "entry_point": "train.py",
    "framework_version": "0.23-1",
    "instance_type": "ml.m5.large",
    "role": role,
    "sagemaker_session": sagemaker_session,
}
sklearn_estimator = SKLearn(**estimator_kwargs)

# The uploaded CSV is passed as the "train" channel; train.py reads that channel's
# location from SM_CHANNEL_TRAIN.
training_inputs = {"train": s3_train_data}
sklearn_estimator.fit(training_inputs)


In [6]:
%%writefile inference.py
import joblib
import os
import json
import pandas as pd

def model_fn(model_dir):
    """Load and return the object stored in ``model.joblib`` inside ``model_dir``."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))

def input_fn(request_body, request_content_type):
    """Decode a JSON request body into a DataFrame with a single ``Text`` column.

    Args:
        request_body: JSON text (``str`` or ``bytes``). Accepted payloads are a
            list of strings, a single string, or an object with a ``Text`` key.
        request_content_type: Content type of the request. Only
            ``application/json`` is supported; parameters such as
            ``; charset=utf-8`` are ignored.

    Returns:
        pandas.DataFrame with one ``Text`` column.

    Raises:
        ValueError: If the content type is not ``application/json``.
    """
    # Compare only the media type so "application/json; charset=utf-8" is accepted.
    media_type = (request_content_type or "").split(";")[0].strip().lower()
    if media_type != "application/json":
        raise ValueError("Unsupported content type: {}".format(request_content_type))

    data = json.loads(request_body)
    # A bare string is a single review; wrap it so the DataFrame constructor
    # receives a one-row list instead of failing on a scalar.
    if isinstance(data, str):
        data = [data]
    return pd.DataFrame(data, columns=["Text"])

def predict_fn(input_data, model):
    """Run ``model.predict`` on the ``Text`` column and return the result as a list."""
    texts = input_data["Text"]
    predictions = model.predict(texts)
    return predictions.tolist()


Writing inference.py


In [7]:
from sagemaker.sklearn.model import SKLearnModel

# Artifact location reported by the fitted estimator.
model_data = sklearn_estimator.model_data

# Wrap the artifact together with the custom inference handlers in inference.py.
model_kwargs = dict(
    model_data=model_data,
    role=role,
    entry_point="inference.py",
    framework_version="0.23-1",
    sagemaker_session=sagemaker_session,
)
sklearn_model = SKLearnModel(**model_kwargs)

# Deploy to a single ml.m5.large instance; the returned predictor is used to invoke it.
predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)



In [8]:
import json

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

# inference.py's input_fn only accepts "application/json". The predictor's default
# serializer would send the payload as NumPy data instead, so switch the predictor
# to JSON in both directions before invoking the endpoint.
predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()

# Pass the Python list itself: the serializer does the JSON encoding, so encoding
# it here as well would deliver a single JSON string rather than a list of texts.
test_data = ["This product is amazing!", "Worst product ever."]
response = predictor.predict(test_data)

print("Predictions:", response)


In [9]:
# Delete the endpoint behind `predictor` (created by deploy() above) now that the demo is done.
predictor.delete_endpoint()