In [None]:
%pip install pandas
%pip install scikit-learn==1.2.1
%pip install boto3
%pip install sagemaker

In [None]:
import warnings

# Silence FutureWarning messages for the remainder of the session
warnings.filterwarnings('ignore', category=FutureWarning)

# Load Dataset

In [None]:
import pandas as pd
import boto3
from urllib.parse import urlparse
from io import StringIO

# Location of the labelled CSV file in S3
s3_uri = 's3://pathto/file.csv'

# Split the URI into bucket (netloc) and object key (path without the leading slash)
parsed_uri = urlparse(s3_uri)
bucket_name, csv_file_key = parsed_uri.netloc, parsed_uri.path.lstrip('/')

# Fetch the object and decode its bytes as UTF-8 text
s3 = boto3.client('s3')
response = s3.get_object(Bucket=bucket_name, Key=csv_file_key)
csv_content = response['Body'].read().decode('utf-8')

# Parse the CSV text and give the two columns fixed names
df = pd.read_csv(StringIO(csv_content))
df.columns = ["true_label", "text"]

# Sanity check: row count and first rows
print(len(df))
print(df.head())

In [None]:
# Trim surrounding whitespace so whitespace-only texts become empty strings
df['text'] = df['text'].str.strip()

# Keep only rows with a non-empty text and no missing value in either column
has_text = df['text'] != ''
is_complete = df[['true_label', 'text']].notna().all(axis=1)
df = df[has_text & is_complete].reset_index(drop=True)

print(len(df))
print(df.head())

In [None]:
# Show how many distinct labels there are, then the labels themselves
unique_labels = df['true_label'].unique()
print('Unique labels:')
print(len(unique_labels))
print(unique_labels)

In [None]:
# Number of rows per label
label_frequencies = df['true_label'].value_counts()
print(label_frequencies)

In [None]:
# Labels that occur fewer than this many times are removed from the dataset
minimum_occurrences = 20  # Adjust this number as needed

# Row count per label
label_counts = df['true_label'].value_counts()

# Labels that reach the threshold
is_frequent = label_counts >= minimum_occurrences
labels_to_keep = label_counts[is_frequent].index

# Keep only the rows whose label reached the threshold
has_kept_label = df['true_label'].isin(labels_to_keep)
df = df[has_kept_label].reset_index(drop=True)

# df now contains only labels with at least `minimum_occurrences` rows
print(df['true_label'].value_counts())

# Convert to Multilabel

In [None]:
def join_labels(label_list):
    """Serialize an iterable of labels as a Python list literal, e.g. ``['a','b']``.

    Each label is converted to ``str`` and written with ``repr`` so that quotes,
    backslashes and newlines inside a label are escaped. The resulting string can
    therefore always be parsed back into a list of strings. For labels without
    such characters the output is the same as wrapping each one in single quotes.

    Args:
        label_list: Iterable of labels (any objects convertible with ``str``).

    Returns:
        A string of the form ``[<label>,<label>,...]``; ``[]`` for no labels.
    """
    return '[' + ','.join(repr(str(label)) for label in label_list) + ']'

# Collapse rows that share the same text into one row, combining their labels
# into a single list-style string, and put the label column first
grouped_labels = df.groupby('text')['true_label'].agg(join_labels)
df = grouped_labels.reset_index()[['true_label', 'text']]

print(len(df))
print(df.head())

# Vectorize DF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Token-count vectorizer over raw text content, ignoring English stop words
vectorizer = CountVectorizer(stop_words='english', input='content')
# Alternative: TF-IDF weighting instead of raw counts
# vectorizer = TfidfVectorizer(input='content', stop_words='english')

In [None]:
# Fit the vectorizer on the texts and build the document-term matrix
dtm = vectorizer.fit_transform(df['text'])

# Expand the matrix into a dense DataFrame with one column per vocabulary term
feature_names = vectorizer.get_feature_names_out()
df_vect = pd.DataFrame(dtm.toarray(), columns=feature_names)

# Place the label strings in front of the term columns
df_vect.insert(loc=0, column='LABEL', value=df['true_label'])

# From here on `df` refers to the vectorized frame
df = df_vect
print(df.head())

# Clean Vectorized Dataset

In [None]:
# NLTK provides the word list used to filter the vocabulary
import nltk

# Fetch the 'words' corpus
nltk.download('words')

# Collect the corpus entries in a set for fast membership tests
english_words = set()
english_words.update(nltk.corpus.words.words())

def contains_numbers(s):
    """Return True if at least one character of ``s`` is a digit, else False."""
    for char in s:
        if char.isdigit():
            return True
    return False

# Drop term columns that contain digits or underscores, are shorter than 3 or longer
# than 15 characters, or are not found in the English word list.
# 'LABEL' holds the targets rather than a vocabulary term, so it is excluded from the
# check explicitly instead of relying on the word "label" being in the word list.
columns_to_drop = [
    column
    for column in df.columns
    if column != 'LABEL' and (
        contains_numbers(column)
        or len(column) > 15
        or '_' in column
        or len(column) < 3
        or column.lower() not in english_words
    )
]

df = df.drop(columns=columns_to_drop)

print(df.head())

# Binarize Labels for Multilabel Classification

In [None]:
import ast

from sklearn.preprocessing import MultiLabelBinarizer

# Separate features (vectorized text) and labels
X = df.drop(columns=['LABEL'])
y = df['LABEL']

# Convert the list-literal strings back into lists of labels.
# literal_eval only accepts Python literals, so label text that originated in the
# dataset cannot execute arbitrary code the way eval() would allow.
y = y.apply(ast.literal_eval)

# Convert the label lists into a binary indicator array (one column per label)
mlb = MultiLabelBinarizer()
y_binary = mlb.fit_transform(y)
print(y_binary)

In [None]:
# Number of label classes known to the binarizer, followed by the classes themselves
label_classes = mlb.classes_
print(len(label_classes))
print(label_classes)

In [None]:
# Round-trip check on the first sample: label list -> indicator row -> decoded labels
debinarized_labels = mlb.inverse_transform(y_binary)
first_original, first_binary, first_decoded = y[0], y_binary[0], debinarized_labels[0]
print(first_original)  # Original list of labels
print(first_binary)  # Binarized representation
print(first_decoded)

# Split into Training/Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the held-out feature matrix as the cell output
X_test

In [None]:
# For two test samples, print the indicator row followed by its decoded labels
decoded_test_labels = mlb.inverse_transform(y_test)
for position in (0, 12):
    print(y_test[position])
    print(decoded_test_labels[position])

In [None]:
# Report the number of samples in each split and in total
print('Training set size:')
print(len(X_train))
print('Testing set size:')
print(len(X_test))
print('Total dataset set size:')
# Add the two lengths. `X_train + X_test` would add the DataFrames element-wise
# (aligned on index), building a full-size throwaway frame just to count its rows.
print(len(X_train) + len(X_test))

# Train Naive Bayes Model

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier

# Wrap a Multinomial Naive Bayes estimator so that one copy is fitted per label column
base_classifier = MultinomialNB()  # any classifier can serve as the base estimator
multi_label_classifierNB = MultiOutputClassifier(estimator=base_classifier)
multi_label_classifierNB.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Predict the label indicators of the held-out samples with the Naive Bayes model
y_pred = multi_label_classifierNB.predict(X_test)

# Score the predictions, then print accuracy followed by the per-label report
nb_accuracy = accuracy_score(y_test, y_pred)
nb_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print('Accuracy:')
print(nb_accuracy)
print("\nClassification Report:")
print(nb_report)

# Train Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Train the multilabel classifier with Random Forest (one forest per label column).
# The forest is seeded with the same value as the train/test split so that the metrics
# and the model artifact saved later are reproducible across runs.
base_classifier = RandomForestClassifier(random_state=42)
multi_label_classifierRF = MultiOutputClassifier(base_classifier)
multi_label_classifierRF.fit(X_train, y_train)

In [None]:
# Predict the label indicators of the held-out samples with the Random Forest model
y_pred = multi_label_classifierRF.predict(X_test)

# Score the predictions, then print accuracy followed by the per-label report
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print('Accuracy:')
print(rf_accuracy)
print("\nClassification Report:")
print(rf_report)

# Train Support Vector Machine (SVM)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the training features only, then map both splits into [-1, 1]
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(X_train)
TrainSetScaled, TestSetScaled = scaling.transform(X_train), scaling.transform(X_test)

In [None]:
# Display the scaled training features as the cell output
TrainSetScaled

In [None]:
# Display the scaled test features as the cell output
TestSetScaled

In [None]:
import time
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Train the multilabel classifier with Support Vector Machine.
# The [-1, 1] min-max scaling is bundled with the classifier in one pipeline so the SVM is
# actually trained on scaled features, and the identical scaling (fitted on the training
# data) is re-applied automatically whenever the model predicts on raw feature frames.
base_classifier = SVC()  # Using Support Vector Classifier

# Number of training samples, used as the progress bar total
num_samples = len(X_train)

# Start the timer
start_time = time.time()

# fit() reports no incremental progress, so the bar simply jumps to 100% once training is done
with tqdm(total=num_samples, desc="Training progress", unit="sample") as pbar:
    multi_label_classifierSVM = make_pipeline(
        MinMaxScaler(feature_range=(-1, 1)),
        OneVsRestClassifier(base_classifier),
    )
    multi_label_classifierSVM.fit(X_train, y_train)
    pbar.update(num_samples)  # Update the progress bar to completion

# End the timer
end_time = time.time()
training_time = end_time - start_time
print("Training time:", training_time, "seconds")

In [None]:
# Predict the label indicators of the held-out samples with the SVM model
y_pred = multi_label_classifierSVM.predict(X_test)

# Score the predictions, then print accuracy followed by the per-label report
svm_accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print('Accuracy:')
print(svm_accuracy)
print("\nClassification Report:")
print(svm_report)

# Save and Deploy Model

In [None]:
import joblib
import tarfile

# Persist the trained Random Forest model to disk
# joblib.dump(multi_label_classifierNB, 'NB_model.joblib')
joblib.dump(multi_label_classifierRF, 'RF_model.joblib')

# Package the model file as a gzip-compressed tarball
with tarfile.open('RF_model.tar.gz', mode='w:gz') as tar:
    tar.add('RF_model.joblib')

# Upload the tarball to the S3 bucket under the model_endpoints/ prefix
bucket_name = 'mybucket'
s3 = boto3.client('s3')
# s3.upload_file('NB_model.joblib', bucket_name, 'model_endpoints/NB_model.joblib')
s3.upload_file(
    Filename='RF_model.tar.gz',
    Bucket=bucket_name,
    Key='model_endpoints/RF_model.tar.gz',
)

print('Files uploaded to S3!')

In [None]:
# TEST MODEL OUT LOCALLY
import pandas as pd

# Reload the saved Random Forest model from disk
model_path = 'RF_model.joblib'
model = joblib.load(filename=model_path)

# Run inference on the held-out features and show the first predicted indicator row
predictions = model.predict(X_test)
first_prediction = predictions[0]
print(first_prediction)

# Decode the true labels of the test set and show those of the first sample
original_labels = mlb.inverse_transform(y_test)
first_true_labels = original_labels[0]
print(first_true_labels)

In [None]:
import boto3
import sagemaker
from datetime import datetime
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements
from sagemaker.predictor import Predictor
from sagemaker.enums import EndpointType
from sagemaker.model import Model
from sagemaker.session import Session

# Create a SageMaker session
# sagemaker_session = sagemaker.Session()

# Assemble the S3 URL of the model artifact from its parts
s3_bucket = "mybucket"              # name of the S3 bucket
bucket_prefix = "model_endpoints"   # directory within the bucket that holds the model
model_filename = "RF_model.tar.gz"  # file name of the model artifact

# Relative S3 path: <prefix>/<file name>
model_s3_key = "/".join([bucket_prefix, model_filename])

# Full S3 model URL: s3://<bucket>/<relative path>
model_url = "s3://" + s3_bucket + "/" + model_s3_key
print('model_url:')
print(model_url)

# Alternate Method

In [None]:
client = boto3.client(service_name="sagemaker")
# The session is created in this cell so the region lookup works when the notebook is
# run top to bottom (no other cell has defined `sess` by this point).
sess = sagemaker.Session()
region = sess.boto_region_name

# Retrieve the scikit-learn container image.
# Version 1.2-1 matches the scikit-learn 1.2.1 release the model was trained and saved with;
# an older container may be unable to load the saved model.
image_uri = sagemaker.image_uris.retrieve(
    framework="sklearn",
    region=region,
    version="1.2-1",
    py_version="py3",
    instance_type="ml.m5.xlarge",
)

# Step 1: Model Creation
# The timestamp suffix gives each run its own model name
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y-%H-%M-%S")
model_name = "RF-model-v1-"+dt_string
role = sagemaker.get_execution_role()

print("Model name: " + model_name)
create_model_response = client.create_model(
    ModelName=model_name,
    Containers=[
        {
            "Image": image_uri,
            "ModelDataUrl": model_url,
        }
    ],
    ExecutionRoleArn=role,
)
print("Model Arn: " + create_model_response["ModelArn"])
print(region)

In [None]:
from time import gmtime, strftime

# Step 2: endpoint configuration (EPC) creation
#
# Serverless variant, kept for reference:
# sklearn_epc_name = "sklearn-epc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# response = client.create_endpoint_config(
#    EndpointConfigName=sklearn_epc_name,
#    ProductionVariants=[
#         {
#             "ModelName": model_name,
#             "VariantName": "sklearnvariant",
#             "ServerlessConfig": {
#                 "MemorySizeInMB": 2048,
#                 "MaxConcurrency": 20
#             }
#         }
#     ]
# )
#
# Synchronous (instance-backed) variant: one ml.m5.xlarge instance serving the model
epc_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
sklearn_epc_name = "sklearn-epc" + epc_timestamp
production_variant = {
    "VariantName": "sklearnvariant",
    "ModelName": model_name,
    "InstanceType": "ml.m5.xlarge",
    "InitialInstanceCount": 1,
}
endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=sklearn_epc_name,
    ProductionVariants=[production_variant],
)
print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

In [None]:
# Step 3: endpoint (EP) creation from the endpoint configuration defined above
ep_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "sklearn-local-ep" + ep_timestamp
create_endpoint_response = client.create_endpoint(
    EndpointConfigName=sklearn_epc_name,
    EndpointName=endpoint_name,
)
print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

# End of Alternate Method

In [None]:
# Compute resources for one copy of the model on the endpoint.
# No accelerators are requested: the notebook deploys to ml.m5.xlarge, a CPU-only
# instance type, so a model that requires an accelerator could not be placed on it.
# NOTE(review): re-add "num_accelerators" if the deployment is switched to a GPU instance type.
resources = ResourceRequirements(
    requests={
        "num_cpus": 2,   # Number of CPU cores required
        "memory": 8192,  # Minimum memory required in Mb (required)
        "copies": 1,     # Number of model copies
    },
    limits={
        "num_cpus": 4,    # Maximum number of CPU cores allowed
        "memory": 16384,  # Maximum memory allowed in Mb
    },
)

# The timestamp suffix gives each run its own model name
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y-%H-%M-%S")
model_name = "RF-model-v1-"+dt_string

# Execution role used when building the model with the Model class
role = sagemaker.get_execution_role()
print('Role:')
print(role)

In [None]:
# Print the container image URI currently held in `image_uri`
print(image_uri)

In [None]:
from sagemaker import image_uris

# Session and region used to look up the container image
sess = sagemaker.Session()
region = sess.boto_region_name

# scikit-learn 1.2-1 inference image for that region
image_uri = image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='1.2-1',
    image_scope='inference',
)

# Describe the model: container image, artifact in S3, execution role and resource needs
model_settings = {
    "name": model_name,
    "image_uri": image_uri,
    "model_data": model_url,
    "role": role,
    "resources": resources,
    "predictor_cls": Predictor,
}
model = Model(**model_settings)

print('model_name:')
print(model_name)

In [None]:
# Deploy the model to a single-instance, inference-component-based endpoint
deploy_options = {
    "initial_instance_count": 1,
    "instance_type": "ml.m5.xlarge",  # ml.p4d.24xlarge
    "endpoint_type": EndpointType.INFERENCE_COMPONENT_BASED,  # MODEL_BASED INFERENCE_COMPONENT_BASED
    "resources": resources,
}
predictor = model.deploy(**deploy_options)

In [None]:
from sagemaker.serializers import CSVSerializer

# Send the feature rows as text/csv; the predictor's default serializer passes data through untouched
predictor.serializer = CSVSerializer()

# Payload: the first held-out feature row (no other cell defines `data`)
data = X_test.iloc[:1].to_numpy()

result = predictor.predict(data)  # Send inference request

In [None]:
# Delete the endpoint behind `predictor` (the one returned by model.deploy)
predictor.delete_endpoint()