In [None]:
%pip install pandas
%pip install scikit-learn==1.2.1
%pip install boto3
%pip install sagemaker

In [None]:
import warnings

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load Dataset

In [None]:
import pandas as pd
import boto3
from urllib.parse import urlparse
from io import StringIO

# S3 client used to fetch the raw dataset
s3 = boto3.client('s3')

# Location of the CSV file, expressed as an S3 URI
s3_uri = 's3://pathto/file.csv'

# Split the URI into the bucket (netloc) and the object key (path without its leading slash)
parsed_uri = urlparse(s3_uri)
bucket_name, csv_file_key = parsed_uri.netloc, parsed_uri.path.lstrip('/')

# Download the object and decode its bytes as UTF-8 text
response = s3.get_object(Bucket=bucket_name, Key=csv_file_key)
csv_content = response['Body'].read().decode('utf-8')

# Parse the text into a DataFrame, then overwrite the column labels.
# NOTE(review): read_csv takes the first row as the header before the labels are
# replaced -- confirm the file really has a header row, otherwise one record is lost.
csv_buffer = StringIO(csv_content)
df = pd.read_csv(csv_buffer)
df.columns = ["true_label", "text"]

# Report the row count and preview the first rows
print(len(df))
print(df.head())

In [None]:
# Strip leading and trailing whitespace from the 'text' column
df['text'] = df['text'].str.strip()

# Compare labels as stripped text so that empty and whitespace-only labels are
# caught as well as the literal two-quote string '""'. The label values themselves
# are left untouched; this series is only used to build the filter.
label_text = df['true_label'].astype(str).str.strip()

# Keep a row only when its text is non-empty, its label is non-empty, and neither
# column is NaN. One combined mask means the index is rebuilt only once.
keep_rows = (
    df['text'].ne('')
    & ~label_text.isin(['', '""'])
    & df[['true_label', 'text']].notna().all(axis=1)
)
df = df[keep_rows].reset_index(drop=True)

print(len(df))
print(df.head())

In [None]:
# Distinct label values: how many there are, then the values themselves
unique_labels = df['true_label'].unique()
print('Unique labels:')
print(len(unique_labels))
print(unique_labels)

In [None]:
# Number of rows per label, most frequent first
label_counts = df['true_label'].value_counts()
print(label_counts)

# Vectorize DF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words vectorizer fed with raw strings, English stop words removed
vectorizer = CountVectorizer(stop_words='english', input='content')
# TF-IDF alternative, kept for experimentation:
# vectorizer = TfidfVectorizer(input='content', stop_words='english')

In [None]:
# Build the document-term matrix from the 'text' column using the vectorizer
# created above (a CountVectorizer unless the TF-IDF line was swapped in)
dtm = vectorizer.fit_transform(df['text'])

# Densify the sparse matrix into a DataFrame with one column per vocabulary term
vocabulary = vectorizer.get_feature_names_out()
df_vect = pd.DataFrame(dtm.toarray(), columns=vocabulary)

# Put the class label in the first column (pandas aligns the values on the row index)
df_vect.insert(loc=0, column='LABEL', value=df['true_label'])

# From here on `df` refers to the vectorized frame
df = df_vect
print(df.head())

# Clean Vectorized Dataset

In [None]:
# Import NLTK
import nltk

# Fetch the NLTK "words" corpus
nltk.download("words")

# Materialize the corpus as a set so membership tests are fast
corpus_words = nltk.corpus.words.words()
english_words = set(corpus_words)

def contains_numbers(s):
    """Return True when at least one character of ``s`` is a digit, else False."""
    for ch in s:
        if ch.isdigit():
            return True
    return False

# Every column except 'LABEL' is a vocabulary term. A term is dropped when it
# contains a digit, is longer than 15 characters, contains an underscore, is
# shorter than 3 characters, or is not in the NLTK English word list.
# 'LABEL' holds the target, not a term, so it is excluded explicitly; otherwise
# it would survive only if the word "label" happens to be in the dictionary.
columns_to_drop = [
    column
    for column in df.columns
    if column != 'LABEL'
    and (
        contains_numbers(column)
        or len(column) > 15
        or '_' in column
        or len(column) < 3
        or column.lower() not in english_words
    )
]

df = df.drop(columns=columns_to_drop)

print(df.head())

# Split into Training/Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed seed keeps the split (and every
# metric computed from it) reproducible across kernel restarts.
TrainSet, TestSet = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Pull the target column out of each split ...
TrainLabels, TestLabels = TrainSet['LABEL'], TestSet['LABEL']

# ... and keep only the feature columns in the frames themselves
TrainSet = TrainSet.drop(columns=["LABEL"])
TestSet = TestSet.drop(columns=["LABEL"])

In [None]:
# Row counts of each split and of the whole dataset.
# The total is the sum of the two lengths: adding the DataFrames themselves would
# perform an element-wise, index-aligned addition and build a full-size frame
# only to count its rows.
print('Training set size:')
print(len(TrainSet))
print('Testing set size:')
print(len(TestSet))
print('Total dataset set size:')
print(len(TrainSet) + len(TestSet))

# Train Naive Bayes Model

In [None]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Fit a multinomial Naive Bayes classifier on the training features
NBModel = MultinomialNB()
TrainedNB = NBModel.fit(TrainSet, TrainLabels)  # fit() returns the estimator itself

# Predicted class for every test row
PredictionNB = TrainedNB.predict(TestSet)
print(PredictionNB)

# Per-class probabilities for every test row, rounded to two decimals
class_probabilities = TrainedNB.predict_proba(TestSet)
print(np.round(class_probabilities, 2))

In [None]:
from sklearn import metrics

# Precision / recall / F1 per class for the Naive Bayes predictions
nb_report = metrics.classification_report(TestLabels, PredictionNB)
print(nb_report)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Confusion matrix whose rows/columns follow the class order of the fitted model
class_labels = TrainedNB.classes_
cnf_matrix = confusion_matrix(TestLabels, PredictionNB, labels=class_labels)
print("\nThe Model (Multinomial Naive Bayes) confusion matrix is:")
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix, display_labels=class_labels)

# Draw on an explicit Axes, then turn the x tick labels vertical
fig, ax = plt.subplots()
disp.plot(ax=ax)
plt.xticks(rotation=90)

plt.show()

In [None]:
# Per-class log probability of each feature, as learned by the Naive Bayes model
feature_log_probs = NBModel.feature_log_prob_

# Feature names, in the column order the model was trained on
feature_names = TrainSet.columns

# Map every class label to a {feature name: log probability} dictionary.
# Row i of feature_log_probs belongs to NBModel.classes_[i].
feature_importance = {
    label: dict(zip(feature_names, class_log_probs))
    for label, class_log_probs in zip(NBModel.classes_, feature_log_probs)
}

# For each class, list the ten features with the highest log probability.
# NOTE(review): a high log probability means the word is frequent within the class,
# which is not necessarily the same as being discriminative between classes.
for label, class_feature_importance in feature_importance.items():
    sorted_feature_importance = sorted(
        class_feature_importance.items(),
        key=lambda item: item[1],
        reverse=True,
    )

    print(f"Top 10 most significant words for class '{label}':")
    top_features = sorted_feature_importance[:10]
    for feature_name, log_prob in top_features:
        print(f"{feature_name}: {log_prob}")
    print("\n")

# Train Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with default hyper-parameters. The fixed seed makes the forest's
# internal randomness, and therefore the reported metrics, repeatable between runs.
RF = RandomForestClassifier(random_state=42)
trainedRF = RF.fit(TrainSet, TrainLabels)  # fit() returns the estimator itself
RF_pred = RF.predict(TestSet)

In [None]:
# Precision / recall / F1 per class for the Random Forest predictions
rf_report = metrics.classification_report(TestLabels, RF_pred)
print(rf_report)

In [None]:
# Build the confusion matrix with an explicit label order. Passing the model's
# classes keeps the matrix rows/columns in step with the tick labels given to
# ConfusionMatrixDisplay, even when a class is absent from the test split.
bn_matrix_RF_text = confusion_matrix(TestLabels, RF_pred, labels=trainedRF.classes_)
print("\nThe confusion matrix (Random Forest) is:")
# print(bn_matrix_RF_text)
disp = ConfusionMatrixDisplay(confusion_matrix=bn_matrix_RF_text,
                              display_labels=trainedRF.classes_)

# Plot the confusion matrix
fig, ax = plt.subplots()
disp.plot(ax=ax)

# Rotate the x-axis tick labels
plt.xticks(rotation=90)

plt.show()

In [None]:
# Importance score of every feature, taken from the fitted Random Forest
feature_importances = trainedRF.feature_importances_

# Pair each training column with its score: {feature name: importance}
feature_importance_rf = dict(zip(TrainSet.columns, feature_importances))

# Rank the features from most to least important
sorted_feature_importance_rf = sorted(
    feature_importance_rf.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Show the twenty highest-ranked features
print("Top 20 most significant words for Random Forest:")
top_features_rf = sorted_feature_importance_rf[:20]
for feature_name, importance_score in top_features_rf:
    print(f"{feature_name}: {importance_score}")

# Train Support Vector Machine (SVM)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max on the training data only, then rescale both splits
# with those statistics (training features land in [-1, 1]) for use by the SVM
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(TrainSet)
TrainSetScaled, TestSetScaled = (scaling.transform(split) for split in (TrainSet, TestSet))

In [None]:
# Display the scaled test features as a quick visual sanity check
TestSetScaled

In [None]:
from sklearn.svm import LinearSVC
import time
from tqdm import tqdm


# Linear SVM classifier; verbose=True turns on the solver's own logging
SVM_Model = LinearSVC(
    C=10,
    max_iter=10000,
    dual=True,
    verbose=True,
)

# Wall-clock timer covering training, prediction and the print below
start_time = time.time()

# Training. The bar is updated once, after fit() returns, so it only marks
# completion rather than showing incremental progress.
n_train = len(TrainSetScaled)
with tqdm(total=n_train) as pbar:
    trainedSVM = SVM_Model.fit(TrainSetScaled, TrainLabels)
    pbar.update(n_train)

# Prediction on the scaled test split, with the same completion-only bar
n_test = len(TestSetScaled)
with tqdm(total=n_test) as pbar:
    predSVM = SVM_Model.predict(TestSetScaled)
    pbar.update(n_test)

# Show the predicted labels
print(predSVM)

# Stop the timer and break the elapsed seconds into whole minutes and seconds
end_time = time.time()
elapsed_time = end_time - start_time
whole_minutes, leftover_seconds = divmod(elapsed_time, 60)
minutes = int(whole_minutes)
seconds = int(leftover_seconds)

print(f"Time taken: {minutes} minutes and {seconds} seconds")

In [None]:
# Precision / recall / F1 per class for the SVM predictions
svm_report = metrics.classification_report(TestLabels, predSVM)
print(svm_report)

In [None]:
# Confusion matrix whose rows/columns follow the class order of the fitted SVM
svm_class_labels = trainedSVM.classes_
cnf_matrix = confusion_matrix(TestLabels, predSVM, labels=svm_class_labels)
print("\nThe Model (SVM with Linear SVC) confusion matrix is:")
# print(cnf_matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix, display_labels=svm_class_labels)

# Draw on an explicit Axes, then turn the x tick labels vertical
fig, ax = plt.subplots()
disp.plot(ax=ax)
plt.xticks(rotation=90)

plt.show()

# Save and Deploy Model

In [None]:
import joblib
import tarfile

# Save the trained models. `trainedRF` is the Random Forest fitted in the
# "Train Random Forest Model" section; `TrainedNB` is the Naive Bayes equivalent.
# joblib.dump(TrainedNB, 'NB_model.joblib')
joblib.dump(trainedRF, 'RF_model.joblib')

# Package the serialized model as a gzipped tarball for upload
with tarfile.open('RF_model.tar.gz', 'w:gz') as tar:
    tar.add('RF_model.joblib')

# Upload the models to an S3 bucket
s3 = boto3.client('s3')
bucket_name = 'mybucket'
# s3.upload_file('NB_model.joblib', bucket_name, 'model_endpoints/NB_model.joblib')
s3.upload_file('RF_model.tar.gz', bucket_name, 'model_endpoints/RF_model.tar.gz')

print('Files uploaded to S3!')

In [None]:
# TEST MODEL OUT LOCALLY
import pandas as pd

# Reload the serialized model from disk.
# NOTE: joblib.load unpickles the file; only load artifacts you produced yourself.
model_path = 'RF_model.joblib'
model = joblib.load(model_path)

# Run inference on the held-out feature matrix and show the first prediction
predictions = model.predict(TestSet)
print(predictions[0])

# Show the ground-truth label of the same row. Labels are never binarized in
# this notebook, so they are used as-is.
original_labels = TestLabels.to_numpy()
print(original_labels[0])

In [None]:
import boto3
import sagemaker
from datetime import datetime
from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements
from sagemaker.predictor import Predictor
from sagemaker.enums import EndpointType
from sagemaker.model import Model
from sagemaker.session import Session

# An explicit SageMaker session is not created in this cell:
# sagemaker_session = sagemaker.Session()

# Parts of the S3 location of the packaged model artifact
s3_bucket = "mybucket"              # bucket that holds the artifact
bucket_prefix = "model_endpoints"   # "directory" inside the bucket
model_filename = "RF_model.tar.gz"  # artifact file name

# Key relative to the bucket, then the full s3:// URL of the model
model_s3_key = f"{bucket_prefix}/{model_filename}"
model_url = f"s3://{s3_bucket}/{model_s3_key}"
print('model_url:')
print(model_url)

# Alternate Method

In [None]:
# Low-level SageMaker API client, plus an SDK session created here so the region
# is available in this cell (no session exists yet at this point in the notebook)
client = boto3.client(service_name="sagemaker")
sess = sagemaker.Session()
region = sess.boto_region_name

# Retrieve the scikit-learn serving image. The version matches the scikit-learn
# 1.2.x release installed at the top of the notebook; a container with an older
# scikit-learn may fail to unpickle a model saved by a newer one.
image_uri = sagemaker.image_uris.retrieve(
    framework="sklearn",
    region=region,
    version="1.2-1",
    py_version="py3",
    instance_type="ml.m5.xlarge",
)

# Step 1: Model Creation
# Timestamped name so repeated runs do not collide
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y-%H-%M-%S")
model_name = "RF-model-v1-"+dt_string
role = sagemaker.get_execution_role()

print("Model name: " + model_name)
create_model_response = client.create_model(
    ModelName=model_name,
    Containers=[
        {
            "Image": image_uri,
            "ModelDataUrl": model_url,
        }
    ],
    ExecutionRoleArn=role,
)
print("Model Arn: " + create_model_response["ModelArn"])
print(region)

In [None]:
from time import gmtime, strftime

# Step 2: endpoint configuration (EPC)
#
# Serverless variant, kept for reference. Swap it in for the instance-backed
# configuration below if serverless inference is wanted:
# sklearn_epc_name = "sklearn-epc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# response = client.create_endpoint_config(
#    EndpointConfigName=sklearn_epc_name,
#    ProductionVariants=[
#         {
#             "ModelName": model_name,
#             "VariantName": "sklearnvariant",
#             "ServerlessConfig": {
#                 "MemorySizeInMB": 2048,
#                 "MaxConcurrency": 20
#             }
#         }
#     ]
# )
#
# Instance-backed variant: one ml.m5.xlarge instance serving `model_name`
epc_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
sklearn_epc_name = "sklearn-epc" + epc_timestamp

production_variant = {
    "VariantName": "sklearnvariant",
    "ModelName": model_name,
    "InstanceType": "ml.m5.xlarge",
    "InitialInstanceCount": 1,
}
endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=sklearn_epc_name,
    ProductionVariants=[production_variant],
)
print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

In [None]:
# Step 3: create the endpoint from the configuration built in step 2.
# The UTC timestamp suffix keeps the endpoint name unique per run.
ep_timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "sklearn-local-ep" + ep_timestamp

create_endpoint_response = client.create_endpoint(
    EndpointConfigName=sklearn_epc_name,
    EndpointName=endpoint_name,
)
print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

# End of Alternate Method

In [None]:
# Compute resources requested for the deployed model.
# No accelerator is requested: the endpoint is deployed on ml.m5.xlarge, a
# CPU-only instance type, so an accelerator requirement cannot be satisfied there.
# Add "num_accelerators" back to `requests` when deploying on a GPU instance type.
resources = ResourceRequirements(
    requests={
        "num_cpus": 2,  # Number of CPU cores required
        "memory": 8192,  # Minimum memory required in MB (required)
        "copies": 1,  # Number of model copies
    },
    limits={
        "num_cpus": 4,  # Maximum number of CPU cores allowed
        "memory": 16384,  # Maximum memory allowed in MB
    },
)

# Timestamped model name so repeated runs do not collide
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y-%H-%M-%S")
model_name = "RF-model-v1-"+dt_string

# Execution role passed to the Model object built in a later cell
role = sagemaker.get_execution_role()
print('Role:')
print(role)

In [None]:
# Show the container image URI currently bound to `image_uri`.
# NOTE(review): `image_uri` is assigned in the "Alternate Method" cell above and
# reassigned in the next cell, so this prints whichever of the two ran last.
print(image_uri)

In [None]:
from sagemaker import image_uris

# SDK session; its region selects which regional image URI is retrieved
sess = sagemaker.Session()
region = sess.boto_region_name

# scikit-learn 1.2-1 inference container for that region
image_uri = image_uris.retrieve(
    framework='sklearn',
    version='1.2-1',
    region=region,
    image_scope='inference',
)

# Describe the deployable model: container image, model artifact in S3, IAM role,
# resource requirements, and the predictor class that deploy() should return
model = Model(
    image_uri=image_uri,
    model_data=model_url,
    role=role,
    name=model_name,
    resources=resources,
    predictor_cls=Predictor,
)

print('model_name:')
print(model_name)

In [None]:
# Deploy the model to an inference-component-based endpoint on one instance
predictor = model.deploy(
    endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED,  # alternative: EndpointType.MODEL_BASED
    instance_type="ml.m5.xlarge",  # alternative noted by the author: ml.p4d.24xlarge
    initial_instance_count=1,
    resources=resources,
)

In [None]:
# Send an inference request to the deployed endpoint.
# NOTE(review): `data` is not defined anywhere in this notebook, so this cell
# raises NameError on a fresh run. Define the request payload first, in whatever
# format the serving container expects -- TODO confirm.
result = predictor.predict(data)  # Send inference request

In [None]:
# Tear down the endpoint once testing is finished
predictor.delete_endpoint()