# Model - 1 - CNN Model (Best Model)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score

# Load the dataset
#df = pd.read_csv('data.csv')

# Preprocess the data (e.g., handle missing values, normalize/scale features)

# Split the dataset into training and validation sets
df = pd.read_csv('train.csv')
train_data1 = df.iloc[:, [7, 8, 21, 22, 24, 25, 27, 29]]
train_labels = df.iloc[:, [0]]
dftest = pd.read_csv('test.csv')
val_data1 = dftest.iloc[:, [7, 8, 21, 22, 24, 25, 27, 29]]
val_labels = dftest.iloc[:, [0]]

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(train_data1)

# Scale the training data
train_data = scaler.transform(train_data1)
train_l = train_labels.to_numpy()
# Scale the validation data
val_data = scaler.transform(val_data1)
val_l = val_labels.to_numpy()
# Reshape the data for CNN input
train_data = train_data.reshape(train_data.shape[0], train_data.shape[1], 1)
val_data = val_data.reshape(val_data.shape[0], val_data.shape[1], 1)

# Define the CNN model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(train_data.shape[1], 1)))
model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Define the number of folds for cross-validation
num_folds = 7

# Perform k-fold cross-validation
kf = KFold(n_splits=num_folds, shuffle=True)

train_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
auc_roc_scores = []
val_scores = []
best_model = None
best_auc_roc = 0.0
for train_index, val_index in kf.split(train_data):
    # Split the data into training and validation sets for the current fold
    train_fold_data, val_fold_data = train_data[train_index], train_data[val_index]
    train_fold_labels, val_fold_labels = train_l[train_index], train_l[val_index]

    # Train the model on the current fold
    history = model.fit(train_fold_data, train_fold_labels, epochs=50, batch_size=32, validation_data=(val_fold_data, val_fold_labels))

    # Evaluate the model on the training and validation sets of the current fold
    train_predictions = model.predict(train_fold_data)
    val_predictions = model.predict(val_fold_data)
    val_predictions = np.round(val_predictions)

    # Calculate accuracy scores for training and validation sets
    train_acc = model.evaluate(train_fold_data, train_fold_labels, verbose=0)[1]
    val_acc = model.evaluate(val_fold_data, val_fold_labels, verbose=0)[1]

    train_scores.append(train_acc)
    val_scores.append(val_acc)
    
    auc_roc = roc_auc_score(val_fold_labels, val_predictions)
    
    precision = precision_score(val_fold_labels, val_predictions)
    recall = recall_score(val_fold_labels, val_predictions)
    f1 = f1_score(val_fold_labels, val_predictions)
    auc_roc = roc_auc_score(val_fold_labels, val_predictions)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    auc_roc_scores.append(auc_roc)


# Compute the average scores across all folds
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)
avg_auc_roc = np.mean(auc_roc_scores)
avg_val_acc=np.mean(val_scores)
avg_train_acc=np.mean(train_scores)
print('Average Precision:', avg_precision)
print('Average Recall:', avg_recall)
print('Average F1-score:', avg_f1)
print('Average AUC-ROC:', avg_auc_roc)
print('average val accuracy',avg_val_acc)
print('average train accuracy',avg_train_acc)
# Calculate bias and variance

# Plot bias and variance
# plt.bar(['Bias', 'Variance'], [bias, variance])
# plt.xlabel('Error Type')
# plt.ylabel('Error Value')
# plt.title('Bias-Variance Tradeoff')
# plt.show()


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

# Model - 2 - ANN Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score

# Load the dataset
#df = pd.read_csv('data.csv')

# Preprocess the data (e.g., handle missing values, normalize/scale features)

# Split the dataset into training and validation sets
df = pd.read_csv('train.csv')
train_data1 = df.iloc[:, [7, 8, 21, 22, 24, 25, 27, 29]]
train_labels = df.iloc[:, [0]]
dftest = pd.read_csv('test.csv')
val_data1 = dftest.iloc[:, [7, 8, 21, 22, 24, 25, 27, 29]]
val_labels = dftest.iloc[:, [0]]

# Create an instance of StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data
scaler.fit(train_data1)

# Scale the training data
train_data = scaler.transform(train_data1)
train_l = train_labels.to_numpy()
# Scale the validation data
val_data = scaler.transform(val_data1)
val_l = val_labels.to_numpy()
# Reshape the data for CNN input
train_data = train_data.reshape(train_data.shape[0], train_data.shape[1], 1)
val_data = val_data.reshape(val_data.shape[0], val_data.shape[1], 1)

# Define the CNN model
model = Sequential()
model.add(Dense(32,activation='relu',input_shape=(8,)))
model.add(Dense(32,activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16,activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2,activation='softmax'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Define the number of folds for cross-validation
num_folds = 7

# Perform k-fold cross-validation
kf = KFold(n_splits=num_folds, shuffle=True)

train_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
auc_roc_scores = []
val_scores = []
best_model = None
best_auc_roc = 0.0
for train_index, val_index in kf.split(train_data):
    # Split the data into training and validation sets for the current fold
    train_fold_data, val_fold_data = train_data[train_index], train_data[val_index]
    train_fold_labels, val_fold_labels = train_l[train_index], train_l[val_index]

    # Train the model on the current fold
    history = model.fit(train_fold_data, train_fold_labels, epochs=50, batch_size=32, validation_data=(val_fold_data, val_fold_labels))

    # Evaluate the model on the training and validation sets of the current fold
    train_predictions = model.predict(train_fold_data)
    val_predictions = model.predict(val_fold_data)
    val_predictions = np.round(val_predictions)

    # Calculate accuracy scores for training and validation sets
    train_acc = model.evaluate(train_fold_data, train_fold_labels, verbose=0)[1]
    val_acc = model.evaluate(val_fold_data, val_fold_labels, verbose=0)[1]

    train_scores.append(train_acc)
    val_scores.append(val_acc)
    
    auc_roc = roc_auc_score(val_fold_labels, val_predictions)
    
    precision = precision_score(val_fold_labels, val_predictions)
    recall = recall_score(val_fold_labels, val_predictions)
    f1 = f1_score(val_fold_labels, val_predictions)
    auc_roc = roc_auc_score(val_fold_labels, val_predictions)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    auc_roc_scores.append(auc_roc)


# Compute the average scores across all folds
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)
avg_auc_roc = np.mean(auc_roc_scores)
avg_val_acc=np.mean(val_scores)
avg_train_acc=np.mean(train_scores)
print('Average Precision:', avg_precision)
print('Average Recall:', avg_recall)
print('Average F1-score:', avg_f1)
print('Average AUC-ROC:', avg_auc_roc)
print('average val accuracy',avg_val_acc)
print('average train accuracy',avg_train_acc)
# Calculate bias and variance

# Plot bias and variance
# plt.bar(['Bias', 'Variance'], [bias, variance])
# plt.xlabel('Error Type')
# plt.ylabel('Error Value')
# plt.title('Bias-Variance Tradeoff')
# plt.show()


In [30]:
#create a bucket bucket1cap
#in the anaconda prompt run aws configure  AKIATHSXV4W4GFVU6YI3  xSKizIhCJcdaCigAqImD8MjUeRT9ZOSDsMJd8xw3
#should create iam roles and iam user
#in the vs code create a virtual env called myenv (conda create -p myenv python=3.9)
#activate that myenv (conda activat myenv/)
#install all packages in that env
#before running estimator.fit run the below cell (import boto3 boto3.setup_default_session(region_name="ap-south-1") )

In [3]:
import sagemaker
import boto3
import pandas as pd
region_name = 'ap-south-1'
sm_boto3 = boto3.client("sagemaker",region_name=region_name)
sess = sagemaker.Session(boto_session=boto3.Session(region_name=region_name))


region = sess.boto_session.region_name

bucket = 'tranformeddatabucketcap' # Mention the created S3 bucket name here

print("Using bucket"+bucket)

Using buckettranformeddatabucketcap


In [36]:
df = pd.read_csv('train.csv')
train_data1 = df.iloc[:, [7, 8,9,30 ,23,25, 22,  26, 28,27, ]]
train_labels = df.iloc[:, [0]]
dftest = pd.read_csv('test.csv')
val_data1 = dftest.iloc[:, [7, 8,9,30,23,25, 22,  26,28, 27, ]]
val_labels = dftest.iloc[:, [0]]

In [37]:
train_data1.isnull().sum()

age      0
bmi      0
waist    0
fglu     0
c1p      0
alt      0
ghp      0
trig     0
ins      0
ggt      0
dtype: int64

In [38]:
val_data1.isnull().sum()

age      0
bmi      0
waist    0
fglu     0
c1p      0
alt      0
ghp      0
trig     0
ins      0
ggt      0
dtype: int64

In [9]:
# send data to S3. SageMaker will take training data from s3

sk_prefix = "nafld_detection"

trainpath = "s3://tranformeddatabucketcap/train.csv"

testpath = "s3://tranformeddatabucketcap/test.csv"
print(trainpath)
print(testpath)

s3://tranformeddatabucketcap/train.csv
s3://tranformeddatabucketcap/test.csv


In [6]:
%%writefile script.py

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, f1_score, roc_auc_score






def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))    
    return clf 
if __name__=="__main__":
    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()
    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

# Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR")) 
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")
    
    args, _ = parser.parse_known_args()

    #print("SKLearn Version: ", sklearn.version)
    #print("Joblib Version: ", joblib.version)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    features = list(train_df.columns)
    label=features.pop(0)

    print("Building training and testing datasets")
    print()
    X_train =train_df [features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print (features)
    print()
    print("Label column is: ", label)
    print()
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (70%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (30%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    print("Training RandomForest Model....")
    print()
    model=RandomForestClassifier(n_estimators=args.n_estimators,random_state=args.random_state)
    model.fit(X_train,y_train)
    print()
    model_path= os.path.join(args.model_dir,"model.joblib")
    joblib.dump(model,model_path)
    print("model persisted at "+ model_path)
    print()
    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)
    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)
    

Overwriting script.py


In [4]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
     entry_point="script.py",
     role="arn:aws:iam::222448444856:role/service-role/SageMaker-DataEngineer",
     instance_count=1,
     instance_type="ml.m5.large",
     framework_version=FRAMEWORK_VERSION,
     base_job_name="RF-custom-sklearn",
     use_spot_instances = True,
     max_wait=7200,
     max_run=3600,
     region_name="ap-south-1"
    )

In [10]:
sklearn_estimator.fit({"train":trainpath,"test":testpath},wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-04-12-10-17-32-243


2024-04-12 10:17:36 Starting - Starting the training job...
2024-04-12 10:17:51 Starting - Preparing the instances for training...
2024-04-12 10:18:35 Downloading - Downloading input data...
2024-04-12 10:19:05 Downloading - Downloading the training image...
2024-04-12 10:19:40 Training - Training image download completed. Training in progress...2024-04-12 10:19:45,581 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-04-12 10:19:45,585 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-12 10:19:45,629 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-04-12 10:19:45,776 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-12 10:19:45,789 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-12 10:19:45,800 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-04-12 10

In [2]:
import boto3

boto3.setup_default_session(region_name="ap-south-1")  # Or any other supported region


In [15]:
sklearn_estimator.latest_training_job.wait(logs="None") 
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name )["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at"+artifact)


2024-04-12 10:20:06 Starting - Preparing the instances for training
2024-04-12 10:20:06 Downloading - Downloading the training image
2024-04-12 10:20:06 Training - Training image download completed. Training in progress.
2024-04-12 10:20:06 Uploading - Uploading generated training model
2024-04-12 10:20:06 Completed - Training job completed
Model artifact persisted ats3://sagemaker-ap-south-1-222448444856/RF-custom-sklearn-2024-04-12-10-17-32-243/output/model.tar.gz


In [21]:
#creating end points is balance

In [16]:
artifact

's3://sagemaker-ap-south-1-222448444856/RF-custom-sklearn-2024-04-12-10-17-32-243/output/model.tar.gz'

In [19]:
from sagemaker.sklearn.model import SKLearnModel 
from time import gmtime, strftime
model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
name = model_name,
model_data=artifact,
role="arn:aws:iam::222448444856:role/service-role/SageMaker-DataEngineer",
entry_point="script.py",
framework_version=FRAMEWORK_VERSION,)

In [21]:
model_name

'Custom-sklearn-model-2024-04-12-10-41-59'

In [22]:
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) 
print("EndpointName={}".format(endpoint_name))
predictor = model.deploy(
initial_instance_count=1,
instance_type="ml.m4.xlarge",
endpoint_name=endpoint_name)

EndpointName=Custom-sklearn-model-2024-04-12-10-47-51


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-04-12-10-41-59
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-04-12-10-47-51
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-04-12-10-47-51


-----!

In [39]:
val_data1[0:2].values.tolist()

[[48.0, 25.0, 94.0, 271.70001, 0.211, 9.0, 9.1000004, 94.0, 48.490002, 15.0],
 [50.0,
  25.1,
  91.5,
  96.300003,
  0.29499999,
  18.0,
  5.9000001,
  204.0,
  8.9200001,
  22.0]]

In [42]:
print(predictor.predict(val_data1[0:4].values.tolist()))

[0 0 0 1]


In [43]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)

ClientError: An error occurred (ValidationException) when calling the DeleteEndpoint operation: Could not find endpoint "Custom-sklearn-model-2024-04-12-10-47-51".