## Data Preparation

In [68]:
import kagglehub
import os

# Fetch the Palmer Penguins dataset through kagglehub; `path` is the local download directory
path = kagglehub.dataset_download("satyajeetrai/palmer-penguins-dataset-for-eda")
print("Dataset downloaded to:", path)

# List every file below the download directory so the CSV can be located
downloaded_files = (
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(path)
    for filename in filenames
)
for file_path in downloaded_files:
    print(file_path)


Dataset downloaded to: /root/.cache/kagglehub/datasets/satyajeetrai/palmer-penguins-dataset-for-eda/versions/1
/root/.cache/kagglehub/datasets/satyajeetrai/palmer-penguins-dataset-for-eda/versions/1/penguins.csv


In [69]:
import pandas as pd

# Build the CSV location from the directory kagglehub returned instead of
# hard-coding the cache path, which is specific to one user, machine and
# dataset version. For the recorded run this resolves to the same file.
csv_file_path = os.path.join(path, "penguins.csv")

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Preview the first rows to confirm the file was parsed as expected
df.head()

Unnamed: 0,id,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,3,Adelie,Torgersen,,,,,,2007
4,4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [70]:
# Dimensions of the raw DataFrame as (rows, columns).
df.shape

(344, 9)

In [71]:
# Number of missing values in each column of the raw DataFrame.
df.isna().sum()


Unnamed: 0,0
id,0
species,0
island,0
bill_length_mm,2
bill_depth_mm,2
flipper_length_mm,2
body_mass_g,2
sex,11
year,0


In [72]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [73]:
# Summarize the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


In [74]:
# Number of distinct values per column (missing values are not counted by default).
df.nunique()

Unnamed: 0,0
id,344
species,3
island,3
bill_length_mm,164
bill_depth_mm,80
flipper_length_mm,55
body_mass_g,94
sex,2
year,3


In [75]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,id,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,344.0,342.0,342.0,342.0,342.0,344.0
mean,171.5,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,99.448479,5.459584,1.974793,14.061714,801.954536,0.818356
min,0.0,32.1,13.1,172.0,2700.0,2007.0
25%,85.75,39.225,15.6,190.0,3550.0,2007.0
50%,171.5,44.45,17.3,197.0,4050.0,2008.0
75%,257.25,48.5,18.7,213.0,4750.0,2009.0
max,343.0,59.6,21.5,231.0,6300.0,2009.0


In [76]:
# Print the distinct labels found in the species and island columns.
# Each prefix keeps its original trailing padding; print() itself inserts the
# single separating space between the prefix and the array.
for label_prefix, column_name in (
    ("Categories in 'species' variable:     ", 'species'),
    ("Categories in 'island' variable:  ", 'island'),
):
    print(label_prefix, df[column_name].unique())

Categories in 'species' variable:      ['Adelie' 'Gentoo' 'Chinstrap']
Categories in 'island' variable:   ['Torgersen' 'Biscoe' 'Dream']


In [77]:
# Keep the species label plus the four body measurements, then discard rows
# whose bill length is missing.
data = (
    df
    .drop(columns=['id', 'island', 'sex', 'year'])
    .dropna(subset=['bill_length_mm'])
)
data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181.0,3750.0
1,Adelie,39.5,17.4,186.0,3800.0
2,Adelie,40.3,18.0,195.0,3250.0
4,Adelie,36.7,19.3,193.0,3450.0
5,Adelie,39.3,20.6,190.0,3650.0


In [78]:
# Verify that no missing values remain in the reduced DataFrame.
data.isna().sum()

Unnamed: 0,0
species,0
bill_length_mm,0
bill_depth_mm,0
flipper_length_mm,0
body_mass_g,0


In [79]:
# Encode the species names as the integer class labels 0, 1 and 2.
species_codes = {'Adelie': 0, 'Gentoo': 1, 'Chinstrap': 2}

# One lookup pass replaces the three chained .replace() calls (the recorded run
# shows a pandas warning pointing at the last of them). Values that are not a
# known species name -- e.g. labels already encoded by an earlier run of this
# cell -- pass through unchanged, and astype raises if an unexpected string is
# left over instead of letting it reach the training data silently.
data['species'] = (
    data['species']
    .map(lambda label: species_codes.get(label, label))
    .astype('int64')
)

# Shuffle the rows. The fixed seed keeps the row order -- and therefore the
# seeded train/validation split that follows -- reproducible across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)


  data['species']=data['species'].replace('Chinstrap', 2)


In [80]:
from sklearn.model_selection import train_test_split

# Replace the column names with their positions 0..n-1 (species stays in column 0)
data.columns = range(len(data.columns))

# Split into training and validation sets: 80% / 20%, seeded for repeatability
train_data, val_data = train_test_split(data, random_state=42, test_size=0.2)


## Move Data into an S3 Bucket

In [79]:
import boto3

bucket_name = 'sagemaker-build-and-deploy-penguin-model'
s3_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

# Write each split to a local header-less, index-less CSV and upload it under
# its own key. Fixed: the original called `.csv(...)`, which is not a DataFrame
# method and raised AttributeError; the writer is `to_csv`.
# `key` and `url` end up holding the validation values, as before.
for split_frame, key in (
    (train_data, 'data/train/data'),
    (val_data, 'data/val/data'),
):
    split_frame.to_csv('data.csv', header=False, index=False)
    url = 's3://{}/{}'.format(bucket_name, key)
    s3_bucket.Object(key).upload_file('data.csv')

## Create Model

In [79]:
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

# S3 location passed to the estimator as its output path
key = 'model/xgbmodel'
s3_output_location = url = 's3://{}/{}'.format(bucket_name, key)

# Estimator for the 'xgboost' image in the current session's region, running
# on a single ml.m4.xlarge instance under the notebook's execution role.
# NOTE(review): get_image_uri and the train_* argument names look like the v1
# SageMaker Python SDK API -- confirm against the installed SDK version.
xgbmodel = sagemaker.estimator.Estimator(
    get_image_uri(boto3.Session().region_name, 'xgboost'),
    get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    train_volume_size=5,
    output_path=s3_output_location,
    # Fixed: was `sagemaker_Session()`, an undefined name that raised NameError
    sagemaker_session=sagemaker.Session()
)


# Hyperparameters for the training job, collected in one place.
# num_class matches the three species labels (0, 1, 2) encoded earlier.
xgb_hyperparameters = {
    'max_depth': 5,
    'gamma': 4,
    'min_child_weight': 6,
    'silent': 0,
    'objective': 'multi:softmax',
    'num_class': 3,
    'num_round': 8,
}
xgbmodel.set_hyperparameters(**xgb_hyperparameters)



## Train Model

In [79]:
# S3 prefixes holding the uploaded training and validation CSVs.
# NOTE(review): this rebinds train_data / val_data from the DataFrames created
# by the split to plain strings, so the upload cell cannot be re-run after this
# one without re-running the split first.
s3_uri_template = 's3://{}/{}'
train_data = s3_uri_template.format(bucket_name, 'data/train')
val_data = s3_uri_template.format(bucket_name, 'data/val')

# Wrap each prefix as a CSV input channel
csv_content_type = 'text/csv'
train_channel = sagemaker.session.s3_input(train_data, content_type=csv_content_type)
val_channel = sagemaker.session.s3_input(val_data, content_type=csv_content_type)

data_channels = dict(train=train_channel, validation=val_channel)

# Start the training job with both channels
xgbmodel.fit(inputs=data_channels)

## Deploy Model

In [None]:
# Deploy the fitted estimator on a single ml.m4.xlarge instance and keep the returned predictor.
xgb_predictor= xgbmodel.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')