In [3]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
import sagemaker

# Load the dataset
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'target'
]

data = pd.read_csv('adult.data', names=column_names, sep=',\s', engine='python')
data.head()

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\shubh\AppData\Local\sagemaker\sagemaker\config.yaml


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [5]:
data.isnull().sum()


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
target            0
dtype: int64

In [6]:
data = data.drop(['fnlwgt'], axis=1)

In [7]:
data['target'] = data['target'].apply(lambda x: 1 if x == '>50K' else 0)

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le


In [9]:
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,7,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,9,13,2,10,5,2,0,0,0,40,5,0


In [10]:
from sklearn.preprocessing import StandardScaler

continuous_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
scaler = StandardScaler()

data[continuous_columns] = scaler.fit_transform(data[continuous_columns])

In [11]:
data

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,0.030671,7,9,1.134739,4,1,1,4,1,0.148453,-0.21666,-0.035429,39,0
1,0.837109,6,9,1.134739,2,4,0,4,1,-0.145920,-0.21666,-2.222153,39,0
2,-0.042642,4,11,-0.420060,0,6,1,4,1,-0.145920,-0.21666,-0.035429,39,0
3,1.057047,4,1,-1.197459,2,6,0,2,1,-0.145920,-0.21666,-0.035429,39,0
4,-0.775768,4,9,1.134739,2,10,5,2,0,-0.145920,-0.21666,-0.035429,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.849080,4,7,0.746039,2,13,5,4,0,-0.145920,-0.21666,-0.197409,39,0
32557,0.103983,4,11,-0.420060,2,7,0,4,1,-0.145920,-0.21666,-0.035429,39,1
32558,1.423610,4,11,-0.420060,6,1,4,4,0,-0.145920,-0.21666,-0.035429,39,0
32559,-1.215643,4,11,-0.420060,4,1,3,4,1,-0.145920,-0.21666,-1.655225,39,0


In [12]:
X = data.drop('target', axis=1)
y = data['target']

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
# Save your data
train_file = 'train_data.csv'
test_file = 'test_data.csv'
pd.concat([X_train, y_train], axis=1).to_csv(train_file, index=False)
pd.concat([X_test, y_test], axis=1).to_csv(test_file, index=False)


## Upload Data to S3

In [21]:
import boto3
import sagemaker

# Create a SageMaker session
session = sagemaker.Session()
# Get the default S3 bucket
bucket = session.default_bucket()

# Upload files to S3
s3_client = boto3.client('s3')
s3_client.upload_file(train_file, bucket, 'data/train_data.csv')
s3_client.upload_file(test_file, bucket, 'data/test_data.csv')

# Paths to the uploaded files
train_path = f's3://{bucket}/data/train_data.csv'
test_path = f's3://{bucket}/data/test_data.csv'

print(f'Train data uploaded to: {train_path}')
print(f'Test data uploaded to: {test_path}')


Train data uploaded to: s3://sagemaker-ap-south-1-654654375132/data/train_data.csv
Test data uploaded to: s3://sagemaker-ap-south-1-654654375132/data/test_data.csv


In [22]:
s3_client.upload_file('train.py', bucket, 'scripts/train.py')
script_path = f's3://{bucket}/scripts/train.py'
print(f'Script uploaded to: {script_path}')


Script uploaded to: s3://sagemaker-ap-south-1-654654375132/scripts/train.py


## Train the Model using SageMaker

In [27]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn

# Initialize SageMaker session
session = sagemaker.Session()

# Define SageMaker role ARN
role = "arn:aws:iam::654654375132:role/SageMakerExecutionRole"  

# Define the estimator
sklearn_estimator = SKLearn(
    entry_point='train.py',  # Your training script
    role=role,
    instance_type='ml.m5.large',  # Free tier eligible instance type
    framework_version='0.23-1',
    py_version='py3',
    output_path=f's3://{session.default_bucket()}/output',
    code_location=f's3://{session.default_bucket()}/scripts'
)

# Train the model
sklearn_estimator.fit({'train': train_path})


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-06-18-13-11-47-996


2024-06-18 13:11:50 Starting - Starting the training job...
2024-06-18 13:12:05 Starting - Preparing the instances for training...
2024-06-18 13:12:56 Downloading - Downloading the training image.....[34m2024-06-18 13:13:40,847 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-06-18 13:13:40,850 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-18 13:13:40,895 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-06-18 13:13:41,042 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-18 13:13:41,055 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-18 13:13:41,067 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-06-18 13:13:41,076 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env

## Deploy the Model

In [28]:
# Deploy the model
predictor = sklearn_estimator.deploy(initial_instance_count=1, instance_type='ml.m5.large')


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2024-06-18-13-14-23-091
INFO:sagemaker:Creating endpoint-config with name sagemaker-scikit-learn-2024-06-18-13-14-23-091
INFO:sagemaker:Creating endpoint with name sagemaker-scikit-learn-2024-06-18-13-14-23-091


-----!

## Evaluate the Model

In [29]:
import numpy as np

# Test the deployed model
test_data = pd.concat([X_test, y_test], axis=1).sample(10)
test_features = test_data.drop('target', axis=1)
test_labels = test_data['target']

predictions = predictor.predict(test_features.values)
print(f'Predictions: {predictions}')
print(f'Actual labels: {test_labels.values}')


Predictions: [0 0 0 0 0 0 0 0 0 0]
Actual labels: [0 0 0 0 1 0 0 0 0 0]


## Clean Up Resources

In [30]:
# Delete the endpoint
predictor.delete_endpoint()


INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-scikit-learn-2024-06-18-13-14-23-091
INFO:sagemaker:Deleting endpoint with name: sagemaker-scikit-learn-2024-06-18-13-14-23-091


## Conclusion:
 - The project successfully demonstrates a complete workflow for training, deploying, and evaluating a machine learning model using AWS SageMaker.
 - We preprocessed the data, addressed class imbalance, visualized the data, and performed feature scaling and encoding.
 - The processed data was uploaded to S3, and a training script was used to train the model on SageMaker.
 - The trained model was deployed, and predictions were made on the test set to evaluate model performance.
 - Resources were cleaned up to avoid unnecessary costs.