# AWS Assignment 2 | Nishthavan Dahiya

## Read Data from S3 Bucket

In [None]:
import boto3
import pandas as pd
import numpy as np  
import os
from time import gmtime, strftime

# Shared S3 resource handle; later cells reuse it for uploads and downloads.
S3 = boto3.resource("s3")

# Bucket that holds the three partial Titanic CSV files.
bucket = "awsbucketassignment2"

# Object handles for titanic1.csv, titanic2.csv and titanic3.csv.
all_data = [S3.Object(bucket, "titanic{}.csv".format(part)) for part in range(1, 4)]

# Download each object in order and parse its streaming body as CSV.
data1, data2, data3 = (pd.read_csv(s3_object.get()["Body"]) for s3_object in all_data)

In [None]:
# Preview the first rows of the second source file (titanic2.csv).
data2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28.0,0,0,345770,9.5,,S
1,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
2,203,0,3,"Johanson, Mr. Jakob Alfred",male,34.0,0,0,3101264,6.4958,,S
3,204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
4,205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18.0,0,0,A/5 3540,8.05,,S


## Combine All the Data

In [None]:
# Stack the three partial files into one frame. ignore_index=True renumbers the
# rows 0..n-1; without it each part keeps its own 0-based labels, so the
# combined frame would carry every index label three times.
Data = pd.concat([data1, data2, data3], ignore_index=True)
Data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S
1,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
2,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
3,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
4,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.925,,S


In [None]:
# (rows, columns) of the combined frame.
Data.shape

(300, 12)

In [None]:
# Column dtypes of the combined frame.
Data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
# Summary statistics of the combined frame.
Data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,300.0,300.0,300.0,244.0,300.0,300.0,300.0
mean,250.5,0.386667,2.303333,29.071393,0.553333,0.363333,34.483221
std,86.746758,0.4878,0.832847,13.886814,1.237846,0.711578,52.846651
min,101.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,175.75,0.0,2.0,21.0,0.0,0.0,7.8958
50%,250.5,0.0,3.0,28.0,0.0,0.0,14.4542
75%,325.25,1.0,3.0,37.0,1.0,0.0,31.0
max,400.0,1.0,3.0,70.5,8.0,4.0,512.3292


In [None]:
# Write the concatenated frame to a local CSV file (without the index column).
# This cell only creates the local file.
Data.to_csv("Data.csv", index=False)

In [None]:
# Upload the local Data.csv to the bucket under the key ConcatenatedTitanic.csv.
S3.Bucket(bucket).upload_file(Filename="Data.csv", Key="ConcatenatedTitanic.csv")

## SageMaker for Machine Learning

In [None]:
import sagemaker
from sagemaker.tuner import ( IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner,)
# One boto3 session supplies both the region name and the SageMaker API client.
boto_session = boto3.Session()
region = boto_session.region_name
smclient = boto_session.client("sagemaker")

# Execution role of this notebook; handed to the estimator further down.
role = sagemaker.get_execution_role()

# Key prefix inside the bucket for the training inputs and the model output.
prefix = "sagemaker"

## Read Dataset again from S3 Bucket

In [None]:
# Fetch the combined CSV back from the bucket and parse its streaming body.
concatenated_object = S3.Object(bucket, "ConcatenatedTitanic.csv")
concatenated_body = concatenated_object.get()["Body"]
Imported_Data = pd.read_csv(concatenated_body, index_col=False)
Imported_Data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S
1,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
2,103,0,1,"White, Mr. Richard Frasar",male,21.0,0,1,35281,77.2875,D26,S
3,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
4,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37.0,2,0,3101276,7.925,,S


## Data Cleaning

In [None]:
# Drop columns that are not used as features: the free-text fields and the
# PassengerId row identifier.
cols = ['Name', 'Ticket', 'Cabin', 'PassengerId']
Imported_Data = Imported_Data.drop(cols, axis=1)

# Fill missing ages by linear interpolation between neighbouring rows.
Imported_Data['Age'] = Imported_Data['Age'].interpolate()

# The splits are later written as headerless CSV for the SageMaker XGBoost
# container, which expects numeric features only. One-hot encode the string
# columns; dtype=int keeps the indicators as 0/1 rather than True/False text.
Imported_Data = pd.get_dummies(Imported_Data, columns=['Sex', 'Embarked'], dtype=int)

# The same CSV convention takes the first column as the label, so move the
# target column to the front.
feature_cols = [col for col in Imported_Data.columns if col != 'Survived']
Imported_Data = Imported_Data[['Survived'] + feature_cols]

Imported_Data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,101,0,3,female,28.0,0,0,7.8958,S
1,102,0,3,male,24.5,0,0,7.8958,S
2,103,0,1,male,21.0,0,1,77.2875,S
3,104,0,3,male,33.0,0,0,8.6542,S
4,105,0,3,male,37.0,2,0,7.925,S


## Split Dataset into Train, Validation & Test

In [None]:
# Shuffle once with a fixed seed so the split is reproducible.
shuffled_data = Imported_Data.sample(frac=1, random_state=777)

# 70% train / 20% validation / 10% test, cut at the same row positions as before.
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:validation_end]
test_data = shuffled_data.iloc[validation_end:]

## Upload Split datasets to S3 Bucket

In [None]:
# Write each split to a local CSV with neither index nor header row.
for split_frame, csv_name in (
    (train_data, "train.csv"),
    (validation_data, "validation.csv"),
    (test_data, "test.csv"),
):
    split_frame.to_csv(csv_name, index=False, header=False)

In [None]:
# Upload the train and validation files to <prefix>/<split>/<split>.csv.
# S3 keys always use "/" as the separator, so they are built with string
# formatting rather than os.path.join, which follows the local OS convention.
for split_name in ("train", "validation"):
    local_file = "{}.csv".format(split_name)
    s3_key = "{}/{}/{}".format(prefix, split_name, local_file)
    S3.Bucket(bucket).Object(s3_key).upload_file(local_file)

## Creating Input for SageMaker

In [None]:
from sagemaker.inputs import TrainingInput
# Training channel: every object under s3://<bucket>/<prefix>/train, read as CSV.
train_s3_uri = f"s3://{bucket}/{prefix}/train"
s3_input_train = TrainingInput(s3_data=train_s3_uri, content_type="csv")

# Validation channel: every object under s3://<bucket>/<prefix>/validation, read as CSV.
validation_s3_uri = f"s3://{bucket}/{prefix}/validation"
s3_input_validation = TrainingInput(s3_data=validation_s3_uri, content_type="csv")

## Hyperparameter Tuning 

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.image_uris import retrieve

sess = sagemaker.Session()

# Image URI of the built-in XGBoost algorithm for the current region.
# NOTE(review): "latest" does not pin an algorithm version; consider an
# explicit version string so reruns use the same container.
container = retrieve(framework="xgboost", region=region, version="latest")

# Estimator shared by both tuning runs below; model artifacts are written
# under s3://<bucket>/<prefix>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    base_job_name="xgboost-random-search",
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=sess,
)

# Hyperparameters held fixed across every training job of the search.
# NOTE(review): rate_drop and tweedie_variance_power look unrelated to a
# binary:logistic objective; confirm they are intended.
static_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": 10,
    "rate_drop": 0.3,
    "tweedie_variance_power": 1.4,
}
xgb.set_hyperparameters(**static_hyperparameters)

# Metric the tuner uses to compare training jobs.
objective_metric_name = "validation:auc"

###  Logarithmic Scaling Random Search

In [None]:
# Search both regularisation terms over [0.01, 10] on a logarithmic scale.
hyperparameter_ranges = {
    regularizer: ContinuousParameter(0.01, 10, scaling_type="Logarithmic")
    for regularizer in ("alpha", "lambda")
}

In [None]:
# Random search over the log-scaled ranges: five training jobs, all in parallel.
tuner_log = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)

# A UTC timestamp suffix keeps the tuning job name distinct between runs.
log_search_job_name = "xgb-randsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
log_search_channels = {"train": s3_input_train, "validation": s3_input_validation}

tuner_log.fit(
    inputs=log_search_channels,
    job_name=log_search_job_name,
    include_cls_metadata=False,
)

......................................................!


In [None]:
# Look up the log-scaled tuning job and show its overall status.
# NOTE(review): this is the status of the tuning job as a whole; check
# TrainingJobStatusCounters in the same response to confirm that the
# individual training jobs succeeded.
log_tuning_job_name = tuner_log.latest_tuning_job.job_name
log_tuning_description = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=log_tuning_job_name
)
log_tuning_description["HyperParameterTuningJobStatus"]

'Completed'

### Linear Scaling Random Search

In [None]:
# Same [0.01, 10] bounds as the logarithmic search, sampled on a linear scale.
hyperparameter_ranges_linear = {
    regularizer: ContinuousParameter(0.01, 10, scaling_type="Linear")
    for regularizer in ("alpha", "lambda")
}

In [None]:
# Random search over the linearly scaled ranges: five training jobs, all in parallel.
tuner_linear = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges_linear,
    strategy="Random",
    max_jobs=5,
    max_parallel_jobs=5,
)

# A UTC timestamp suffix keeps the tuning job name distinct between runs.
linear_search_job_name = "xgb-linsearch-" + strftime("%Y%m%d-%H-%M-%S", gmtime())
linear_search_channels = {"train": s3_input_train, "validation": s3_input_validation}

tuner_linear.fit(
    inputs=linear_search_channels,
    job_name=linear_search_job_name,
    include_cls_metadata=False,
)

...............................................................!


In [None]:
# Look up the linearly scaled tuning job and show its overall status.
# NOTE(review): this is the status of the tuning job as a whole; check
# TrainingJobStatusCounters in the same response to confirm that the
# individual training jobs succeeded.
linear_tuning_job_name = tuner_linear.latest_tuning_job.job_name
linear_tuning_description = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=linear_tuning_job_name
)
linear_tuning_description["HyperParameterTuningJobStatus"]

'Completed'