## Linear Regresson of Standardization

In [1]:
import pandas as pd
import numpy as np
import warnings
import csv
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("s3://datasetairbnb/airbnb_cleaned.csv")

In [3]:
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [4]:
def standardize_col(col):
    mean = np.mean(col)
    std = np.std(col)
    return col.apply(lambda x: (x - mean) / std)

non_cat_vars = ['accommodates', 'bedrooms', 'beds', 'number_of_reviews', 'availability_30', 'minimum_nights', 'bathrooms']
for col in non_cat_vars:
    df[col] = df[col].astype(float)
    df[col] = standardize_col(df[col])

df.head()

Unnamed: 0.1,Unnamed: 0,id,host_id,zipcode,neighbourhood_cleansed,property_type,room_type,accommodates,bedrooms,beds,bed_type,price,number_of_reviews,review_scores_rating,availability_30,minimum_nights,bathrooms,host_is_superhost,host_response_rate
0,0,16457286,11796099,11221,Bedford-Stuyvesant,Apartment,1.01,-0.448325,-0.22605,-0.524398,1.0,38.0,-0.480161,100.0,-0.71572,0.050488,-0.315183,0.0,0.0
1,1,1356652,4508795,11233,Bedford-Stuyvesant,Apartment,1.01,-0.448325,-0.22605,-0.524398,1.0,90.0,-0.480161,90.0,2.21043,-0.023537,-0.315183,0.0,0.0
2,2,775016,4088378,11221,Bedford-Stuyvesant,Apartment,1.01,-0.448325,-0.22605,-0.524398,1.01,90.0,-0.506798,100.0,2.21043,-0.023537,-0.315183,0.0,0.0
3,3,773497,4081688,11206,Bedford-Stuyvesant,Apartment,1.02,-0.984809,-0.22605,-0.524398,1.0,200.0,-0.533434,,2.21043,-0.029705,-0.315183,0.0,0.0
4,4,8468835,6518093,11233,Bedford-Stuyvesant,Apartment,1.01,-0.984809,-0.22605,-0.524398,1.0,50.0,-0.480161,60.0,-0.71572,-0.029705,-0.315183,0.0,0.0


In [5]:
def findAveragePriceStandardize(location):
    return (df[(df.neighbourhood_cleansed == location)])

trainingDataSets = []

# get neighborhoods and create standarized datasets from them
neighborhoods = np.unique(df.neighbourhood_cleansed)
for neighborhood in neighborhoods:
    areaStandardize = findAveragePriceStandardize(neighborhood)
    areaStandardize.drop(['neighbourhood_cleansed'], axis=1)
    trainingDataSets.append(areaStandardize)

modelDataSets = []
targetDataSets = []

#get standardized neighborhoods and create model/target data from them
for dataSet in trainingDataSets:
    modelData  = np.array(dataSet.iloc[:, [7,8,9,16]]).astype('float32')
    target = np.array(dataSet.iloc[:, 11]).astype('float32')
    modelDataSets.append(modelData)
    targetDataSets.append(target)

# areaStandardize = findAveragePriceStandardize('Allerton')
# areaStandardize.drop(['neighbourhood_cleansed'], axis=1)
# areaStandardize.info()


In [6]:
import boto3
import sagemaker
import io
import os
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
import json

# Create new sagemaker session
sess = sagemaker.Session()

# S3 bucket to export results to
bucket = "airbnb-estimator-sagemaker"
train_data_prefix = "dataset"
model_prefix = "models"

role = get_execution_role()
output_location = 's3://{}/{}/LinearRegression'.format(bucket, model_prefix)

# Use all regions for ML model
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest',
              'ap-northeast-1': '351501993468.dkr.ecr.ap-northeast-1.amazonaws.com/linear-learner:latest'}

trainingData = []
for neighborhood in neighborhoods:
    areaStandardize = findAveragePriceStandardize(neighborhood)
    areaStandardize.drop(['neighbourhood_cleansed'], axis=1)

    modelData = np.array(areaStandardize.iloc[:, [7,8,9,16]]).astype('float32')
    target = np.array(areaStandardize.iloc[:, 11]).astype('float32')
        
    # Use the IO buffer as dataset is small
    buf = io.BytesIO()

    smac.write_numpy_to_dense_tensor(buf,modelData,target)
    buf.seek(0)

    # importing create data set and upload to s3
    key = 'linear-regression-{}'.format(neighborhood).replace(' ','-').replace(',','').lower()
    boto3.resource('s3').Bucket(bucket).Object(os.path.join(train_data_prefix, 'train', key)).upload_fileobj(buf)
    trainingData.append('s3://{}/{}/train/{}'.format(bucket, train_data_prefix, key))


In [7]:
trainingData[227]

's3://airbnb-estimator-sagemaker/dataset/train/linear-regression-woodside'

In [8]:
    job = trainingData[227].replace('s3://airbnb-estimator-sagemaker/dataset/train/linear-regression-','')
#    create model and train it
    linear = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                        role, 
                                        train_instance_count=1, 
                                        train_instance_type='ml.m4.4xlarge',
                                        output_path=output_location,
                                        sagemaker_session=sess,
                                        base_job_name=job)

    linear.set_hyperparameters(feature_dim=4,
                               predictor_type='regressor',
                               normalize_data=False,
                               mini_batch_size = 2)
    
    linear.fit({'train': trainingData[227]})
# for dataset in trainingData[112:127]:
#     job = dataset.replace('s3://airbnb-estimator-sagemaker/dataset/train/linear-regression-','')
    # create model and train it
#     linear = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
#                                         role, 
#                                         train_instance_count=1, 
#                                         train_instance_type='ml.c4.xlarge',
#                                         output_path=output_location,
#                                         sagemaker_session=sess,
#                                         base_job_name=job.replace("'",""))

#     linear.set_hyperparameters(feature_dim=4,
#                                predictor_type='regressor',
#                                normalize_data=False,
#                                mini_batch_size = 2)
#     linear.fit({'train': dataset})

2019-05-09 22:42:02 Starting - Starting the training job...
2019-05-09 22:42:03 Starting - Launching requested ML instances......
2019-05-09 22:43:12 Starting - Preparing the instances for training......
2019-05-09 22:44:33 Downloading - Downloading input data...
2019-05-09 22:44:58 Training - Training image download completed. Training in progress..
[31mDocker entrypoint called with argument(s): train[0m
[31m[05/09/2019 22:45:01 INFO 139711293511488] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_minimum_l

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6147074218986424, "sum": 0.6147074218986424, "min": 0.6147074218986424}}, "EndTime": 1557441937.455391, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1557441937.455328}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6254749025309887, "sum": 0.6254749025309887, "min": 0.6254749025309887}}, "EndTime": 1557441937.455471, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1557441937.455458}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6142644364394246, "sum": 0.6142644364394246, "min": 0.6142644364394246}}, "EndTime": 1557441937.455509, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1557441937.455499}
[0m
[31m#metrics {"Met

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6058637469105961, "sum": 0.6058637469105961, "min": 0.6058637469105961}}, "EndTime": 1557441954.760187, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 2}, "StartTime": 1557441954.760123}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.621761801947554, "sum": 0.621761801947554, "min": 0.621761801947554}}, "EndTime": 1557441954.760266, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 2}, "StartTime": 1557441954.760253}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6055039295568939, "sum": 0.6055039295568939, "min": 0.6055039295568939}}, "EndTime": 1557441954.760304, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 2}, "StartTime": 1557441954.760295}
[0m
[31m#metrics {"Metric

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6030054429524278, "sum": 0.6030054429524278, "min": 0.6030054429524278}}, "EndTime": 1557441972.442849, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 3}, "StartTime": 1557441972.442786}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.620122680157513, "sum": 0.620122680157513, "min": 0.620122680157513}}, "EndTime": 1557441972.442929, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 3}, "StartTime": 1557441972.442915}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6026999012801753, "sum": 0.6026999012801753, "min": 0.6026999012801753}}, "EndTime": 1557441972.442968, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 3}, "StartTime": 1557441972.442959}
[0m
[31m#metrics {"Metric

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6019702827588577, "sum": 0.6019702827588577, "min": 0.6019702827588577}}, "EndTime": 1557441989.261986, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 4}, "StartTime": 1557441989.261925}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6187358081251392, "sum": 0.6187358081251392, "min": 0.6187358081251392}}, "EndTime": 1557441989.262062, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 4}, "StartTime": 1557441989.26205}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6020839663153364, "sum": 0.6020839663153364, "min": 0.6020839663153364}}, "EndTime": 1557441989.262108, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 4}, "StartTime": 1557441989.262093}
[0m
[31m#metrics {"Metr

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6016152211738753, "sum": 0.6016152211738753, "min": 0.6016152211738753}}, "EndTime": 1557442006.459516, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 5}, "StartTime": 1557442006.459454}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.617303352776028, "sum": 0.617303352776028, "min": 0.617303352776028}}, "EndTime": 1557442006.459595, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 5}, "StartTime": 1557442006.459582}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6018297222896415, "sum": 0.6018297222896415, "min": 0.6018297222896415}}, "EndTime": 1557442006.459634, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 5}, "StartTime": 1557442006.459625}
[0m
[31m#metrics {"Metric

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6015375613483764, "sum": 0.6015375613483764, "min": 0.6015375613483764}}, "EndTime": 1557442024.329087, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 6}, "StartTime": 1557442024.329025}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6158155690329616, "sum": 0.6158155690329616, "min": 0.6158155690329616}}, "EndTime": 1557442024.329173, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 6}, "StartTime": 1557442024.329154}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.601587252201545, "sum": 0.601587252201545, "min": 0.601587252201545}}, "EndTime": 1557442024.329236, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 6}, "StartTime": 1557442024.329219}
[0m
[31m#metrics {"Metric

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6015099336309772, "sum": 0.6015099336309772, "min": 0.6015099336309772}}, "EndTime": 1557442041.866991, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 7}, "StartTime": 1557442041.866928}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6142753349482947, "sum": 0.6142753349482947, "min": 0.6142753349482947}}, "EndTime": 1557442041.86707, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 7}, "StartTime": 1557442041.867056}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6013555858909486, "sum": 0.6013555858909486, "min": 0.6013555858909486}}, "EndTime": 1557442041.867108, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 7}, "StartTime": 1557442041.867099}
[0m
[31m#metrics {"Metr

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6014824078434311, "sum": 0.6014824078434311, "min": 0.6014824078434311}}, "EndTime": 1557442059.260508, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 8}, "StartTime": 1557442059.260446}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6127146354671876, "sum": 0.6127146354671876, "min": 0.6127146354671876}}, "EndTime": 1557442059.260585, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 8}, "StartTime": 1557442059.260572}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6011343085761462, "sum": 0.6011343085761462, "min": 0.6011343085761462}}, "EndTime": 1557442059.260639, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 8}, "StartTime": 1557442059.260623}
[0m
[31m#metrics {"Met

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6014550159975443, "sum": 0.6014550159975443, "min": 0.6014550159975443}}, "EndTime": 1557442076.124789, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 9}, "StartTime": 1557442076.124725}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6112072816775467, "sum": 0.6112072816775467, "min": 0.6112072816775467}}, "EndTime": 1557442076.124881, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 9}, "StartTime": 1557442076.124862}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.600922999947737, "sum": 0.600922999947737, "min": 0.600922999947737}}, "EndTime": 1557442076.124942, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 9}, "StartTime": 1557442076.124926}
[0m
[31m#metrics {"Metric

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6014277547815254, "sum": 0.6014277547815254, "min": 0.6014277547815254}}, "EndTime": 1557442095.081124, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 10}, "StartTime": 1557442095.081061}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6097772612281861, "sum": 0.6097772612281861, "min": 0.6097772612281861}}, "EndTime": 1557442095.081206, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 10}, "StartTime": 1557442095.081192}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6007213131212853, "sum": 0.6007213131212853, "min": 0.6007213131212853}}, "EndTime": 1557442095.081243, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 10}, "StartTime": 1557442095.081234}
[0m
[31m#metrics {"

[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6014006121462818, "sum": 0.6014006121462818, "min": 0.6014006121462818}}, "EndTime": 1557442112.707576, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 11}, "StartTime": 1557442112.707513}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.608413703659554, "sum": 0.608413703659554, "min": 0.608413703659554}}, "EndTime": 1557442112.707655, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 11}, "StartTime": 1557442112.707641}
[0m
[31m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.6005287284103948, "sum": 0.6005287284103948, "min": 0.6005287284103948}}, "EndTime": 1557442112.707693, "Dimensions": {"model": 2, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 11}, "StartTime": 1557442112.707684}
[0m
[31m#metrics {"Met


2019-05-09 22:48:42 Uploading - Uploading generated training model
2019-05-09 22:48:42 Completed - Training job completed
Billable seconds: 250


In [9]:
# modelData = np.array(areaStandardize.iloc[:, [7,8,9,16]]).astype('float32')
# target = np.array(areaStandardize.iloc[:, 11]).astype('float32')

In [10]:
import boto3
import sagemaker
import io
import os
import sagemaker.amazon.common as smac

# # Create new sagemaker session
# sess = sagemaker.Session()

# # S3 bucket to export results to
# bucket = "airbnb-estimator-sagemaker"
# train_data_prefix = "dataset"
# model_prefix = "models"
    
# # Use the IO buffer as dataset is small
# buf = io.BytesIO()

# smac.write_numpy_to_dense_tensor(buf,modelData,target)
# buf.seek(0)

# # importing linearyLerner
# key = 'linear-regression-harlem'
# boto3.resource('s3').Bucket(bucket).Object(os.path.join(train_data_prefix, 'train', key)).upload_fileobj(buf)
# s3_train_data = 's3://{}/{}/train/{}'.format(bucket, train_data_prefix, key)

# print('uploaded training data location: {}'.format(s3_train_data))
# output_location = 's3://{}/{}/LinearRegression'.format(bucket, model_prefix)

# print('training artifacts will be uploaded to: {}'.format(output_location))
# # Use all regions for ML model
# containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
#               'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
#               'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
#               'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest',
#               'ap-northeast-1': '351501993468.dkr.ecr.ap-northeast-1.amazonaws.com/linear-learner:latest'}

In [11]:
from sagemaker import get_execution_role
import json


# for dataset in trainingData:
location = trainingData[0].replace('s3://airbnb-estimator-sagemaker/dataset/train/linear-regression-','').replace(',','').replace("'",'')

print(location)

response = client.create_model(
    ModelName='string',

)

# s3_train_data = 's3://airbnb-estimator-sagemaker/dataset/train/linear-regression-allerton'
# role = get_execution_role()
# linear = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
#                                     role, 
#                                     train_instance_count=1, 
#                                     train_instance_type='ml.m4.xlarge',
#                                     output_path=output_location,
#                                     sagemaker_session=sess,
#                                     base_job_name='allerton')

# linear.set_hyperparameters(feature_dim=4,
#                            predictor_type='regressor',
#                            normalize_data=False,
#                            mini_batch_size = 5)
# linear.fit({'train': s3_train_data})

# linear_predictor = linear.deploy(initial_instance_count=1,
#                                  instance_type='ml.m4.4xlarge',
#                                  endpoint_name='linear-regression',
#                                  update_endpoint=True
#                                  )


allerton


NameError: name 'client' is not defined

In [None]:
# sagemaker.Session().delete_endpoint(linear_predictor.endpoint)