In [1]:
import pandas as pd
import boto3
import sagemaker
from tqdm import tqdm
import os

In [2]:
# Shared SageMaker handles used by every later cell.
sagemaker_session = sagemaker.Session()        # session wrapper for SageMaker / S3 calls
role = sagemaker.get_execution_role()          # execution role passed to the training job

# Default S3 bucket associated with this session.
bucket = sagemaker_session.default_bucket()

In [3]:
%%time
#upload data to s3
data_dir = 'data'
prefix = 'sagemaker/Arvato'
train_location = sagemaker_session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)
val_location = sagemaker_session.upload_data(os.path.join(data_dir, 'val.csv'), key_prefix=prefix)
test_location = sagemaker_session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)

CPU times: user 9.34 s, sys: 2.71 s, total: 12.1 s
Wall time: 18.9 s


In [4]:
# Baseline model: resolve the XGBoost (version 0.90-1) training image
# for the region this session is bound to.
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(
    sagemaker_session.boto_region_name,
    'xgboost',
    repo_version='0.90-1',
)

In [5]:
# Estimator for the XGBoost container: a single ml.m4.xlarge training instance,
# with model artifacts written under <bucket>/<prefix>/output.
# Reuses `bucket` from the session-setup cell instead of calling
# default_bucket() a second time.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session,
)

In [6]:
# Hyperparameters for the baseline binary classifier.
# NOTE(review): no eval_metric is set, so early stopping presumably falls back
# to the objective's default validation metric -- confirm this is the metric
# you want to optimise.
xgb.set_hyperparameters(
    # tree shape / regularisation
    max_depth=10,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    # logging
    silent=0,
    # learning task
    objective='binary:logistic',
    # boosting schedule: at most 500 rounds, with early stopping set to 10 rounds
    early_stopping_rounds=10,
    num_round=500,
)

In [7]:
# Wrap the uploaded train / validation locations as CSV input channels.
s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

In [8]:
%%time
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})


2020-01-17 09:10:18 Starting - Starting the training job...
2020-01-17 09:10:19 Starting - Launching requested ML instances.........
2020-01-17 09:11:51 Starting - Preparing the instances for training......
2020-01-17 09:12:52 Downloading - Downloading input data...
2020-01-17 09:13:45 Training - Training image download completed. Training in progress...[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[09:13:49] 199421x135 matrix with 26921835 entri

In [9]:
# Batch-transform handle for the trained model, on one ml.m4.xlarge instance.
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [10]:
# Start the batch transform over the uploaded test CSV, splitting the input by line.
xgb_transformer.transform(
    data=test_location,
    content_type='text/csv',
    split_type='Line',
)

In [11]:
%%time
# Wait on the batch transform job started in the previous cell; its container
# logs are streamed into the cell output while waiting.
xgb_transformer.wait()

.......................[34m[2020-01-17 09:19:59 +0000] [17] [INFO] Starting gunicorn 19.10.0[0m
[34m[2020-01-17 09:19:59 +0000] [17] [INFO] Listening at: unix:/tmp/gunicorn.sock (17)[0m
[34m[2020-01-17 09:19:59 +0000] [17] [INFO] Using worker: gevent[0m
[34m[2020-01-17 09:19:59 +0000] [24] [INFO] Booting worker with pid: 24[0m
[34m[2020-01-17 09:19:59 +0000] [25] [INFO] Booting worker with pid: 25[0m
[34m[2020-01-17 09:20:00 +0000] [29] [INFO] Booting worker with pid: 29[0m
[34m[2020-01-17 09:20:00 +0000] [33] [INFO] Booting worker with pid: 33[0m

[32m2020-01-17T09:20:22.591:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34m[2020-01-17:09:20:22:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m169.254.255.130 - - [17/Jan/2020:09:20:22 +0000] "GET /ping HTTP/1.1" 200 0 "-" "Go-http-client/1.1"[0m
[35m[2020-01-17:09:20:22:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m169.254.255.130 - - [17/Jan

In [12]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-1-579131320339/sagemaker-xgboost-2020-01-17-09-16-31-641/test.csv.out to data/test.csv.out


In [13]:
!cat data/test.csv.out

0.8839449882507324
0.8571972846984863
0.7826682925224304
0.6887217164039612
0.8304740786552429
0.7787885665893555
0.3492816984653473
0.7705637216567993
0.6286389827728271
0.8853324055671692
0.9100673794746399
0.6122822761535645
0.5372019410133362
0.5488185286521912
0.5723190307617188
0.8680834174156189
0.690088152885437
0.4425048530101776
0.7854576706886292
0.8893135190010071
0.7662985920906067
0.6975858807563782
0.8257752060890198
0.7803918719291687
0.6544868350028992
0.7376005053520203
0.9215046763420105
0.6944776177406311
0.8369120359420776
0.8246368169784546
0.6261188387870789
0.8643384575843811
0.8358799815177917
0.7540819048881531
0.8910057544708252
0.7437300086021423
0.8358799815177917
0.44119349122047424
0.6532459259033203
0.8218470811843872
0.6452484726905823
0.6838634610176086
0.33271482586860657
0.6461027264595032
0.8215827345848083
0.8246368169784546
0.7967041730880737
0.8188447952270508
0.7397969961166382
0.5918987989425659

0.8632981777191162
0.7969085574150085
0.8941235542297363
0.8945479393005371
0.8557065725326538
0.7763665318489075
0.8068529367446899
0.7096781134605408
0.8881106972694397
0.45913025736808777
0.8651129007339478
0.8430495858192444
0.8358799815177917
0.8246368169784546
0.5087435841560364
0.4358080327510834
0.18002860248088837
0.5715592503547668
0.6605013012886047
0.5665568113327026
0.5068841576576233
0.8767178058624268
0.7446107268333435
0.6453506350517273
0.893938422203064
0.7511528730392456
0.7793174386024475
0.92296302318573
0.8863723874092102
0.6862639784812927
0.724913477897644
0.8109892010688782
0.8591587543487549
0.8580269813537598
0.5970397591590881
0.6879734992980957
0.4341207444667816
0.3546949028968811
0.8051832318305969
0.59494948387146
0.8246368169784546
0.5504226088523865
0.6978693008422852
0.8467663526535034
0.6578982472419739
0.6188233494758606
0.489285945892334
0.7810839414596558
0.8358799815177917
0.8189659714698792
0.882

0.8502363562583923
0.7905787229537964
0.444598913192749
0.4529466927051544
0.7204795479774475
0.6826875805854797
0.8153250813484192
0.9107215404510498
0.7751302123069763
0.6257067918777466
0.7618759274482727
0.8358799815177917
0.5892690420150757
0.6829112768173218
0.8163551092147827
0.24696846306324005
0.8246368169784546
0.8192828893661499
0.8236216306686401
0.8592970967292786
0.548007071018219
0.5445706248283386
0.8358799815177917
0.6210464835166931
0.6157926321029663
0.7412107586860657
0.6425039172172546
0.6545246243476868
0.2482200413942337
0.8055098056793213
0.12278666347265244
0.8524825572967529
0.8760936260223389
0.7203258275985718
0.8504258990287781
0.5327013731002808
0.720745861530304
0.5645955204963684
0.7710255980491638
0.6298652291297913
0.7020440101623535
0.8203191161155701
0.26998287439346313
0.6356410384178162
0.7693831920623779
0.7023046612739563
0.8246368169784546
0.856309711933136
0.7781097292900085
0.42174723744392395


In [14]:
# Only the LNR identifier column is needed from the raw test file, so parse just
# that column instead of loading every column into memory.
index = pd.read_csv(
    '../data/Udacity_MAILOUT_052018_TEST.csv',
    sep=';',
    usecols=['LNR'],
)['LNR']

In [15]:
# Load the downloaded transform output (no header row) and name its single column.
pred = pd.read_csv('data/test.csv.out', header=None, index_col=None)
pred.columns = ['RESPONSE']

In [16]:
# Pair each LNR with its prediction by row position. concat(axis=1) aligns on the
# index labels and would silently pad with NaN if the two inputs had different
# lengths, so fail loudly on a row-count mismatch instead.
assert len(index) == len(pred), (
    f'row count mismatch: {len(index)} LNR ids vs {len(pred)} predictions'
)
xgb_out = pd.concat((index, pred), axis=1)

In [17]:
# Show the first five rows of the combined LNR / RESPONSE frame.
xgb_out.head(5)

Unnamed: 0,LNR,RESPONSE
0,1754,0.883945
1,1770,0.857197
2,1465,0.782668
3,1470,0.688722
4,1478,0.830474


In [18]:
# Write the LNR / RESPONSE table as CSV without the DataFrame index column.
xgb_out.to_csv('data/xgb_out.csv', index=False)

In [19]:
!cat data/xgb_out.csv

LNR,RESPONSE
1754,0.8839449882507324
1770,0.8571972846984863
1465,0.7826682925224304
1470,0.6887217164039612
1478,0.8304740786552429
1782,0.7787885665893555
1485,0.3492816984653473
1519,0.7705637216567993
1835,0.6286389827728271
1522,0.8853324055671692
1539,0.91006737947464
1853,0.6122822761535645
1856,0.5372019410133362
2502,0.5488185286521912
2182,0.5723190307617188
2191,0.8680834174156189
2522,0.690088152885437
2530,0.4425048530101776
2215,0.7854576706886292
2550,0.8893135190010071
2570,0.7662985920906067
2574,0.6975858807563782
2579,0.8257752060890198
2247,0.7803918719291687
2674,0.6544868350028992
11434,0.7376005053520203
11122,0.9215046763420104
11127,0.6944776177406311
11443,0.8369120359420776
11170,0.8246368169784546
11191,0.6261188387870789
11198,0.8643384575843811
10936,0.8358799815177917
11244,0.7540819048881531
11247,0.8910057544708252
11252,0.7437300086021423
11259,0.8358799815177917
10945,0.44119349122047424
10951,0.6532459259033203

39568,0.5279263257980347
39249,0.8800597190856934
39261,0.8391101956367493
39573,0.8246368169784546
39579,0.8504258990287781
39278,0.34442949295043945
39279,0.8844529390335083
39608,0.7276996970176697
39615,0.8029008507728577
39314,0.8657436370849609
39338,0.9254344701766968
40325,0.8132339119911194
40326,0.8246368169784546
40015,0.5781844854354858
40367,0.8246368169784546
40387,0.8358799815177917
40399,0.3185041546821594
40057,0.7806295156478882
50664,0.7442244291305542
46130,0.8358799815177917
46132,0.5756798982620239
47111,0.4503850638866425
47751,0.8504258990287781
47758,0.8246368169784546
27590,0.2027840465307236
27605,0.3495517373085022
27614,0.6341222524642944
27303,0.7361530065536499
27619,0.8643319606781006
36507,0.6992790699005127
37150,0.17817600071430206
37165,0.8358799815177917
37173,0.7929575443267822
37177,0.8246368169784546
36862,0.5955893397331238
36872,0.6291497945785522
37180,0.6292752623558044
37190,0.7462961673736572
36889,0.37

72719,0.7386547923088074
73053,0.8358799815177917
73687,0.4506916701793671
73693,0.8619717359542847
74012,0.7230147719383241
74645,0.6609644889831543
74976,0.8369731903076172
75285,0.668459951877594
75294,0.7672258615493774
75599,0.6193756461143494
75610,0.822586715221405
75922,0.8636888265609741
76248,0.816066324710846
76251,0.914072334766388
71457,0.8374039530754089
72110,0.8361800909042358
72430,0.5135745406150818
73060,0.7736221551895142
73388,0.8246368169784546
73702,0.7457329034805298
73715,0.8358799815177917
74018,0.6506491303443909
74024,0.5132553577423096
74340,0.8804089426994324
74344,0.8246368169784546
74356,0.2300894856452942
74669,0.6506001353263855
74986,0.7153124809265137
75307,0.8728711009025574
75633,0.3104192912578583
76262,0.6241782307624817
76272,0.7987861037254333
76276,0.8290190100669861
71476,0.8060117959976196
71490,0.8504258990287781
71796,0.7088370323181152
71811,0.4472646713256836
72128,0.2824065685272217
72132,0.64940494

69776,0.4092973470687866
69778,0.597524881362915
69782,0.7592492699623108
69784,0.6068023443222046
70422,0.6616081595420837
70737,0.8676986694335938
70741,0.6551071405410767
71060,0.8355837464332581
71070,0.8246368169784546
66274,0.4756232500076294
66275,0.5055759549140929
66287,0.6563472747802734
66292,0.764853835105896
66607,0.8573331832885742
66932,0.7400214076042175
67238,0.6825088858604431
67249,0.7703251242637634
67573,0.8965942263603209
67877,0.8278862833976746
78005,0.7759780883789062
78011,0.6904906034469604
78959,0.5141717195510864
78967,0.8467198610305786
78968,0.6162309050559998
78972,0.8246368169784546
79294,0.8239360451698303
79608,0.8504258990287781
79938,0.8504258990287781
80246,0.8246368169784546
80560,0.5942241549491882
80890,0.6888821125030518
40298,0.8005797863006592
39984,0.8246368169784546
40305,0.8597158789634705
44934,0.8666504621505737
44939,0.8116902112960815
45269,0.8246368169784546
40774,0.5221313238143921
40783,0.850425

In [20]:
# Result: this baseline XGBoost submission scored 0.63847 on Kaggle.