In [1]:
%load_ext autoreload
%autoreload 2

# 목표
* **XGBoost**의 훈련, 파리미터 튜닝, 배포 과정을 **Amazon SageMaker** 빌트인 알고리즘을 이용하여 진행해본다.
* (선택) SageMaker **HyperparameterTuner**을 사용해서 하이퍼파라미터 튜닝하고 결과를 분석, 시각화해본다.

# 필요조건
* 여기서 사용된 이상거래 탐지 데이터셋은 [해당 Kaggle 대회](https://www.kaggle.com/c/ieee-fraud-detection)에서 다운로드 받을 수 있다.
* `scikit-learn` 버전 0.24 이상이 설치되어야 한다. `bokeh` 플롯을 SVG 형식으로 저장하려면 `selenium`, `geckodriver` 설치 또한 필요하다.

In [2]:
import bokeh
import os
import boto3
import sagemaker
import numpy as np
import pandas as pd
from bokeh.io import export_svgs
from bokeh.layouts import gridplot
from bokeh.models import Band, ColumnDataSource, HoverTool, NumeralTickFormatter
from bokeh.plotting import figure, show
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, 
                             average_precision_score, precision_recall_curve, roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, IntegerParameter, HyperparameterTuner
bokeh.io.output_notebook()

In [3]:
def get_pred(score, thr=0.5):
    return np.where(score >= thr, 1, 0)


def is_number(x):
    try:
        float(x)
        return 1
    except:
        return 0 


def make_dirs(path):
    if not os.path.exists(path):
        os.makedirs(path)
    

def str_to_int(x):
    return x if pd.isnull(x) else str(int(x))

# 데이터 불러오기
Kaggle 데이터셋을 로컬 디렉토리 `../../Data/ieee-fraud-detection`에 미리 저장해두었다.

In [4]:
RAW_DATA_PATH = '../../Data/ieee-fraud-detection'
PROC_DATA_PATH = './proc_data'
RANDOM_STATE = 42

In [5]:
train_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_identity.csv'))
train_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_transaction.csv'))
df_train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

범주형 변수의 목록과 설명은 [해당 페이지](https://www.kaggle.com/c/ieee-fraud-detection/data)에서 살펴볼 수 있다.

In [6]:
cat_features = pd.Index(
    ['ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo'] + [
        f'card{i}' for i in range(1, 7)] + [f'M{i}' for i in range(1, 10)] + [f'id_{i}' for i in range(12, 39)])
num_features = df_train.columns.difference(pd.Index(['TransactionID', 'TransactionDT', 'isFraud']) | cat_features)
all_features = cat_features | num_features

int_cat_features =  df_train[cat_features].select_dtypes('number').columns
df_train[int_cat_features] = df_train[int_cat_features].applymap(str_to_int)
df_train[cat_features] = df_train[cat_features].astype('str')

# 데이터셋 분할과 전처리
데이터셋을 훈련 셋, 검증 셋, 시험 셋으로 각각 76.5%, 13.5%, 10%씩 분할하였다. 그런 다음 범주형 변수에 서수형 인코딩과 결측값 처리를, 수치형 변수에 결측값 처리를 따로 적용하였다.

In [7]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(
    df_train[all_features], df_train['isFraud'], test_size=0.1, random_state=RANDOM_STATE, stratify=df_train['isFraud'])

df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_X_train, df_y_train, test_size=0.15, random_state=RANDOM_STATE, stratify=df_y_train)

In [8]:
cat_pipeline = make_pipeline(OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
                             SimpleImputer(strategy='constant', fill_value=-1))
num_pipeline = SimpleImputer(strategy='median')
processor = make_column_transformer((cat_pipeline, cat_features), (num_pipeline, num_features))

X_train = processor.fit_transform(df_X_train)
X_valid = processor.transform(df_X_valid)
X_test = processor.transform(df_X_test)

dtrain = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
dvalid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
dtest = X_test

In [9]:
dir_names = ['train', 'valid', 'test']
file_names = ['dtrain', 'dvalid', 'dtest']

for dir_name in dir_names:
    make_dirs(os.path.join(PROC_DATA_PATH, dir_name))

for dir_name, file_name, dataset in zip(dir_names, file_names, [dtrain, dvalid, dtest]):
    np.savetxt(os.path.join(PROC_DATA_PATH, dir_name, file_name) + '.csv', dataset, delimiter=',', fmt='%i')

# 하이퍼파라미터 튜닝
### S3 버킷에 데이터셋 업로드하기

In [10]:
sagemaker_session = sagemaker.session.Session()
default_bucket = sagemaker_session.default_bucket()
prefix = 'ieee-fraud-detection'
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

In [11]:
%%time
s3_client = boto3.client('s3')

for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(os.path.join(PROC_DATA_PATH, dir_name, file_name) + '.csv', default_bucket, prefix + '/' + dir_name + '/' + file_name + '.csv')

CPU times: user 3.59 s, sys: 2.62 s, total: 6.22 s
Wall time: 2min 45s


### 빌트인 알고리즘 XGBoost Estimator 구성
`XGBoost`는 1.2.1 버전 이미지를 이용했고 하이퍼파라미터 종류와 범위는 [해당 페이지](https://xgboost.readthedocs.io/en/latest/parameter.html)를 참조하여 설정하였다. 대회 기준인 AUROC로 검증 셋을 평가하여 조기 종료하게끔 지정했다.

In [12]:
scale_pos_weight = float(df_y_train.shape[0] / df_y_train.sum() - 1.0)

model_output_uri = f's3://{default_bucket}/{prefix}/models'
image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.2-1',
    py_version='py3',
    instance_type='ml.m5.2xlarge'
)

clf = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=model_output_uri,
    use_spot_instances=False,
    max_wait=None
)
clf.set_hyperparameters(
    booster='gbtree',
    verbosity=0,
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
    seed=42,
    eval_metric='auc',
    num_round=1000,
    early_stopping_rounds=10
)

### HyperparameterTuner 구성과 실행
베이지안 최적화 과정을 30번 수행하여 최적 파라미터를 찾게끔 설정했다.

In [13]:
hyperparameter_ranges = {
    'max_depth': IntegerParameter(1, 30),
    'eta': ContinuousParameter(0.01, 1.0),
    'gamma': ContinuousParameter(0.0, 1.0),
    'min_child_weight': ContinuousParameter(1e-06, 1.0),
    'subsample': ContinuousParameter(0.1, 1.0),
    'colsample_bytree': ContinuousParameter(0.1, 1.0)
}

In [14]:
tuner = HyperparameterTuner(
    clf,
    'validation:auc',
    hyperparameter_ranges,
    objective_type='Maximize',
    max_jobs=30,
    max_parallel_jobs=3,
    base_tuning_job_name='ifd-xgb-hpo',
    early_stopping_type='Off'
)

In [15]:
train_input = TrainingInput(
    s3_data=f's3://{default_bucket}/{prefix}/train/', 
    content_type='text/csv'
)
valid_input = TrainingInput(
    s3_data=f's3://{default_bucket}/{prefix}/valid/', 
    content_type='text/csv'
)

In [16]:
%%time
tuner.fit(
    {
        'train': train_input, 
        'validation': valid_input
    }
)

best_clf = tuner.best_estimator()
best_params = best_clf.hyperparameters()
tuning_job_name = tuner.latest_tuning_job.name

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

### 하이퍼파라미터 튜닝 결과 분석과 시각화

In [17]:
tuning_job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
df_viz = tuning_job_analytics.dataframe()
make_dirs('./images')

In [18]:
df_viz.sort_values('FinalObjectiveValue', ascending=False)[:10]

Unnamed: 0,colsample_bytree,eta,gamma,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
7,0.409983,0.131392,0.963517,16.0,0.001839,0.976316,ifd-xgb-hpo-210502-1035-023-c431dc85,Completed,0.97059,2021-05-02 12:35:56+09:00,2021-05-02 12:49:06+09:00,790.0
6,0.545325,0.077232,0.333767,22.0,0.572492,0.99458,ifd-xgb-hpo-210502-1035-024-7f5bbf49,Completed,0.96995,2021-05-02 12:53:16+09:00,2021-05-02 13:14:44+09:00,1288.0
2,0.999691,0.025824,0.327267,23.0,0.112314,0.980947,ifd-xgb-hpo-210502-1035-028-7bce5a44,Completed,0.96972,2021-05-02 13:18:05+09:00,2021-05-02 15:12:06+09:00,6841.0
8,0.449414,0.05666,0.758674,29.0,0.138613,0.97865,ifd-xgb-hpo-210502-1035-022-4e63e404,Completed,0.96959,2021-05-02 12:25:15+09:00,2021-05-02 12:52:54+09:00,1659.0
0,0.574778,0.143435,0.11755,23.0,7e-06,0.987302,ifd-xgb-hpo-210502-1035-030-99654e79,Completed,0.9695,2021-05-02 13:26:17+09:00,2021-05-02 13:43:00+09:00,1003.0
20,0.990299,0.023552,0.347267,23.0,0.141772,0.998947,ifd-xgb-hpo-210502-1035-010-83f1084b,Completed,0.96859,2021-05-02 11:10:59+09:00,2021-05-02 13:04:15+09:00,6796.0
10,0.271858,0.106583,0.915412,30.0,0.000738,0.950965,ifd-xgb-hpo-210502-1035-020-a3788d5a,Completed,0.96815,2021-05-02 12:17:25+09:00,2021-05-02 12:32:25+09:00,900.0
12,0.395087,0.216504,0.422065,19.0,0.20545,0.873803,ifd-xgb-hpo-210502-1035-018-46da839c,Completed,0.96781,2021-05-02 12:05:32+09:00,2021-05-02 12:16:31+09:00,659.0
16,0.228507,0.087468,0.340186,19.0,0.120525,0.728464,ifd-xgb-hpo-210502-1035-014-4956a344,Completed,0.96767,2021-05-02 11:32:32+09:00,2021-05-02 11:43:53+09:00,681.0
1,0.925819,0.062394,0.578701,25.0,9e-06,1.0,ifd-xgb-hpo-210502-1035-029-a3743943,Completed,0.9673,2021-05-02 13:18:35+09:00,2021-05-02 13:59:59+09:00,2484.0


In [19]:
class HoverHelper():
    def __init__(self, tuning_job_analytics):
        self.tuning_job_analytics = tuning_job_analytics

    def hovertool(self):
        tooltips = [
            ('TrainingJobName', '@TrainingJobName'),
            ('FinalObjectiveValue', '@FinalObjectiveValue')
        ]
    
        for key in self.tuning_job_analytics.tuning_ranges.keys():
            tooltips.append((key, '@{%s}' % key) )

        hover_tool = HoverTool(tooltips=tooltips)
        return hover_tool

    def tools(self, standard_tools='pan, crosshair, wheel_zoom, zoom_in, zoom_out, undo, reset'):
        return [self.hovertool(), standard_tools]
    
    
def make_grid(figures, n_cols):
    rows = []
    for i, figure in enumerate(figures):
        if i % n_cols == 0:
            cols = []
        elif (i % n_cols == n_cols - 1) or i == (len(figures) - 1):
            rows.append(cols)
        cols.append(figure)
    return rows

In [20]:
hover_helper = HoverHelper(tuning_job_analytics)

p = figure(plot_width=800, plot_height=400, tools=hover_helper.tools(), 
           title='Convergence Plot', x_axis_type='datetime', x_axis_label='Training Start Time', y_axis_label='AUROC')
_ = p.line(source=df_viz, x='TrainingStartTime', y='FinalObjectiveValue', color='coral', line_width=1.5)
_ = p.circle(source=df_viz, x='TrainingStartTime', y='FinalObjectiveValue', line_color='coral', line_width=1.5, fill_color='white')

p.title.align = 'center'
p.title.text_font_size = '11pt'
p.xgrid.grid_line_color = None
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename='./images/convergence_plot.svg')

In [21]:
df_viz = df_viz.reset_index()
df_viz['index'] = (df_viz['index'] + df_viz['index'].min()) / (df_viz['index'].max() - df_viz['index'].min())

figures = []
for param_name, param_range in tuning_job_analytics.tuning_ranges.items():
    categorical_args = dict()
    if param_range.get('Values'):          
        values = param_range['Values']
        if sum([is_number(x) for x in values]) == len(values):
            print("Hyperparameter %s is tuned as categorical, but all values are numeric." % param_name)
        else:
            categorical_args['x_range'] = values

    plot = figure(plot_width=400, plot_height=400, tools=hover_helper.tools(), 
               x_axis_label=param_name, y_axis_label='AUROC', **categorical_args)
    plot.circle(source=df_viz, x=param_name, y='FinalObjectiveValue', color='black', alpha='index')
    plot.xgrid.grid_line_color = None
    plot.yaxis.formatter = NumeralTickFormatter(format='0.0%')
    figures.append(plot)

p = gridplot(make_grid(figures, 3), toolbar_location='right')
show(p)

_ = bokeh.io.export_png(p, filename='./images/partial_dependence_plot.png')

# 시험 셋 평가
### Transformer 구성과 예측 점수 생성

In [22]:
%%time
transformer = best_clf.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{default_bucket}/{prefix}/prediction'
)
_ = transformer.transform(
    data=f's3://{default_bucket}/{prefix}/test/',
    content_type='text/csv', 
    split_type='Line'
)

.......................[34m[2021-05-02:09:03:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-02:09:03:21:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-02:09:03:21:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    locati

In [23]:
s3_client.download_file(default_bucket, f'{prefix}/prediction/dtest.csv.out', os.path.join(PROC_DATA_PATH, 'test', 'dtest.csv.out'))
scores = pd.read_csv(os.path.join(PROC_DATA_PATH, 'test', 'dtest.csv.out'), header=None).values

### 예측 성능 평가

In [24]:
preds = get_pred(scores)

print('Accuracy: {0:.2%}, Precision: {1:.2%}, Recall: {2:.2%}, F1: {3:.2%}'.format(
    accuracy_score(df_y_test, preds), precision_score(df_y_test, preds), recall_score(df_y_test, preds), f1_score(df_y_test, preds)))

Accuracy: 98.71%, Precision: 89.75%, Recall: 71.20%, F1: 79.41%


In [25]:
source = pd.DataFrame(roc_curve(df_y_test, scores), index=['fpr', 'tpr', 'thr']).T

p = figure(plot_height=400, title='ROC Curve (AUROC {:.2%})'.format(roc_auc_score(df_y_test, scores)), 
           x_axis_label='False Positive Rate', y_axis_label='True Positive Rate')
_ = p.line(source=source, x='fpr', y='tpr', color='coral', line_width=1.5)
_ = p.line(source=source, x='fpr', y='fpr', color='black', line_dash='dashed')

p.title.align = 'center'
p.title.text_font_size = '11pt'
p.xgrid.grid_line_color = None
p.xaxis.formatter = NumeralTickFormatter(format='0.0%')
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename='./images/roc_curve.svg')

In [26]:
source = pd.DataFrame(precision_recall_curve(df_y_test, scores), index=['recall', 'precision', 'thr']).T

p = figure(plot_height=400, title='Precision - Recall Curve (AUPRC {:.2%})'.format(average_precision_score(df_y_test, scores)), 
           x_axis_label='Recall', y_axis_label='Precision')
_ = p.line(source=source, x='recall', y='precision', color='coral', line_width=1.0)
band = Band(source=ColumnDataSource(data=dict(recall=source['recall'], precision=source['precision'])), 
            base='recall', upper='precision', level='underlay', fill_alpha=0.2, fill_color='coral')
p.add_layout(band)

p.title.align = 'center'
p.title.text_font_size = '11pt'
p.xgrid.grid_line_color = None
p.xaxis.formatter = NumeralTickFormatter(format='0.0%')
p.yaxis.formatter = NumeralTickFormatter(format='0.0%')

show(p)

p.output_backend = 'svg'
_ = export_svgs(p, filename='./images/pr_curve.svg')

# 모델 적합과 배포
Kaggle 대회에 제출할 퀴즈 셋을 

In [27]:
test_identity = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_identity.csv')) 
test_transaction = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_transaction.csv'))
df_test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
df_test = df_test.rename(columns={'id-{:02d}'.format(i): 'id_{:02d}'.format(i) for i in range(1, 39)})

df_test[int_cat_features] = df_test[int_cat_features].applymap(str_to_int)
df_test[cat_features] = df_test[cat_features].astype('str')

In [28]:
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_train[all_features], df_train['isFraud'], test_size=0.15, random_state=RANDOM_STATE, stratify=df_train['isFraud'])

X_train = processor.fit_transform(df_X_train)
X_valid = processor.transform(df_X_valid)
X_test = processor.transform(df_test[all_features])

dtrain = np.concatenate((df_y_train.values.reshape(-1, 1), X_train), axis=1)
dvalid = np.concatenate((df_y_valid.values.reshape(-1, 1), X_valid), axis=1)
dtest = X_test

In [29]:
for dir_name in dir_names:
    make_dirs(os.path.join(PROC_DATA_PATH, dir_name))

for dir_name, file_name, dataset in zip(dir_names, file_names, [dtrain, dvalid, dtest]):
    np.savetxt(os.path.join(PROC_DATA_PATH, dir_name, file_name) + '.csv', dataset, delimiter=',', fmt='%i')

In [30]:
%%time
for dir_name, file_name in zip(dir_names, file_names):
    s3_client.upload_file(os.path.join(PROC_DATA_PATH, dir_name, file_name) + '.csv', default_bucket, prefix + '/' + dir_name + '/' + file_name + '.csv')

CPU times: user 6.42 s, sys: 5.01 s, total: 11.4 s
Wall time: 5min 50s


In [31]:
%%time
best_params = best_clf.hyperparameters()
_ = best_params.pop('_tuning_objective_metric')

params = clf.hyperparameters()
params.update(best_params)
clf.set_hyperparameters(**params)

valid_input = TrainingInput(
    s3_data=f's3://{default_bucket}/{prefix}/valid/', 
    content_type='text/csv'
)

clf.fit(
    {
    'train': train_input,
    'validation': valid_input
    }
)

2021-05-02 09:49:35 Starting - Starting the training job...
2021-05-02 09:49:39 Starting - Launching requested ML instancesProfilerReport-1619948973: InProgress
......
2021-05-02 09:51:12 Starting - Preparing the instances for training......
2021-05-02 09:52:12 Downloading - Downloading input data...
2021-05-02 09:52:52 Training - Training image download completed. Training in progress.[34m[2021-05-02 09:52:47.393 ip-10-2-195-9.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value gbtree to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself

In [32]:
%%time
transformer = clf.transformer(
    instance_count=1, 
    instance_type='ml.m5.2xlarge', 
    output_path=f's3://{default_bucket}/{prefix}/prediction'
)
_ = transformer.transform(
    data=f's3://{default_bucket}/{prefix}/test/',
    content_type='text/csv', 
    split_type='Line'
)

...........................[34m[2021-05-02:10:11:25:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-02:10:11:25:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-02:10:11:25:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[35m[2021-05-02:10:11:25:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2021-05-02:10:11:25:INFO] No GPUs detected (normal if no gpus installed)[0m
[35m[2021-05-02:10:11:25:INFO] nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 

In [33]:
s3_client.download_file(default_bucket, f'{prefix}/prediction/dtest.csv.out', os.path.join(PROC_DATA_PATH, 'test', 'dtest.csv.out'))
scores = pd.read_csv(os.path.join(PROC_DATA_PATH, 'test', 'dtest.csv.out'), header=None).values

In [34]:
submission = pd.DataFrame({'TransactionID': df_test['TransactionID'].values, 'isFraud': scores.flatten()})
submission.to_csv('./submission.csv', index=False)