In [82]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
warnings.filterwarnings("ignore")
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # 日本語対応
import seaborn as sns
# pandas display options: show up to 300 columns and 300 rows, allow cell text
# up to 5000 characters wide, and render floats with three decimal places.
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # よりお手軽にサクっと並列処理を実行出来るモジュール
from tqdm import tqdm, tqdm_notebook # プログレスバーを表示できる
from PIL import Image
tqdm.pandas()

# 外部モジュールを自動的にリロードする
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [83]:
# Path to the YAML configuration file, relative to this notebook's working directory.
CONFIG_FILE = '../configs/config.yaml'

# Use safe_load instead of a bare yaml.load(file): calling yaml.load without a
# Loader argument is deprecated since PyYAML 5.1 and raises TypeError on
# PyYAML >= 6. safe_load also constructs only plain Python objects.
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)
# Directory settings read from the SETTING section of the config.
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_PATH']

In [84]:
def load_datasets_train(feats):
    """Read the pickled train-split object of every feature and join them column-wise.

    Args:
        feats: iterable of feature names. For each name the file
            FEATURE_DIR_NAME + '<name>_train.pkl' is read with pd.read_pickle.

    Returns:
        The result of pd.concat(..., axis=1) over the loaded objects, in the
        order given by ``feats``.
    """
    frames = []
    for name in feats:
        frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{name}_train.pkl'))
    return pd.concat(frames, axis=1)

def load_datasets_both(feats):
    """Read the pickled train and test objects of every feature and join each split column-wise.

    Args:
        feats: iterable of feature names. For each name the files
            FEATURE_DIR_NAME + '<name>_train.pkl' and
            FEATURE_DIR_NAME + '<name>_test.pkl' are read with pd.read_pickle.

    Returns:
        A tuple ``(X_train, X_test)``; each element is pd.concat(..., axis=1)
        over the objects loaded for that split, in the order given by ``feats``.
    """
    def _read_split(split):
        # Build one path per feature for the requested split, then load and join.
        paths = [FEATURE_DIR_NAME + f'{name}_{split}.pkl' for name in feats]
        return pd.concat([pd.read_pickle(path) for path in paths], axis=1)

    # The train split is read and concatenated completely before the test split.
    X_train = _read_split('train')
    X_test = _read_split('test')
    return X_train, X_test

# 欠損値の確認
def missing_values_table(data):
    """Summarise missing values and dtypes per column.

    Args:
        data: DataFrame to inspect.

    Returns:
        A DataFrame with one column per column of ``data`` and three rows:
        'Total' (number of nulls), 'Percent' (nulls as a percentage of the
        row count) and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_percent = null_counts / null_mask.count() * 100
    summary = pd.concat([null_counts, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Transpose so that the inspected columns run along the horizontal axis.
    return np.transpose(summary)

In [85]:
! pwd

/Users/endotakeru/Documents/python/signate/car-run/pipeline/notebooks


In [87]:
# Feature names handed to load_datasets_both; each name is resolved there to
# '<name>_train.pkl' and '<name>_test.pkl' under FEATURE_DIR_NAME.
features = [
    'acceleration',
    'car_label_encoder',
    'cylinders',
    'displacement',
    'horsepower_mis_val_median',
    'model_year',
    'mpg',
    'origin',
    'weight',
]

In [88]:
# Load the column-wise joined train and test feature frames for every name in `features`.
train, test = load_datasets_both(features)

In [89]:
# Show the (rows, columns) shape of the train and test frames.
display(train.shape, test.shape)

(199, 9)

(199, 8)

In [90]:
# Preview the first and last rows of the train frame, followed by its shape.
display(train.head(), train.tail(), train.shape)

Unnamed: 0,acceleration,car name_label_encoder,cylinders,displacement,horsepower_mis_val_median,model year,mpg,origin,weight
0,16.0,96,4,135.0,84.0,82,29.0,1,2525.0
1,14.0,304,4,89.0,71.0,79,31.9,2,1925.0
2,15.5,276,6,156.0,108.0,76,19.0,3,2930.0
3,14.5,103,4,90.0,75.0,74,28.0,1,2125.0
4,17.3,278,4,89.0,62.0,81,37.7,3,2050.0


Unnamed: 0,acceleration,car name_label_encoder,cylinders,displacement,horsepower_mis_val_median,model year,mpg,origin,weight
194,19.2,79,4,85.0,65.0,80,40.8,3,2110.0
195,12.8,195,8,302.0,139.0,78,20.2,1,3570.0
196,12.0,14,8,304.0,150.0,70,16.0,1,3433.0
197,23.7,300,4,90.0,48.0,80,43.4,2,2335.0
198,15.5,121,4,98.0,90.0,73,26.0,2,2265.0


(199, 9)

In [91]:
# Show summary statistics (count, mean, std, min, quartiles, max) for both frames.
display(train.describe(), test.describe())

Unnamed: 0,acceleration,car name_label_encoder,cylinders,displacement,horsepower_mis_val_median,model year,mpg,origin,weight
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,15.648,144.553,5.296,183.312,101.07,76.166,24.307,1.583,2883.839
std,2.702,92.657,1.645,98.4,35.237,3.803,7.798,0.799,819.767
min,8.5,0.0,3.0,71.0,46.0,70.0,9.0,1.0,1613.0
25%,14.0,61.0,4.0,98.0,75.0,73.0,18.0,1.0,2217.5
50%,15.5,140.0,4.0,140.0,90.0,76.0,24.0,1.0,2702.0
75%,17.15,227.0,6.0,250.0,112.5,80.0,30.5,2.0,3426.5
max,23.7,304.0,8.0,454.0,220.0,82.0,44.6,3.0,5140.0


Unnamed: 0,acceleration,car name_label_encoder,cylinders,displacement,horsepower_mis_val_median,model year,origin,weight
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,15.488,152.548,5.613,203.54,107.482,75.854,1.563,3057.01
std,2.817,86.267,1.745,109.138,40.847,3.592,0.807,866.497
min,8.0,2.0,3.0,68.0,46.0,70.0,1.0,1795.0
25%,13.5,74.5,4.0,107.0,78.0,73.0,1.0,2255.5
50%,15.5,156.0,6.0,163.0,95.0,76.0,1.0,2945.0
75%,17.15,223.5,8.0,304.0,138.5,79.0,2.0,3672.0
max,24.8,303.0,8.0,455.0,230.0,82.0,3.0,4952.0


In [92]:
# Check the missing values in each dataset (null count, null percentage and dtype per column).
display(
    missing_values_table(train),
    missing_values_table(test)
)

Unnamed: 0,acceleration,car name_label_encoder,cylinders,displacement,horsepower_mis_val_median,model year,mpg,origin,weight
Total,0,0,0,0,0,0,0,0,0
Percent,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Types,float64,int64,int64,float64,float64,int64,float64,int64,float64


Unnamed: 0,acceleration,car name_label_encoder,cylinders,displacement,horsepower_mis_val_median,model year,origin,weight
Total,0,0,0,0,0,0,0,0
Percent,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Types,float64,int64,int64,float64,float64,int64,int64,float64


In [93]:
# Check the dtype of each column in the train frame.
display(
    train.dtypes
)

acceleration                 float64
car name_label_encoder         int64
cylinders                      int64
displacement                 float64
horsepower_mis_val_median    float64
model year                     int64
mpg                          float64
origin                         int64
weight                       float64
dtype: object

In [95]:
# Read the submission file stored under ../models/xgb_0209_1759; it is parsed as tab-separated text.
submission = pd.read_csv('../models/xgb_0209_1759/xgb_0209_1759_submission.csv', delimiter='\t')

In [109]:
# Read the raw sample submission. Because `names` is given and `header` is not,
# pandas treats the first line of the file as data and labels the columns "id" and "mpg".
sample_submission = pd.read_csv('../data/raw/sample_submit.csv', names=["id", "mpg"])

In [111]:
# Write the frame to sample_submit.csv in the current working directory without the index column.
# NOTE(review): to_csv writes a header row ("id,mpg") by default, whereas the raw file was read
# as header-less — confirm that the consumer of this file expects a header.
sample_submission.to_csv('sample_submit.csv', index = False)