In [3]:
# Basic module
import numpy as np
import seaborn as sns
import pandas as pd
import pickle
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
%matplotlib inline

-----------------
#### ■データの読み込み
------------------

In [4]:
train = pd.read_csv('../1. data/train.csv')
test = pd.read_csv('../1. data/test.csv')

In [5]:
# 確認
train.head(1)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,0,23.059782,6,140,110,2815,17.977429,80,1,dodge aspen


In [6]:
# 統計情報確認
train.describe()

Unnamed: 0,id,mpg,cylinders,displacement,weight,acceleration,model year,origin
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,500.176,27.01094,5.192,194.762,2719.714,15.300328,76.332,1.42
std,288.657179,7.356249,1.617519,106.277425,717.03541,2.261096,3.909007,0.759443
min,0.0,15.787613,4.0,79.0,1755.0,9.530859,70.0,1.0
25%,242.25,22.396641,4.0,104.0,2178.75,13.441562,73.0,1.0
50%,513.0,26.228984,4.0,140.0,2615.0,15.231923,76.0,1.0
75%,750.25,35.088333,6.0,302.0,3193.0,17.190531,80.0,2.0
max,997.0,44.763897,8.0,429.0,4732.0,21.922511,82.0,3.0


In [19]:
# 標準化関数
def stdsc(train, test):
    """学習データの値で標準化を行う関数
    
    Parameter:
    -----------------------------------
    train: ndarray
    test: ndarray
    
    Return:
    -----------------------------------
    ndarray
    """
    mean = np.mean(train)
    std = np.std(train)
    
    stdsc_test = (test - mean) / std
    
    return stdsc_test

In [20]:
# 標準化モジュールのインスタンス生成
# stdsc = StandardScaler()

# 標準化
train['cylinders_std'] = stdsc(train['cylinders'].values, train['cylinders'].values)
train['displacement_std'] = stdsc(train['displacement'].values, train['displacement'].values)
train['weight_std'] = stdsc(train['weight'].values, train['weight'].values)
train['acceleration_std'] = stdsc(train['acceleration'].values, train['acceleration'].values)
train['model year_std'] = stdsc(train['model year'].values, train['model year'].values)
train['origin_std'] = stdsc(train['origin'].values,train['origin'].values)

In [21]:
# 確認
columns = ['displacement_std','weight_std','acceleration_std','model year','origin_std']
for name in columns:
    print('{}: {}, {}'.format(name, train[name].mean(), train[name].var()))

displacement_std: 6.661338147750939e-18, 1.002004008016034
weight_std: 1.9317880628477722e-17, 1.0020040080160313
acceleration_std: -5.460493168740754e-16, 1.0020040080160324
model year: 76.332, 15.280336673346628
origin_std: 2.6645352591003758e-18, 1.0020040080160222


In [22]:
# テストデータの標準化
test.describe()

Unnamed: 0,id,cylinders,displacement,weight,acceleration,model year,origin
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,498.824,5.148,186.534,2740.426,15.301564,76.316,1.448
std,289.26911,1.617197,103.173108,688.956869,2.222428,3.83426,0.764503
min,1.0,4.0,80.0,1613.0,9.550639,70.0,1.0
25%,252.75,4.0,104.0,2197.5,13.467201,73.0,1.0
50%,490.0,4.0,140.0,2702.0,15.255378,76.0,1.0
75%,748.5,6.0,302.0,3212.0,17.182176,80.0,2.0
max,999.0,8.0,429.0,4732.0,21.770354,82.0,3.0


In [23]:
# テストデータの標準化
# 標準化
test['cylinders_std'] = stdsc(train['cylinders'].values, test['cylinders'].values)
test['displacement_std'] = stdsc(train['displacement'].values, test['displacement'].values)
test['weight_std'] = stdsc(train['weight'].values, test['weight'].values)
test['acceleration_std'] = stdsc(train['acceleration'].values, test['acceleration'].values)
test['model year_std'] = stdsc(train['model year'].values, test['model year'].values)
test['origin_std'] = stdsc(train['origin'].values,test['origin'].values)

In [24]:
# 確認
columns = ['displacement_std','weight_std','acceleration_std','model year','origin_std']
for name in columns:
    print('{}: {}, {}'.format(name, test[name].mean(), test[name].var()))

displacement_std: -0.07749755261528722, 0.9443227153240497
weight_std: 0.028914531722199197, 0.9250651514741064
acceleration_std: 0.0005471562542567172, 0.9680256182702732
model year: 76.316, 14.701547094188372
origin_std: 0.036906056197007216, 1.015401198505423


In [25]:
# 学習データの列取得
train.columns

Index(['id', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name', 'cylinders_std',
       'displacement_std', 'weight_std', 'acceleration_std', 'model year_std',
       'origin_std'],
      dtype='object')

In [26]:
# テストデータの列取得
test.columns

Index(['id', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name', 'cylinders_std',
       'displacement_std', 'weight_std', 'acceleration_std', 'model year_std',
       'origin_std'],
      dtype='object')

In [27]:
# 学習データ
# 不要な列を削除
drop_colums = [
        'id', 
#         'mpg', 
        'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name', 
#         'displacement_std', 'weight_std', 'acceleration_std', 'model year_std', 'cylinders_std', 'origin_std'
]

X = train.drop(drop_colums, axis=1)
file_name = '../1. data/train_pre_proccessing.pkl'
with open(file_name, 'wb') as fp:
    pickle.dump(X, fp)

In [28]:
X

Unnamed: 0,mpg,cylinders_std,displacement_std,weight_std,acceleration_std,model year_std,origin_std
0,23.059782,0.500031,-0.515790,0.133022,1.185170,0.939285,-0.553591
1,17.674521,1.737730,1.462149,2.423904,-0.790582,-1.109320,-0.553591
2,17.136353,1.737730,1.010049,0.075785,-0.925440,0.683210,-0.553591
3,22.664666,0.500031,1.933087,-0.739496,-0.046018,-1.365395,-0.553591
4,17.872018,1.737730,2.206231,-0.662714,-2.514097,-1.621471,-0.553591
...,...,...,...,...,...,...,...
495,22.798447,-0.737669,-0.515790,0.160942,-0.806945,1.451437,-0.553591
496,35.173640,-0.737669,-0.920797,-0.678070,0.992704,0.939285,2.082556
497,17.825448,1.737730,1.010049,0.075785,-0.054514,-0.085017,-0.553591
498,28.545147,-0.737669,-0.920797,-0.823257,-0.874637,-1.621471,-0.553591


In [29]:
# テストデータ
# 不要な列を削除
drop_colums = [
        'id', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'origin', 'car name', 
#         'displacement_std', 'weight_std', 'acceleration_std', 'cylinders_std', 'origin_std'
]

X = test.drop(drop_colums, axis=1)
file_name = '../1. data/test_pre_proccessing.pkl'
with open(file_name, 'wb') as fp:
    pickle.dump(X, fp)

In [30]:
X

Unnamed: 0,cylinders_std,displacement_std,weight_std,acceleration_std,model year_std,origin_std
0,-0.737669,-0.911378,-1.004741,-0.110913,1.195361,-0.553591
1,-0.737669,-0.920797,0.000399,0.017197,0.683210,-0.553591
2,-0.737669,-0.986728,0.121854,1.116183,0.171059,-0.553591
3,-0.737669,-0.515790,0.121854,-0.672902,1.451437,0.764483
4,-0.737669,-0.694747,0.489009,-0.713380,-0.853244,0.764483
...,...,...,...,...,...,...
495,-0.737669,0.284804,1.605831,0.033638,-0.085017,-0.553591
496,-0.737669,-0.515790,0.000399,-0.745289,1.451437,-0.553591
497,1.737730,1.160749,0.075785,-0.897672,-0.341093,-0.553591
498,-0.737669,-0.986728,0.121854,1.103327,0.171059,-0.553591
