In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [5]:
# Load the pre-scaling dataset; the first CSV column becomes the row index.
df = pd.read_csv('data_beforeScaling.csv', index_col=0, encoding='utf-8-sig')

# Quick inspection: column dtypes, column labels, then summary statistics.
# Each of the first two sections is followed by one empty line.
print(df.dtypes, end='\n\n')
print(df.columns[:], end='\n\n')
print(df.describe())

region            int64
education         int64
jobtype           int64
cptype            int64
sales           float64
employees       float64
aversalary      float64
capital         float64
pros_encoded    float64
dtype: object

Index(['region', 'education', 'jobtype', 'cptype', 'sales', 'employees',
       'aversalary', 'capital', 'pros_encoded'],
      dtype='object')

            region    education      jobtype       cptype         sales  \
count  3015.000000  3015.000000  3015.000000  3015.000000  2.253000e+03   
mean      0.764842     0.836153     0.482587     0.670315  3.576065e+06   
std       0.808932     1.170736     0.499780     0.817098  9.453942e+06   
min       0.000000     0.000000     0.000000     0.000000  0.000000e+00   
25%       0.000000     0.000000     0.000000     0.000000  5.941000e+03   
50%       1.000000     0.000000     0.000000     0.000000  1.542990e+05   
75%       1.000000     2.000000     1.000000     1.000000  1.191101e+06   
max       2.000000     5

In [None]:
# Alternative scaling kept for comparison: StandardScaler over the columns
# 'sales' through 'pros_encoded'.
# The result is written to its own frame (`df_standard`) instead of overwriting
# `df`, so running every cell top to bottom no longer stacks this scaler
# underneath the RobustScaler cell that the rest of the notebook builds on.
scaler = StandardScaler()

df_standard = df.copy()
df_standard.loc[:, 'sales': 'pros_encoded'] = scaler.fit_transform(df.loc[:, 'sales': 'pros_encoded'])
df_standard = df_standard.round(4)

In [None]:
# Alternative scaling kept for comparison: MinMaxScaler over the columns
# 'sales' through 'pros_encoded'.
# The result is written to its own frame (`df_minmax`) instead of overwriting
# `df`, so running every cell top to bottom no longer feeds already-rescaled
# values into the RobustScaler cell that the rest of the notebook builds on.
scaler = MinMaxScaler()

df_minmax = df.copy()
df_minmax.loc[:, 'sales': 'pros_encoded'] = scaler.fit_transform(df.loc[:, 'sales': 'pros_encoded'])
df_minmax = df_minmax.round(4)

In [6]:
# Robust scaling of the columns 'sales' through 'pros_encoded'; the columns
# before 'sales' are not passed to the scaler.
# NOTE(review): this overwrites `df` in place, so re-running the cell rescales
# values that were already scaled. Reload the data before running it again.
scaler = RobustScaler()

scaled_values = scaler.fit_transform(df.loc[:, 'sales': 'pros_encoded'])
df.loc[:, 'sales': 'pros_encoded'] = scaled_values

# Keep four decimal places across the whole frame.
df = df.round(decimals=4)

print(df.describe())

            region    education      jobtype       cptype        sales  \
count  3015.000000  3015.000000  3015.000000  3015.000000  2253.000000   
mean      0.764842     0.836153     0.482587     0.670315     2.887175   
std       0.808932     1.170736     0.499780     0.817098     7.976932   
min       0.000000     0.000000     0.000000     0.000000    -0.130200   
25%       0.000000     0.000000     0.000000     0.000000    -0.125200   
50%       1.000000     0.000000     0.000000     0.000000     0.000000   
75%       1.000000     2.000000     1.000000     1.000000     0.874800   
max       2.000000     5.000000     1.000000     2.000000    95.856900   

         employees   aversalary      capital  pros_encoded  
count  2482.000000  2149.000000  2303.000000   3015.000000  
mean      1.121039     0.154087    24.992462      0.153018  
std       3.190629     0.750271   162.426437      0.609906  
min      -0.209200    -1.782600    -0.113000     -0.500000  
25%      -0.187600    -0.304

In [7]:
# Shrink values whose absolute value is still greater than 10 after robust scaling.
def log_transform(x):
    """Compress the part of a scalar that lies beyond +/-10 onto a log10 scale.

    Parameters
    ----------
    x : scalar number
        A single value (applied element-wise by the caller). NaN is accepted.

    Returns
    -------
    number
        * ``0`` when ``x == 0``
        * ``10 + log10(x - 10)`` when ``x > 10``
        * ``-10 - log10(|x| - 10)`` when ``x < -10`` (mirror image of the
          positive branch)
        * ``x`` unchanged otherwise; NaN fails every comparison and is
          therefore returned as-is.
    """
    if x == 0:
        return 0
    elif x > 10:
        # NOTE(review): for 10 < x < 11 the log term is negative, so the result
        # drops below 10 (e.g. 10.5 -> ~9.70). Left as designed; confirm this
        # is acceptable for the data at hand.
        return 10 + np.log10(x - 10)
    elif x < -10:
        # Mirror of the positive branch. The previous version used the natural
        # log and dropped the 10 offset, so -110 became about -4.6 instead of
        # -12 and -10.5 even flipped sign to about +0.69.
        return -10 - np.log10(abs(x) - 10)
    else:
        return x

# Run log_transform on every cell of the columns 'sales' through
# 'pros_encoded' and write the result back into the same slice of `df`.
compressed = df.loc[:, 'sales': 'pros_encoded'].map(log_transform)
df.loc[:, 'sales': 'pros_encoded'] = compressed

# Summary statistics after the transform.
print(df.describe())

            region    education      jobtype       cptype        sales  \
count  3015.000000  3015.000000  3015.000000  3015.000000  2253.000000   
mean      0.764842     0.836153     0.482587     0.670315     1.609243   
std       0.808932     1.170736     0.499780     0.817098     3.502903   
min       0.000000     0.000000     0.000000     0.000000    -0.130200   
25%       0.000000     0.000000     0.000000     0.000000    -0.125200   
50%       1.000000     0.000000     0.000000     0.000000     0.000000   
75%       1.000000     2.000000     1.000000     1.000000     0.874800   
max       2.000000     5.000000     1.000000     2.000000    11.933775   

         employees   aversalary      capital  pros_encoded  
count  2482.000000  2149.000000  2303.000000   3015.000000  
mean      0.972960     0.154087     1.615437      0.153018  
std       2.423187     0.750271     3.639170      0.609906  
min      -0.209200    -1.782600    -0.113000     -0.500000  
25%      -0.187600    -0.304

In [8]:
# Persist the scaled frame as CSV, using the same 'utf-8-sig' encoding as the input file.
df.to_csv(path_or_buf='data.csv', encoding='utf-8-sig')