## Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## Data Load

In [18]:
train_df = pd.read_csv('/content/drive/MyDrive/LG Aimers 해커톤 data/open/train.csv')

In [13]:
train_df.head()

Unnamed: 0,ID,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TRAIN_00001,70.544,103.32,67.47,1,101.892,74.983,29.45,62.38,245.71,...,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.47,-25.409,-25.304
1,TRAIN_00002,69.524,103.321,65.17,1,101.944,72.943,28.73,61.23,233.61,...,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,TRAIN_00003,72.583,103.32,64.07,1,103.153,72.943,28.81,105.77,272.2,...,31.801,17.08,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.37
3,TRAIN_00004,71.563,103.32,67.57,1,101.971,77.022,28.92,115.21,255.36,...,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,TRAIN_00005,69.524,103.32,63.57,1,101.981,70.904,29.68,103.38,241.46,...,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974


In [19]:
train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

# 데이터 탐색

In [20]:
# 열 이름. 데이터 형식, 값 개수 확인
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39607 entries, 0 to 39606
Data columns (total 71 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      39607 non-null  object 
 1   X_01    39607 non-null  float64
 2   X_02    39607 non-null  float64
 3   X_03    39607 non-null  float64
 4   X_04    39607 non-null  int64  
 5   X_05    39607 non-null  float64
 6   X_06    39607 non-null  float64
 7   X_07    39607 non-null  float64
 8   X_08    39607 non-null  float64
 9   X_09    39607 non-null  float64
 10  X_10    39607 non-null  float64
 11  X_11    39607 non-null  float64
 12  X_12    39607 non-null  float64
 13  X_13    39607 non-null  float64
 14  X_14    39607 non-null  float64
 15  X_15    39607 non-null  float64
 16  X_16    39607 non-null  float64
 17  X_17    39607 non-null  float64
 18  X_18    39607 non-null  float64
 19  X_19    39607 non-null  float64
 20  X_20    39607 non-null  float64
 21  X_21    39607 non-null  float64
 22

In [None]:
# 기초 통계량 
train_df.describe()

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
count,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,...,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0,39607.0
mean,68.41204,103.320166,68.826354,1.0,102.337203,70.597211,29.40749,164.44932,225.39747,0.002449,...,31.290467,16.529382,3.155054,-26.294839,-26.308623,-22.400062,24.325061,-26.237762,-26.233869,-26.245868
std,2.655983,0.000372,5.151167,0.0,0.548353,2.25982,7.338204,220.402444,66.734725,0.086255,...,2.543222,1.893014,0.41894,0.660537,0.65358,0.920952,0.830197,0.656329,0.65509,0.655989
min,56.268,103.32,56.47,1.0,101.774,61.726,14.14,38.46,37.58,0.0,...,18.589,-19.963,0.502,-29.652,-29.523,-31.119,19.844,-29.544,-29.448,-29.62
25%,66.465,103.32,65.07,1.0,101.949,68.864,27.89,105.99,188.54,0.0,...,29.768,16.146,2.863,-26.689,-26.702,-22.871,23.836,-26.63,-26.624,-26.64
50%,68.504,103.32,67.27,1.0,102.006,69.884,28.84,115.04,234.45,0.0,...,31.71,16.694,3.126,-26.254,-26.266,-22.275,24.42,-26.198,-26.193,-26.204
75%,69.524,103.32,71.77,1.0,103.144,71.923,29.87,132.62,263.96,0.0,...,33.184,17.164,3.4335,-25.855,-25.871,-21.791,24.9115,-25.799,-25.794,-25.809
max,84.82,103.321,89.17,1.0,103.16,87.219,163.86,2387.44,637.49,3.6,...,37.25,18.998,5.299,-23.785,-23.96,-20.052,26.703,-23.722,-23.899,-23.856


In [None]:
# 결측치 확인
print(train_df.isna().sum())

ID      0
X_01    0
X_02    0
X_03    0
X_04    0
       ..
Y_10    0
Y_11    0
Y_12    0
Y_13    0
Y_14    0
Length: 71, dtype: int64


In [None]:
# 상관관계 확인
print(df_corr)

          X_01      X_02      X_03  X_04      X_05      X_06      X_07  \
X_01  1.000000 -0.084611  0.211988   NaN  0.553974  0.626521 -0.008373   
X_02 -0.084611  1.000000 -0.028076   NaN -0.048915 -0.047862 -0.003771   
X_03  0.211988 -0.028076  1.000000   NaN -0.076944  0.407266  0.011814   
X_04       NaN       NaN       NaN   NaN       NaN       NaN       NaN   
X_05  0.553974 -0.048915 -0.076944   NaN  1.000000  0.104653 -0.032107   
...        ...       ...       ...   ...       ...       ...       ...   
Y_10  0.034081 -0.008553  0.107851   NaN  0.001554  0.060159 -0.017691   
Y_11  0.009947 -0.007557  0.034114   NaN  0.013407  0.012831 -0.002733   
Y_12  0.066904 -0.013885  0.111755   NaN  0.033491  0.070004 -0.015829   
Y_13  0.068074 -0.013285  0.112385   NaN  0.034294  0.069610 -0.016125   
Y_14  0.067083 -0.013482  0.109578   NaN  0.034546  0.069117 -0.017208   

          X_08      X_09      X_10  ...      Y_05      Y_06      Y_07  \
X_01  0.007622 -0.033579  0.011021  ..

## 전처리
###### 1. X4 : 1차 검사 통과 여부 // 범주로 처리 후 가변수화
###### 2. X23 : 2차 검사 통과 여부 // 범주로 처리 후 가변수화
###### 3. X50 ~ 56 // 이상치 평균치로 처리 or 선형보간법

In [21]:
# X4 : 1차 검사 통과 여부 // 범주로 처리
train_x['X_04'] = train_x['X_04'].map({1:'통과', 0:'탈락'})

# 확인
print(train_x.head())

     X_01     X_02   X_03 X_04     X_05    X_06   X_07    X_08    X_09  X_10  \
0  70.544  103.320  67.47   통과  101.892  74.983  29.45   62.38  245.71   0.0   
1  69.524  103.321  65.17   통과  101.944  72.943  28.73   61.23  233.61   0.0   
2  72.583  103.320  64.07   통과  103.153  72.943  28.81  105.77  272.20   0.0   
3  71.563  103.320  67.57   통과  101.971  77.022  28.92  115.21  255.36   0.0   
4  69.524  103.320  63.57   통과  101.981  70.904  29.68  103.38  241.46   0.0   

   ...  X_47  X_48      X_49        X_50        X_51        X_52        X_53  \
0  ...     1     1   9706.03  137.043591  135.359219  147.837968  134.313475   
1  ...     1     1  10423.43  133.736691  135.979817  149.924692  123.630583   
2  ...     1     1  10948.53  132.805112  131.055355  146.814592  128.939070   
3  ...     1     1  15007.03  134.138760  133.239422  139.720132  132.260824   
4  ...     1     1  11051.03  142.728970  136.620022  134.853555  134.760252   

         X_54        X_55        X_56 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
# X23 : 2차 검사 통과 여부 // 범주로 처리
train_x['X_23'] = train_x['X_23'].map({1:'통과', 0:'탈락'})

# 확인
print(train_x.head())

     X_01     X_02   X_03 X_04     X_05    X_06   X_07    X_08    X_09  X_10  \
0  70.544  103.320  67.47   통과  101.892  74.983  29.45   62.38  245.71   0.0   
1  69.524  103.321  65.17   통과  101.944  72.943  28.73   61.23  233.61   0.0   
2  72.583  103.320  64.07   통과  103.153  72.943  28.81  105.77  272.20   0.0   
3  71.563  103.320  67.57   통과  101.971  77.022  28.92  115.21  255.36   0.0   
4  69.524  103.320  63.57   통과  101.981  70.904  29.68  103.38  241.46   0.0   

   ...  X_47  X_48      X_49        X_50        X_51        X_52        X_53  \
0  ...     1     1   9706.03  137.043591  135.359219  147.837968  134.313475   
1  ...     1     1  10423.43  133.736691  135.979817  149.924692  123.630583   
2  ...     1     1  10948.53  132.805112  131.055355  146.814592  128.939070   
3  ...     1     1  15007.03  134.138760  133.239422  139.720132  132.260824   
4  ...     1     1  11051.03  142.728970  136.620022  134.853555  134.760252   

         X_54        X_55        X_56 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
# 가변수화
dummy_vars = ['X_23', 'X_04']
train_x = pd.get_dummies(train_x, columns=dummy_vars, drop_first=True)

# 확인
train_x

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_10,X_11,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,70.544,103.320,67.47,101.892,74.983,29.45,62.38,245.71,0.0,0.0,...,1,1,9706.03,137.043591,135.359219,147.837968,134.313475,125.605427,136.721425,125.028256
1,69.524,103.321,65.17,101.944,72.943,28.73,61.23,233.61,0.0,0.0,...,1,1,10423.43,133.736691,135.979817,149.924692,123.630583,127.893337,143.322659,124.877308
2,72.583,103.320,64.07,103.153,72.943,28.81,105.77,272.20,0.0,0.0,...,1,1,10948.53,132.805112,131.055355,146.814592,128.939070,127.012195,140.395688,122.238232
3,71.563,103.320,67.57,101.971,77.022,28.92,115.21,255.36,0.0,0.0,...,1,1,15007.03,134.138760,133.239422,139.720132,132.260824,130.723186,147.624829,134.875225
4,69.524,103.320,63.57,101.981,70.904,29.68,103.38,241.46,0.0,0.0,...,1,1,11051.03,142.728970,136.620022,134.853555,134.760252,125.647793,139.331105,123.272762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,66.465,103.320,62.27,103.150,66.825,30.20,77.83,298.05,0.0,0.0,...,1,1,60630.73,129.965741,130.807148,133.481737,125.273130,121.780933,133.780110,129.029812
39603,66.465,103.321,62.77,102.021,66.825,29.21,102.25,270.67,0.0,0.0,...,1,1,60763.43,127.633885,120.158764,142.667802,122.465490,122.987209,143.090741,122.811413
39604,68.504,103.320,64.67,103.144,68.864,29.96,102.61,198.07,0.0,0.0,...,1,1,8813.33,132.501286,136.893025,134.419328,129.115431,130.920147,140.489232,119.166699
39605,66.465,103.320,63.67,102.025,67.845,30.30,112.60,275.52,0.0,0.0,...,1,1,62222.33,128.189679,121.495930,141.288011,130.141676,125.518825,136.603634,124.525929


In [27]:
# X50 ~ 56 // 이상치 평균치로 처리

train_x['X_50'].replace('######',np.NaN)
train_x['X_51'].replace('######',np.NaN)
train_x['X_52'].replace('######',np.NaN)
train_x['X_53'].replace('######',np.NaN)
train_x['X_54'].replace('######',np.NaN)
train_x['X_55'].replace('######',np.NaN)
train_x['X_56'].replace('######',np.NaN)

mean_X10 = train_x['X_50'].mean()
mean_X11 = train_x['X_51'].mean()
mean_X11 = train_x['X_52'].mean()
mean_X11 = train_x['X_53'].mean()
mean_X11 = train_x['X_54'].mean()
mean_X11 = train_x['X_55'].mean()
mean_X11 = train_x['X_56'].mean()

# 채우기
train_x['X_50'].fillna(mean_X10, inplace=True)
train_x['X_51'].fillna(mean_X11, inplace=True)
train_x['X_52'].fillna(mean_X11, inplace=True)
train_x['X_53'].fillna(mean_X11, inplace=True)
train_x['X_54'].fillna(mean_X11, inplace=True)
train_x['X_55'].fillna(mean_X11, inplace=True)
train_x['X_56'].fillna(mean_X11, inplace=True)


In [28]:
# 확인
train_x['X_50']

0        137.043591
1        133.736691
2        132.805112
3        134.138760
4        142.728970
            ...    
39602    129.965741
39603    127.633885
39604    132.501286
39605    128.189679
39606    135.096272
Name: X_50, Length: 39607, dtype: float64

In [None]:
# X50 ~ 56 // 이상치 선형보간법으로 처리

train_x['X_50'].replace('######',np.NaN)
train_x['X_51'].replace('######',np.NaN)
train_x['X_52'].replace('######',np.NaN)
train_x['X_53'].replace('######',np.NaN)
train_x['X_54'].replace('######',np.NaN)
train_x['X_55'].replace('######',np.NaN)
train_x['X_56'].replace('######',np.NaN)

# 채우기
train_x['X_50'].interpolate(method='linear', inplace=True)
train_x['X_51'].interpolate(method='linear', inplace=True)
train_x['X_52'].interpolate(method='linear', inplace=True)
train_x['X_53'].interpolate(method='linear', inplace=True)
train_x['X_54'].interpolate(method='linear', inplace=True)
train_x['X_55'].interpolate(method='linear', inplace=True)
train_x['X_56'].interpolate(method='linear', inplace=True)

## Regression Model Fit

In [None]:
LR = MultiOutputRegressor(LinearRegression()).fit(train_x, train_y)
print('Done.')

Done.


## Inference

In [None]:
test_x = pd.read_csv('/content/drive/MyDrive/LG Aimers 해커톤 data/open/test.csv').drop(columns=['ID'])

In [None]:
preds = LR.predict(test_x)
print('Done.')

Done.


## Submit

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/LG Aimers 해커톤 data/open/sample_submission.csv')

In [None]:
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')

Done.


In [None]:
submit.to_csv('./submit.csv', index=False)