# HR Data Analytics Internship 개인 과제

## 전복의 physical 특징을 기반으로 나이를 예측

### 데이터 특성

|구분|단위|상세설명|
|------|---|---|
|성별|-|M: male, F: female, I: infant|
|길이|inch|전복의 가장 긴 단면길이|
|지름|inch|전복의 가장 짧은 단면길이|
|높이|inch|전복의 높이|
|총무게|lb|전복 전체의 무게|
|속살무게|lb|전복 속살의 무게|
|내장무게|lb|전복 내장의 무게|
|나이|years|전복의 나이|

### 데이터 불러오기

In [1]:
import pandas as pd

In [20]:
data = pd.read_excel("2020 HR Data Analytics 데이터 (사후과제시 열람).xlsx", sheet_name = 1, columns=['성별', '길이', '지름', '높이', '총무게', '속살무게', '내장무게', '나이'])
data = data.drop("ID", axis=1)
data

Unnamed: 0,성별,길이,지름,높이,총무게,속살무게,내장무게,나이
0,M,4.55,3.65,0.95,0.5140,0.2245,0.1010,15
1,M,3.50,2.65,0.90,0.2255,0.0995,0.0485,7
2,F,5.30,4.20,1.35,0.6770,0.2565,0.1415,9
3,M,4.40,3.65,1.25,0.5160,0.2155,0.1140,10
4,I,3.30,2.55,0.80,0.2050,0.0895,0.0395,7
...,...,...,...,...,...,...,...,...
4172,F,5.65,4.50,1.65,0.8870,0.3700,0.2390,11
4173,M,5.90,4.40,1.35,0.9660,0.4390,0.2145,10
4174,M,6.00,4.75,2.05,1.1760,0.5255,0.2875,9
4175,F,6.25,4.85,1.50,1.0945,0.5310,0.2610,10


### 데이터 전처리 (정규화)

> 데이터가 가진 feature의 스케일이 차이가 나기 때문에 정규화가 필요하다.

In [21]:
import numpy as np

#### 성별 정규화

In [22]:
data.loc[data['성별'] == 'M', '성별'] = 1
data.loc[data['성별'] == 'F', '성별'] = 0.5
data.loc[data['성별'] == 'I', '성별'] = 0

In [23]:
data.head(5)

Unnamed: 0,성별,길이,지름,높이,총무게,속살무게,내장무게,나이
0,1.0,4.55,3.65,0.95,0.514,0.2245,0.101,15
1,1.0,3.5,2.65,0.9,0.2255,0.0995,0.0485,7
2,0.5,5.3,4.2,1.35,0.677,0.2565,0.1415,9
3,1.0,4.4,3.65,1.25,0.516,0.2155,0.114,10
4,0.0,3.3,2.55,0.8,0.205,0.0895,0.0395,7


#### 길이, 지름, 높이, 총무게, 속살무게, 내장무게

첫 번째 방법, Min-Max Normalization

In [24]:
data["길이"] = (data["길이"] - min(data["길이"])) / (max(data["길이"]) - min(data["길이"]))
data["지름"] = (data["지름"] - min(data["지름"])) / (max(data["지름"]) - min(data["지름"]))
data["높이"] = (data["높이"] - min(data["높이"])) / (max(data["높이"]) - min(data["높이"]))
data["총무게"] = (data["총무게"] - min(data["총무게"])) / (max(data["총무게"]) - min(data["총무게"]))
data["속살무게"] = (data["속살무게"] - min(data["속살무게"])) / (max(data["속살무게"]) - min(data["속살무게"]))
data["내장무게"] = (data["내장무게"] - min(data["내장무게"])) / (max(data["내장무게"]) - min(data["내장무게"]))

In [25]:
data.head(5)

Unnamed: 0,성별,길이,지름,높이,총무게,속살무게,내장무게,나이
0,1.0,0.513514,0.050832,0.278689,0.181335,0.150303,0.132324,15
1,1.0,0.371622,0.032348,0.262295,0.079157,0.066241,0.063199,7
2,0.5,0.614865,0.060998,0.409836,0.239065,0.171822,0.185648,9
3,1.0,0.493243,0.050832,0.377049,0.182044,0.14425,0.14944,10
4,0.0,0.344595,0.030499,0.229508,0.071897,0.059516,0.05135,7


### 훈련데이터와 검증데이터 분류

전처리된 데이터 중 일부는 모델을 훈련시킬 train_x와 train_y로 분류합니다.

나머지 데이터는 정확도를 검증할 test_x, text_y로 분류합니다

훈련데이터와 검증데이터의 비율은 7:3 으로 하였습니다.

In [26]:
4177 * 0.7

2923.8999999999996

In [27]:
x_train = data[:2923]
del x_train["나이"]
x_train = x_train.to_numpy()
y_train = data[:2923][["나이"]].to_numpy()

In [28]:
x_test = data[2923:]
del x_test["나이"]
x_test = x_test.to_numpy()
y_test = data[2923:][["나이"]].to_numpy()

### 모델 생성

#### Decision Tree

In [29]:
from sklearn.tree import DecisionTreeRegressor

In [30]:
dt_model = DecisionTreeRegressor(random_state=1)

In [31]:
dt_model.fit(x_train, y_train)

DecisionTreeRegressor(random_state=1)

#### Model Validation

In [32]:
from sklearn.metrics import mean_absolute_error

In [34]:
predicted_ages = dt_model.predict(x_test)
mean_absolute_error(y_test, predicted_ages)

2.1251993620414673

#### Compare Different Tree Sizes

In [35]:
def get_mae(max_leaf_nodes, x_train, x_test, y_train, y_test):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(x_train, y_train)
    preds_val = model.predict(x_test)
    mae = mean_absolute_error(y_test, preds_val)
    return(mae)

In [38]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
ans = 0
res_min = 10000000
for max_leaf_nodes in candidate_max_leaf_nodes:
    res = get_mae(max_leaf_nodes, x_train, x_test, y_train, y_test)
    print(res)
    if res_min > res:
        res_min = res
        ans = max_leaf_nodes

print("best_tree_size =", ans)

1.7080663871881563
1.5789745300298699
1.5929975410350548
1.692747621590008
1.8132810801009511
1.9587343437733253
best_tree_size = 25


In [39]:
my_dt_model = DecisionTreeRegressor(max_leaf_nodes=250, random_state=0)
my_dt_model.fit(x_train, y_train)
score = my_dt_model.score(x_test, y_test)
print('score:', score)

score: 0.09106368668423914


#### Random Forests

In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
rf_model = RandomForestRegressor(random_state=1)

In [42]:
rf_model.fit(x_train, y_train.ravel())

RandomForestRegressor(random_state=1)

In [43]:
# rf_val_mae = mean_absolute_error(rf_model.predict(val_X), val_y)

### 딥러닝

In [44]:
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential

In [84]:
model = Sequential()
model.add(Dense(512, input_dim=7))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(Dense(1))
model.add(Activation('relu'))
model.compile('adam', 'mean_squared_error', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 512)               4096      
_________________________________________________________________
activation_9 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_10 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                

#### 모델 fitting

In [85]:
data

Unnamed: 0,성별,길이,지름,높이,총무게,속살무게,내장무게,나이
0,1,0.513514,0.050832,0.278689,0.181335,0.150303,0.132324,15
1,1,0.371622,0.032348,0.262295,0.079157,0.066241,0.063199,7
2,0.5,0.614865,0.060998,0.409836,0.239065,0.171822,0.185648,9
3,1,0.493243,0.050832,0.377049,0.182044,0.144250,0.149440,10
4,0,0.344595,0.030499,0.229508,0.071897,0.059516,0.051350,7
...,...,...,...,...,...,...,...,...
4172,0.5,0.662162,0.066543,0.508197,0.313441,0.248151,0.314022,11
4173,1,0.695946,0.064695,0.409836,0.341420,0.294553,0.281764,10
4174,1,0.709459,0.071165,0.639344,0.415796,0.352724,0.377880,9
4175,0.5,0.743243,0.073013,0.459016,0.386931,0.356422,0.342989,10


In [86]:
x_train

array([[1.        , 0.5135135 , 0.05083179, ..., 0.18133523, 0.15030262,
        0.13232389],
       [1.        , 0.3716216 , 0.0323475 , ..., 0.07915708, 0.06624075,
        0.06319948],
       [0.5       , 0.6148649 , 0.06099815, ..., 0.23906499, 0.17182246,
        0.18564846],
       ...,
       [1.        , 0.7094595 , 0.06654344, ..., 0.47387993, 0.4142569 ,
        0.4279131 ],
       [0.5       , 0.7094595 , 0.06654344, ..., 0.34018064, 0.29354405,
        0.29229757],
       [1.        , 0.7094595 , 0.06931608, ..., 0.3702851 , 0.31203765,
        0.30809742]], dtype=float32)

In [87]:
y_train

array([[15.],
       [ 7.],
       [ 9.],
       ...,
       [10.],
       [ 9.],
       [11.]], dtype=float32)

In [88]:
x_train = np.asarray(x_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

In [89]:
x_train

array([[1.        , 0.5135135 , 0.05083179, ..., 0.18133523, 0.15030262,
        0.13232389],
       [1.        , 0.3716216 , 0.0323475 , ..., 0.07915708, 0.06624075,
        0.06319948],
       [0.5       , 0.6148649 , 0.06099815, ..., 0.23906499, 0.17182246,
        0.18564846],
       ...,
       [1.        , 0.7094595 , 0.06654344, ..., 0.47387993, 0.4142569 ,
        0.4279131 ],
       [0.5       , 0.7094595 , 0.06654344, ..., 0.34018064, 0.29354405,
        0.29229757],
       [1.        , 0.7094595 , 0.06931608, ..., 0.3702851 , 0.31203765,
        0.30809742]], dtype=float32)

In [90]:
y_train

array([[15.],
       [ 7.],
       [ 9.],
       ...,
       [10.],
       [ 9.],
       [11.]], dtype=float32)

In [91]:
model.fit(x_train, y_train, epochs=30, batch_size=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x23233ed7a60>

In [92]:
x_test = np.asarray(x_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [100]:
loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
print('loss:', loss)
print('accuracy:', accuracy)

40/40 - 0s - loss: 3.8074 - accuracy: 0.0000e+00
loss: 3.8073904514312744
accuracy: 0.0


In [101]:
model.predict(x_test)

array([[11.645298],
       [12.714873],
       [10.036525],
       ...,
       [11.231854],
       [ 9.3495  ],
       [10.592229]], dtype=float32)

In [102]:
y_test

array([[13.],
       [ 9.],
       [10.],
       ...,
       [ 9.],
       [10.],
       [12.]], dtype=float32)