In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 1) 載入資料集

In [None]:
# Read the training CSV and discard the 'Id' column in one chained expression.
df_train = pd.read_csv('data/Train.csv').drop(columns=['Id'])

## 2) 檢查缺失值
使用 numpy 所提供的函式來檢查是否有 NA 缺失值；若有缺失值，可使用 dropna() 將其移除。此做法適用於只有少量缺失值的情況；若缺失值很多，或資料量本身就很少，建議改以機器學習的方法預測並填補缺失值。

```python
# 移除缺失值
df_train = df_train.dropna()
```

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Count the NaN entries across the whole training frame and report the total.
nan_total = np.count_nonzero(np.isnan(df_train))
print("Before data clean(NAN mount):", nan_total)

In [None]:
# Collect the columns that hold a single distinct value (constant features).
unique_col = [
    col for col in df_train.columns
    if len(np.unique(df_train[col])) == 1
]

In [None]:
# Display the names of the single-valued columns.
unique_col

In [None]:
# Summary statistics restricted to the single-valued columns.
df_train.describe()[unique_col]

In [None]:
# Remove the single-valued columns. `errors='ignore'` keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df_train = df_train.drop(columns=unique_col, errors='ignore')

## 3) 資料前處理

#### **特徵標準化**
通常有兩種標準化的方法：
- min max normalization：
    - 會將特徵數據按比例縮放到 0 到 1 的區間，（或是 -1 到 1）。
- standard deviation normalization：
    - 會將所有特徵數據縮放成平均為 0、標準差為 1。
    
#### **特徵組合**
特徵需要適當地增加和減少，以提升精確度並減少計算時間。
- 增加特徵：特徵組合 (Feature Combination)、群聚編碼 (GroupBy Encoding)、產生合成樣本 (Oversampling)
- 減少特徵：特徵篩選 (Feature Selection)、剔除一些樣本 (Undersampling)

<img src="https://drive.google.com/uc?export=download&id=10KIP1EWz3UUyoATe7GcNWeYm1nc7EXgU" width=800>

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
def data_preprocessing(df_input, train=True, sc=None):
    """Standardize features, fitting the scaler only in training mode.

    Parameters
    ----------
    df_input : pandas.DataFrame
        In training mode every column except the last one is scaled (the
        last column is left out). In inference mode all columns are passed
        to the scaler unchanged.
    train : bool, default True
        True  -> fit a new ``StandardScaler`` on ``df_input`` and transform it.
        False -> transform ``df_input`` with the already-fitted ``sc``.
    sc : fitted scaler or None
        Required when ``train`` is False; ignored (replaced) when ``train``
        is True.

    Returns
    -------
    tuple
        ``(scaled_values, sc)`` - the scaler's output and the scaler that
        produced it.

    Raises
    ------
    ValueError
        If ``train`` is False and no scaler is supplied.
    """
    if train:
        # A fresh scaler is always fitted here; swap in MinMaxScaler() for
        # min-max scaling instead of standardization.
        sc = StandardScaler()
        # Skip the last column, which is not treated as a feature.
        scaled = sc.fit_transform(df_input.iloc[:, :-1])
    else:
        if sc is None:
            raise ValueError(
                "data_preprocessing(train=False) requires the fitted scaler "
                "returned by the training call (pass it as `sc`)."
            )
        scaled = sc.transform(df_input)
    return scaled, sc

In [None]:
# Fit the scaler on the training frame; keep it so the test set can be scaled the same way.
X, train_sc = data_preprocessing(df_train)

In [None]:
# Dimensions of the standardized feature array.
X.shape

In [None]:
# Per-feature means learned by the fitted scaler.
train_sc.mean_

In [None]:
# Per-feature variances learned by the fitted scaler.
train_sc.var_

## 3) One hot encoding
對 `Cover_Type` 輸出欄位的資料做 one-hot encoding，使用 Keras 提供的工具函式 to_categorical 將每筆資料的輸出值 y 轉換成一個向量。

In [None]:
# Pull the target column out of the frame as a NumPy array.
y = df_train['Cover_Type'].to_numpy()

In [None]:
# Shape of the target array.
y.shape

In [None]:
# Distinct values present in the target array.
np.unique(y)

In [None]:
from tensorflow.keras.utils import to_categorical

# Build the one-hot targets straight from the label column rather than from `y`
# itself, so re-running this cell cannot one-hot encode an already-encoded array.
# Subtracting 1 shifts the labels down by one before encoding
# (assumes Cover_Type labels start at 1 - TODO confirm against the data).
y = to_categorical(df_train['Cover_Type'].values - 1)

In [None]:
# Shape of the target array after one-hot encoding.
y.shape

In [None]:
# Distinct rows of the encoded target (uniqueness is computed along axis 0).
np.unique(y, axis=0)

## 4) 切割訓練集與驗證集

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for validation. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
    stratify=y,
)

In [None]:
# Class frequencies over all samples; argmax turns each one-hot row back into
# its class index.
unique, counts = np.unique(y.argmax(-1), return_counts=True)
plt.bar(unique, counts)
# Title and axis labels let the figure stand on its own next to the
# training-split chart.
plt.title('Class distribution: all samples')
plt.xlabel('class index')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Class frequencies in the training split; argmax turns each one-hot row back
# into its class index.
unique, counts = np.unique(y_train.argmax(-1), return_counts=True)
plt.bar(unique, counts)
# Title and axis labels identify which split this chart describes.
plt.title('Class distribution: training split')
plt.xlabel('class index')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Print the feature/target array shapes of the training and validation splits.
print('訓練資料: ', X_train.shape, '\t訓練目標: ', y_train.shape)
print('驗證資料: ', X_valid.shape, '\t驗證目標: ', y_valid.shape)

## 5) 建立網路模型

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras import Sequential, callbacks
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam

# Record the TensorFlow version used for this run.
print(tf.__version__)

In [None]:
# This example builds the neural network with the Keras Sequential API.
def build_model(X, n_classes=None):
    """Build an (uncompiled) fully connected softmax classifier.

    Architecture: three Dense(32) layers, each followed by a sigmoid
    activation, then a Dense softmax output layer.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Only ``X.shape[1:]`` is used, as the input shape of the first layer.
    n_classes : int or None, default None
        Number of output units. When None, falls back to the module-level
        ``y_train.shape[1]`` (the original behavior), so existing
        ``build_model(X)`` calls keep working.

    Returns
    -------
    tensorflow.keras.Sequential
        The assembled model; call ``compile`` before training.
    """
    if n_classes is None:
        # Backward-compatible default: size the output layer from the global
        # one-hot training targets.
        n_classes = y_train.shape[1]

    # Fixed seed so weight initialization is repeatable between runs.
    tf.random.set_seed(17)
    model = Sequential()
    model.add(Dense(32, input_shape=X.shape[1:]))
    model.add(Activation('sigmoid'))
    model.add(Dense(32))
    model.add(Activation('sigmoid'))
    model.add(Dense(32))
    model.add(Activation('sigmoid'))
    # Pass the activation by name via the keyword argument instead of handing
    # an Activation layer instance to Dense's positional `activation` slot.
    model.add(Dense(n_classes, activation='softmax'))
    return model

In [None]:
# Clear the Keras backend session, then build a new model and print its layer summary.
tf.keras.backend.clear_session()
model = build_model(X_train)
model.summary()

In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss (the
# targets are one-hot encoded) and accuracy tracked under the key 'acc'.
optim = Adam(learning_rate=0.001)
model.compile(
    optimizer=optim,
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# rlp = callbacks.ReduceLROnPlateau(
#     monitor='val_loss',  # 是否進步的指標
#     factor=0.1,  # 以 factor 的倍數調整 learning rate
#     patience=5,  # 經過 patience 次沒有進步調整 learning rate
#     verbose=2,
#     mode='min')

In [None]:
# Training hyper-parameters.
batch_size = 64
epochs = 20

# Train the model, evaluating on the validation split after every epoch.
# (To enable the commented-out ReduceLROnPlateau callback, add callbacks=[rlp].)
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_valid, y_valid),
    shuffle=True,
)

In [None]:
# a_old = model.layers[0].trainable_weights[0].numpy()

In [None]:
# a_new = model.layers[0].trainable_weights[0].numpy()

# (a_new-a_old).mean()

## 6) 觀察訓練結果

In [None]:
# Grab the metrics recorded during training and list which keys are available.
history_dict = history.history
history_dict.keys()

In [None]:
import matplotlib.pyplot as plt

# Training/validation accuracy and loss series taken from the fit history.
acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
# 1-based positions used as the x axis of the curves.
epochs_ = range(1,len(acc)+1)

In [None]:
# Plot the loss curves (left) and accuracy curves (right) side by side, using
# the explicit Axes interface so each panel is configured unambiguously.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(15, 4))

ax_loss.plot(epochs_, loss, label='training loss')
ax_loss.plot(epochs_, val_loss, label='val loss')  # legend label typo ('val los') fixed
ax_loss.set_title('training and val loss')
ax_loss.set_xlabel('epochs')
ax_loss.set_ylabel('loss')
ax_loss.legend()

ax_acc.plot(epochs_, acc, label='train accuracy')
ax_acc.plot(epochs_, val_acc, label='val accuracy')
ax_acc.set_title('train and val acc')
ax_acc.set_xlabel('epochs')
ax_acc.set_ylabel('acc')
# Keep the 0.5 floor when every value is above it, but lower the floor when an
# accuracy falls below 0.5 so no part of a curve is clipped out of view.
ax_acc.set_ylim(min(0.5, min(acc), min(val_acc)), 1)
ax_acc.legend()

plt.show()

## 觀察訓練集上的成效

In [None]:
# Evaluate on the training split. Dedicated names are used so the `loss` / `acc`
# lists read by the curve-plotting cell are not overwritten with scalars, which
# would break that cell when it is re-run.
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
print(f'loss:{train_loss}, accuracy:{train_acc}')

## 觀察驗證集上的成效

In [None]:
from sklearn.metrics import accuracy_score

# Reduce the one-hot targets and the model outputs to class indices, then
# compute the validation accuracy.
valid_output = model.predict(X_valid)
label = y_valid.argmax(axis=1)
pred = valid_output.argmax(axis=1)
accuracy_score(label, pred)

In [None]:
# Predicted class indices for the first ten validation samples.
model.predict(X_valid)[:10].argmax(-1)

In [None]:
from sklearn.metrics import classification_report
# Print the classification report for the validation labels vs. predictions.
print(classification_report(label, pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Print the confusion matrix for the validation labels vs. predictions.
print(confusion_matrix(label, pred))

## 預測 test.csv

In [None]:
# Load the test set and remove the same columns that were removed from the
# training frame (the single-valued columns, then 'Id').
df_test = (
    pd.read_csv('data/Test.csv')
    .drop(unique_col, axis=1)
    .drop(labels=['Id'], axis=1)
)
# Transform with the scaler fitted on the training data; it is not refitted.
test, _ = data_preprocessing(df_test, train=False, sc=train_sc)

In [None]:
# Model outputs for the preprocessed test set.
pred = model.predict(test)

In [None]:
# argmax yields 0-based class indices because the model was trained on
# `Cover_Type - 1`; add 1 to map the predictions back to the original label
# values before they are written to the submission file.
predict_class = np.argmax(pred, axis=1) + 1

In [None]:
# Re-read the test file to recover the 'Id' column that was dropped before scaling.
df_test = pd.read_csv('data/Test.csv')
# .copy() makes `ans` an independent frame, so adding a column to it later does
# not trigger pandas' SettingWithCopyWarning.
ans = df_test[['Id']].copy()

In [None]:
# Display the submission frame.
ans

In [None]:
# Attach the predictions as the 'class' column. `assign` returns a new frame,
# which avoids SettingWithCopyWarning on a frame sliced from df_test and keeps
# this cell safe to re-run. ('class' is a Python keyword, hence the dict unpacking.)
ans = ans.assign(**{'class': predict_class})

In [None]:
# Write the submission file without the index column.
ans.to_csv('ans.csv', index=False)