## 資料來源
Skin Cancer MNIST: HAM10000 https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000

### 資料準備

In [None]:
# keras.utils: used for one-hot encoding
# sklearn.model_selection: splits the data into training and test sets
# os: used to create and delete files
# PIL: (image-processing library) used to load images
# seed: fixes the random seed so randomly generated data is the same on every run.
#       The number may be changed to your own student ID (or any other number).
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import os
from PIL import Image
np.random.seed(152273)

In [None]:
# Abbreviation -> full name for the seven skin-lesion classes.
# Insertion order is kept exactly as listed here.
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

In [None]:
# Integer category codes of the class abbreviations (the dict keys); codes follow the sorted key order.
pd.Categorical(list(lesion_type_dict.keys())).codes

array([5, 4, 2, 1, 0, 6, 3], dtype=int8)

In [None]:
!pip uninstall gdown -y && pip install gdown
!gdown -V

Found existing installation: gdown 4.6.0
Uninstalling gdown-4.6.0:
  Successfully uninstalled gdown-4.6.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Using cached gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
Successfully installed gdown-4.6.0
gdown 4.6.0 at /usr/local/lib/python3.8/dist-packages


In [None]:
# Download from Google Drive
# The file is saved as project03.zip in the current working directory;
# gdown.download returns the output path, which the cell displays.
import gdown
url = 'https://drive.google.com/uc?id=1kklF0GDZ-4Vh52MIdTexky6Bqzek7S-c'
output = 'project03.zip'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1kklF0GDZ-4Vh52MIdTexky6Bqzek7S-c
To: /content/project03.zip
100%|██████████| 26.6M/26.6M [00:00<00:00, 86.7MB/s]


'project03.zip'

In [None]:
!unzip project03.zip

Archive:  project03.zip
replace project3_test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: project3_test.csv       
replace project3_train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: project3_train.csv      


In [None]:
# Load the image data: 28*28*3 pixel-value columns (pixel0000-pixel2351) + 1 class column named label
load_img = pd.read_csv('project3_train.csv')

In [None]:
# Show the first rows to inspect the column names (pixel columns and label)
load_img.head()

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel2343,pixel2344,pixel2345,pixel2346,pixel2347,pixel2348,pixel2349,pixel2350,pixel2351,label
0,30,15,20,35,19,27,94,69,73,152,...,22,9,13,11,2,4,9,1,0,0
1,1,0,0,7,1,5,103,56,68,192,...,127,72,74,24,5,6,0,1,1,0
2,129,91,92,182,145,145,205,169,168,189,...,64,39,41,5,2,6,2,2,1,0
3,9,8,8,11,10,10,10,9,9,24,...,11,9,10,8,7,7,5,5,5,0
4,26,13,19,25,10,17,24,6,5,23,...,22,6,9,27,9,10,23,5,6,0


In [None]:
# Check the size of the loaded table: rows = number of images, columns = pixel values + label
load_img.shape

(8008, 2353)

In [None]:
# Preview the pixel values: every column except the last one
load_img.iloc[: , :-1].values

array([[ 30,  15,  20, ...,   9,   1,   0],
       [  1,   0,   0, ...,   0,   1,   1],
       [129,  91,  92, ...,   2,   2,   1],
       ...,
       [127, 101, 108, ..., 121, 108, 125],
       [157,  82,  86, ..., 210, 126, 130],
       [176, 149, 166, ..., 175, 142, 159]])

In [None]:
# Positional slicing with iloc:
# all columns but the last are the pixel features ...
X_img = load_img.iloc[:, :-1].values
# ... and the last column holds the class id of each image.
y_label = load_img.iloc[:, -1].values

In [None]:
# X_img is already a NumPy array (it comes from DataFrame.values), so there is no need to
# round-trip it through Python lists. Reshape each flat 2352-value row directly into
# a 3-D image (height * width * RGB channels).
X_img_train = np.asarray(X_img).reshape(X_img.shape[0], 28, 28, 3)

In [None]:
# Check the number of training images, their size and channel dimension, and the number of labels
print("train data:",'images:',X_img_train.shape," labels:",y_label.shape) 

train data: images: (8008, 28, 28, 3)  labels: (8008,)


In [None]:
# Normalize: divide everything by 255 (image pixel values lie in 0~255)
X_img_train_normalize = X_img_train.astype('float32') / 255.0

In [None]:
# Pass the label column to np_utils.to_categorical() to one-hot encode it (combinations of 0s and 1s)
y_label_train_OneHot = np_utils.to_categorical(y_label)

In [None]:
# Check how many classes the labels contain
# Here there are 8008 samples in total, each encoded as a combination of seven 0/1 values
y_label_train_OneHot.shape

(8008, 7)

### 建立與訓練隨機森林模型

In [None]:
# Scale the flat pixel matrix X_img (one row per image) to float32 values by dividing by 255;
# this flat form is what the random-forest cells below are trained on.
X_img_normalize = X_img.astype('float32') / 255.0

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

# Create the random-forest instance (default hyperparameters)
model_RF = RandomForestClassifier()
# Split the data into a training set and a validation set (test_size = 0.2 holds out 20% for validation)
x_train, x_validate, y_train, y_validate = train_test_split(X_img_normalize, y_label, test_size = 0.2)

In [None]:
# Build your own random forest
# NOTE(review): this cell fits a single decision tree limited to 4 leaf nodes, not a forest,
# and `dt` is not referenced by any later cell shown here.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_leaf_nodes=4,random_state=409570298)
dt.fit(x_train,y_train)

DecisionTreeClassifier(max_leaf_nodes=4, random_state=409570298)

In [None]:
# Train the model on the training data
model_RF.fit(x_train, y_train)
# Predict classes for the training data
predicted = model_RF.predict(x_train)

In [None]:
# Print the feature importances of the fitted forest (the printed label reads "feature importance")
print('特徵重要程度: ',model_RF.feature_importances_)

特徵重要程度:  [0.00379749 0.00035008 0.00046606 ... 0.00144337 0.0003755  0.00043944]


In [None]:
# Fraction of correct predictions
# NOTE: the second line scores the validation split (x_validate / y_validate),
# although its printed label reads "test set".
print('訓練集: ',model_RF.score(x_train,y_train))
print('測試集: ',model_RF.score(x_validate,y_validate))

訓練集:  1.0
測試集:  0.7540574282147315


In [None]:
# Tune the random forest with a grid search
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 5 * 4 * 4 = 80 combinations, each evaluated with 5-fold CV
param_grid = [
    {'n_estimators':[3,10,30,50,100],'max_features':[2,4,6,8],'max_depth' : [1,2,3,4]}
]

forest_clf = RandomForestClassifier(random_state=409570224)
# The target has 7 classes. The plain 'f1' scorer uses average='binary' and raises
# "Target is multiclass but average='binary'" on every fold, which leaves all scores
# undefined and makes the selected "best" estimator meaningless. 'f1_macro' averages the
# per-class F1 scores, so every class counts equally regardless of its frequency.
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           scoring='f1_macro',
                           return_train_score=True)
grid_search.fit(x_train, y_train)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
    return fbeta_score(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/metrics/_classification.py", line 1544, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/metrics/_classification.py", line 1365, in _check_set_wise_labels
    raise ValueError(
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/metrics/_scorer.py", line 216, in __call__


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=409570224),
             param_grid=[{'max_depth': [1, 2, 3, 4],
                          'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30, 50, 100]}],
             return_train_score=True, scoring='f1')

In [None]:
# Show the estimator that the grid search selected as best
grid_search.best_estimator_

RandomForestClassifier(max_depth=1, max_features=2, n_estimators=3,
                       random_state=409570224)

In [None]:
# Fit the selected estimator on the training split.
# NOTE(review): GridSearchCV with its default refit setting presumably has already fitted
# best_estimator_ on x_train / y_train, so this call looks redundant — confirm.
grid_search.best_estimator_.fit(x_train,y_train)

RandomForestClassifier(max_depth=1, max_features=2, n_estimators=3,
                       random_state=409570224)

In [None]:
### Predict the test data with the final model
load_test_img = pd.read_csv('project3_test.csv')
img_test = load_test_img.values

In [None]:
# Scale the test pixel values the same way as the training features (float32, divided by 255)
x_test_normalize = img_test.astype('float32') / 255.0

In [None]:
# Assemble the submission table in one step: zero-padded 4-digit row ids plus the class
# predicted by model_RF (the default random forest, not grid_search.best_estimator_).
df_submit = pd.DataFrame({
    'Id': [f'{i:04d}' for i in range(len(x_test_normalize))],
    'Label': model_RF.predict(x_test_normalize),
})

In [None]:
# Write the random-forest submission file without the row index
df_submit.to_csv('submission_RF(1).csv', index=False)

### 建立與訓練CNN模型

In [None]:
# Import Sequential and the layer classes from keras (Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, ZeroPadding2D)
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD

In [None]:
# Build the first layer of the CNN model:
# a 32-filter 3x3 convolution over 28x28 RGB input, ReLU activation, 'same' padding
model = Sequential()
model.add(Conv2D(filters=32,kernel_size=(3, 3),input_shape=(28,28,3),activation='relu', padding='same'))

In [None]:
# Design the convolution and pooling layers, the flatten layer and the fully connected layers
def Net_model(nb_classes, lr = 0.001,decay=1e-6,momentum=0.9):
    """Build and compile a small CNN for 28x28 RGB images.

    A new Sequential model is created on every call, so the function is idempotent:
    re-running the cell no longer keeps appending layers to a module-level `model`.

    Args:
        nb_classes: number of output classes (width of the final softmax layer).
        lr: SGD learning rate.
        decay: learning-rate decay passed to SGD.
        momentum: SGD momentum (Nesterov momentum is enabled).

    Returns:
        The compiled Sequential model; its loss is categorical cross-entropy,
        so training targets must be one-hot encoded.
    """
    net = Sequential()

    # Block 1: 5x5 convolution (no padding) -> tanh -> 2x2 max pooling
    net.add(Conv2D(filters = 10, kernel_size = (5, 5),
                   padding = 'valid',
                   input_shape = (28, 28, 3)))
    net.add(Activation('tanh'))
    net.add(MaxPooling2D(pool_size = (2, 2)))

    # Block 2: 10x10 convolution -> tanh -> 2x2 max pooling -> dropout
    net.add(Conv2D(filters = 20, kernel_size = (10, 10)))
    net.add(Activation('tanh'))
    net.add(MaxPooling2D(pool_size = (2, 2)))
    net.add(Dropout(0.25))

    # Classifier head: flatten -> 1000-unit dense layer -> dropout -> softmax over the classes
    net.add(Flatten())
    net.add(Dense(1000))
    net.add(Activation('tanh'))
    net.add(Dropout(0.5))
    net.add(Dense(nb_classes))
    net.add(Activation('softmax'))

    # `learning_rate` is the supported argument name; `lr` triggers a deprecation warning.
    # NOTE(review): `decay` is accepted by the optimizer version this notebook was run with;
    # newer Keras optimizers may reject it — confirm against the installed version.
    sgd = SGD(learning_rate = lr, decay = decay, momentum = momentum, nesterov = True)
    net.compile(loss='categorical_crossentropy', optimizer = sgd)
    return net

# The labels one-hot encode to 7 columns (7 lesion types), so the softmax layer needs 7 outputs
nb_classes = 7
nb_epoch = 30
nb_step = 6
batch_size = 64

In [None]:
# Shape of the training features from the train/validation split: (samples, flattened pixel values)
x_train.shape

(6406, 2352)

In [None]:
# Restore each flat 2352-value row to a 28x28 RGB image (height, width, channels).
# This is the layout the pixels were stored in and the input_shape=(28, 28, 3) the CNN
# layers declare; a (147, 16, 1) layout scrambles the images and is rejected by the model.
X_train=x_train.reshape(x_train.shape[0], 28, 28, 3)

In [None]:
# Confirm the shape of the reshaped training array
X_train.shape

(6406, 147, 16, 1)

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# The network is compiled with categorical_crossentropy, which expects one-hot targets,
# while y_train holds integer class ids; encode them to nb_classes columns first.
y_train_onehot = np_utils.to_categorical(y_train, num_classes=nb_classes)

dataGenerator = ImageDataGenerator()
dataGenerator.fit(X_train)
# Generator that yields shuffled (images, one-hot labels) batches of batch_size samples
data_generator = dataGenerator.flow(X_train, y_train_onehot, batch_size=batch_size, shuffle=True)

model = Net_model(nb_classes, lr = 0.001)  # build and compile the network

# Model.fit accepts generators directly; fit_generator is deprecated. Shuffling is already
# done by the generator, so no `shuffle` argument is passed here.
history = model.fit(data_generator, epochs = nb_epoch, steps_per_epoch = nb_step)

Epoch 1/30


  super(SGD, self).__init__(name, **kwargs)
  history = model.fit_generator(data_generator, epochs = nb_epoch, steps_per_epoch = nb_step, shuffle = True)


ValueError: ignored

In [None]:
# Predict the test data with the final model
load_test_img = pd.read_csv('project3_test.csv')
img_test = load_test_img.values

In [None]:
# Reshape each flat test row into a 28x28 image with 3 channels, then scale to float32 by dividing by 255
x_test=img_test.reshape(img_test.shape[0],28,28,3)
x_test_normalize = x_test.astype('float32') / 255.0

In [None]:
# Assemble the submission table in one step: zero-padded 4-digit row ids plus, for each
# test image, the index of the highest-scoring class in the model's output.
df_submit = pd.DataFrame({
    'Id': [f'{i:04d}' for i in range(len(x_test_normalize))],
    'Label': np.argmax(model.predict(x_test_normalize), axis=-1),
})

In [None]:
# Write the CNN submission file without the row index
df_submit.to_csv('submission_CNN.csv', index=False)