In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OneHotEncoder

# 可視化ツールのimport
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import tensorflow as tf
from keras import models, layers, callbacks, initializers, regularizers
from keras.layers import ELU, Dense, Dropout, Convolution2D, Input, GlobalAveragePooling2D
from keras.utils import plot_model
from keras.models import Model
from keras.utils import to_categorical

from sklearn.preprocessing import StandardScaler

import keras.backend as K
import optuna

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

temp=dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), 
                           height=500, width=1000))
#色コード
class CLR:
    map_1 = 'GnBu'
    blue_thin = '#88CAC9'
    blue_dark = '#38A6A5'
    pink_thin = '#eb8792'
    pink_dark = '#ec616e'
    orange = '#f8b287'
    orange_dark = '#ee572a'

In [21]:
df_train = pd.read_csv('./kaggle/input/titanic/train.csv')
df_test = pd.read_csv('./kaggle/input/titanic/test.csv')
df_example = pd.read_csv('./kaggle/input/titanic/gender_submission.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


<div style="padding:20px;color:white;margin:0;font-size:180%;text-align:center;display:fill;border-radius:5px;background-color:#38A6A5;overflow:hidden;font-weight:500">データの前処理</div>

In [22]:
def preprocess(df:pd.core.frame.DataFrame, istrain:bool) -> pd.core.frame.DataFrame:
    droped_columns = ['PassengerId','Name','Ticket',"Cabin"]
    df = df.drop(columns=droped_columns)
    df = df.fillna(df.mode().iloc[0])
    df = pd.get_dummies(df)
    if istrain == True:
        df.drop(columns='Survived', inplace=True)

    scaler=StandardScaler()
    df_scaled =pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
    return df_scaled

df_preprocessed = preprocess(df_train, True)
df_preprocessed_test = preprocess(df_test, False)
df_preprocessed.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.827377,-0.497793,0.432793,-0.473674,-0.502445,-0.737695,0.737695,-0.482043,-0.307562,0.615838
1,-1.566107,0.715048,0.432793,-0.473674,0.786845,1.355574,-1.355574,2.074505,-0.307562,-1.623803
2,0.827377,-0.194583,-0.474545,-0.473674,-0.488854,1.355574,-1.355574,-0.482043,-0.307562,0.615838
3,-1.566107,0.48764,0.432793,-0.473674,0.42073,1.355574,-1.355574,-0.482043,-0.307562,0.615838
4,0.827377,0.48764,-0.474545,-0.473674,-0.486337,-0.737695,0.737695,-0.482043,-0.307562,0.615838


In [23]:
# 欠損値がないか確認
df_preprocessed.isnull().sum(axis=0)#データの欠測値を一覧表示

Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
df_train_x = df_preprocessed
df_train_y = df_train['Survived']
x_train, x_val, y_train, y_val = train_test_split(df_train_x, df_train_y, test_size=0.15, random_state=42)

X_test = df_preprocessed_test

In [25]:
x_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
599,-1.566107,1.548876,0.432793,-0.473674,0.49783,-0.737695,0.737695,2.074505,-0.307562,-1.623803
830,0.827377,-1.028411,0.432793,-0.473674,-0.357391,1.355574,-1.355574,2.074505,-0.307562,-1.623803
306,-1.566107,-0.346188,-0.474545,-0.473674,1.584179,1.355574,-1.355574,2.074505,-0.307562,-1.623803
231,0.827377,0.032825,-0.474545,-0.473674,-0.491874,-0.737695,0.737695,-0.482043,-0.307562,0.615838
845,0.827377,1.018258,-0.474545,-0.473674,-0.496405,-0.737695,0.737695,-0.482043,-0.307562,0.615838


In [26]:
def create_model(num_layer, num_filters, num_activation, dropout_rate):

    model = models.Sequential()

    for i in range(num_layer):
        model.add(Dense(num_filters[i], activation = num_activation[i], input_shape = x_train.shape[1:], kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation = "sigmoid", name ="output_layer"))

    return model

In [29]:
def objective(trial):
    num_layer = trial.suggest_int('num_layer', 1, 20) # 追加する層を選ぶ
    activation = [trial.suggest_categorical('activation', ['relu', 'sigmoid']) for i in range(num_layer)] # 活性化関数
    num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 256, 16)) for i in range(num_layer)]
    dropout_rate = [float(trial.suggest_uniform('dropout_rate', 0, 1)) for i in range(num_layer)] # ドロップアウト率
    optimizer = trial.suggest_categorical('optimizer', ['sgd', 'adam', 'rmsprop']) # 最適化アルゴリズム
    
    # 早期終了コールバックの設定
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=0, restore_best_weights=True)
    # 学習率スケジューリングコールバックの設定
    lr_scheduler = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=0)

    # 学習モデルの構築と学習の開始
    model = create_model(num_layer, num_filters, activation, dropout_rate)
    model.compile(optimizer=optimizer,loss='binary_crossentropy',)
    # モデルの訓練
    history = model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        batch_size=12,
        epochs=50,
        callbacks=[early_stopping, lr_scheduler],
        verbose=0
    )
    
    print(history.history)
    
    # 学習モデルの保存
    model_json = model.to_json()
    with open('keras_model.json', 'w') as f_model:
        f_model.write(model_json)
    model.save_weights('keras_model.hdf5')

    # 最小値探索なので
    return -np.amax(history.history['val_loss'])

In [30]:
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=2)

[I 2023-09-05 17:01:56,511] A new study created in memory with name: no-name-66999520-53f0-48ca-9d97-66bf018f1f9d
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 256, 16)) for i in range(num_layer)]
  dropout_rate = [float(trial.suggest_uniform('dropout_rate', 0, 1)) for i in range(num_layer)] # ドロップアウト率
[W 2023-09-05 17:01:56,870] Trial 0 failed with parameters: {'num_layer': 8, 'activation': 'relu', 'num_filter_0': 160.0, 'num_filter_1': 48.0, 'num_filter_2': 48.0, 'num_filter_3': 16.0, 'num_filter_4': 224.0, 'num_filter_5': 160.0, 'num_filter_6': 192.0, 'num_filter_7': 16.0, 'dropout_rate': 0.9699098521619943, 'optimizer': 'sgd'} because of the following error: ValueError('in user code:\n\n    File "c:\\Users\\Intern\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\keras\\src\\engine\\training.py", line 1338, in train_function  *\n        return step_function(self, iterator)\n    File "c:\\Users\\Intern\\AppData\\Local\\Programs\\Python

ValueError: in user code:

    File "c:\Users\Intern\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Intern\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Intern\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Intern\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Intern\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer 'dropout_1699' (type Dropout).
    
    `rate` must be a scalar or scalar tensor. Received: rate=ListWrapper([0.9699098521619943, 0.9699098521619943, 0.9699098521619943, 0.9699098521619943, 0.9699098521619943, 0.9699098521619943, 0.9699098521619943, 0.9699098521619943])
    
    Call arguments received by layer 'dropout_1699' (type Dropout):
      • inputs=tf.Tensor(shape=(None, 160), dtype=float32)
      • training=True


In [None]:
print(study.best_params)
print(study)

{'num_layer': 18, 'activation': 'relu', 'num_filter_0': 176.0, 'num_filter_1': 224.0, 'num_filter_2': 224.0, 'num_filter_3': 176.0, 'num_filter_4': 144.0, 'num_filter_5': 144.0, 'num_filter_6': 128.0, 'num_filter_7': 128.0, 'num_filter_8': 224.0, 'num_filter_9': 224.0, 'num_filter_10': 96.0, 'num_filter_11': 96.0, 'num_filter_12': 128.0, 'num_filter_13': 144.0, 'num_filter_14': 80.0, 'num_filter_15': 144.0, 'num_filter_16': 176.0, 'num_filter_17': 224.0, 'dropout_rate': 0.4395547121319182, 'optimizer': 'sgd'}
<optuna.study.study.Study object at 0x000001723E580850>


In [None]:
import matplotlib.pyplot as plt
# lossの変化を取得

print(history)

# train_loss = history['loss']
# val_loss = history['val_loss']

# # グラフのプロット
# plt.figure(figsize=(8, 6))
# plt.plot(range(1, len(train_loss) + 1), train_loss, label='Training Loss')
# plt.plot(range(1, len(val_loss) + 1), val_loss, label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training and Validation Loss')
# plt.legend()
# plt.grid(True)
# plt.show()

NameError: name 'history' is not defined

In [None]:
df_processed_test =preprocess(df_test)
df_processed_test =pd.DataFrame(scaler.fit_transform(df_processed_test),columns=df_processed_test.columns)
df_processed_test.head()

TypeError: preprocess() missing 1 required positional argument: 'istrain'

In [None]:
y_pred = model.predict(df_processed_test)

print(y_pred)

df_submit = pd.DataFrame(predictions,columns=['Survived'])#新たな提出用dfを定義
df_submit.Survived = round(df_submit.Survived)
df_submit

NameError: name 'df_processed_test' is not defined

In [None]:
df_answer = pd.read_csv('./kaggle/input/titanic/answer.csv')

Y_test = df_answer['Survived']

Y_test.head()

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(Y_test, y_pred)
auc_score = roc_auc_score(Y_test, y_pred)
plt.plot(fpr, tpr, label='AUC = %.3f' % (auc_score))
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)


# accuracy: 0.8208955223880597
# f1_score: 0.7446808510638298


NameError: name 'y_pred' is not defined