In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.model_selection import cross_val_score

# データ読み込み
train_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/Beginner/data/train.csv')
test_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/Beginner/data/test.csv')

# One-Hot Encoding for categorical variables like 'Gender'
train_data = pd.get_dummies(train_data, columns=['Gender'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Gender'], drop_first=True)

# 特徴量とターゲット変数の設定
X = train_data[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio', 'Gender_Male']]
y = train_data['disease']

# データを訓練セットとテストセットに分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ハイパーパラメータグリッドの設定
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],   # 正則化の強さ
    'penalty': ['l1', 'l2'],        # 正則化タイプ
    'solver': ['liblinear']          # ソルバー
}

# グリッドサーチの実行
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# 最適なパラメータを表示
print(f'Best parameters: {grid.best_params_}')

# 最適なモデルで再度学習
model = grid.best_estimator_
model.fit(X_train, y_train)

# 検証データでの予測確率
y_pred_prob = model.predict_proba(X_val)[:, 1]

# AUCスコアの計算
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f'AUC: {auc_score}')

# ROC曲線に基づく最適な閾値の算出
fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print(f'最適な閾値: {optimal_threshold}')

# テストデータでの予測確率
test_features = test_data[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio', 'Gender_Male']]
test_pred_prob = model.predict_proba(test_features)[:, 1]

# 最適な閾値を用いて予測結果を0か1に変換
test_pred = (test_pred_prob >= optimal_threshold).astype(int)

# 連番IDの生成
test_data['id'] = range(0, len(test_data))

# 予測結果を保存
output = pd.DataFrame({
    'id': test_data['id'],    # 生成した連番ID
    'prediction': test_pred  # 閾値に基づいた二値分類の予測結果
})

# CSVファイルとして保存（ヘッダーなし）
output.to_csv('/home-asustor/teramoto/SIGNATE/Beginner/data/03predictions.csv', index=False, header=False)
print("予測結果が保存されました。")

# 検証データで精度の計算
y_val_pred = (y_pred_prob >= optimal_threshold).astype(int)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {accuracy}')

# 交差検証の実行
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f'Cross-validated accuracy: {cross_val_scores.mean()}')


Best parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
AUC: 0.9336684820555787
最適な閾値: 0.3072823354387245
予測結果が保存されました。
Accuracy: 0.8823529411764706
Cross-validated accuracy: 0.8376470588235294


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve
from sklearn.model_selection import cross_val_score

# データ読み込み
train_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/Beginner/data/train.csv')
test_data = pd.read_csv('/home-asustor/teramoto/SIGNATE/Beginner/data/test.csv')

# One-Hot Encoding for categorical variables like 'Gender'
train_data = pd.get_dummies(train_data, columns=['Gender'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Gender'], drop_first=True)

# 特徴量とターゲット変数の設定
X = train_data[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio', 'Gender_Male']]
y = train_data['disease']

# データを訓練セットとテストセットに分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoostモデルの構築
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# ハイパーパラメータの設定
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0]
}

# グリッドサーチの実行
grid = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# 最適なパラメータを表示
print(f'Best parameters: {grid.best_params_}')

# 最適なモデルで再度学習
model = grid.best_estimator_
model.fit(X_train, y_train)

# 検証データでの予測確率
y_pred_prob = model.predict_proba(X_val)[:, 1]

# AUCスコアの計算
auc_score = roc_auc_score(y_val, y_pred_prob)
print(f'AUC: {auc_score}')

# ROC曲線に基づく最適な閾値の算出
fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print(f'最適な閾値: {optimal_threshold}')

# テストデータでの予測確率
test_features = test_data[['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio', 'Gender_Male']]
test_pred_prob = model.predict_proba(test_features)[:, 1]

# 最適な閾値を用いて予測結果を0か1に変換
test_pred = (test_pred_prob >= optimal_threshold).astype(int)

# 連番IDの生成
test_data['id'] = range(0, len(test_data))

# 予測結果を保存
output = pd.DataFrame({
    'id': test_data['id'],    # 生成した連番ID
    'prediction': test_pred  # 閾値に基づいた二値分類の予測結果
})

# CSVファイルとして保存（ヘッダーなし）
output.to_csv('/home-asustor/teramoto/SIGNATE/Beginner/data/04predictions.csv', index=False, header=False)
print("予測結果が保存されました。")

# 検証データで精度の計算
y_val_pred = (y_pred_prob >= optimal_threshold).astype(int)
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {accuracy}')

# 交差検証の実行
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f'Cross-validated accuracy: {cross_val_scores.mean()}')


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters: {'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
AUC: 0.966624773076386
最適な閾値: 0.0541701465845108
予測結果が保存されました。
Accuracy: 0.9058823529411765


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validated accuracy: 0.8541176470588235


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

