In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# データの読み込み
train = pd.read_csv('C:/SIGNATE/banking_tageting/train.csv')
train.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,39,blue-collar,married,secondary,no,1756,yes,no,cellular,3,apr,939,1,-1,0,unknown,1
1,2,51,entrepreneur,married,primary,no,1443,no,no,cellular,18,feb,172,10,-1,0,unknown,1
2,3,36,management,single,tertiary,no,436,no,no,cellular,13,apr,567,1,595,2,failure,1
3,4,63,retired,married,secondary,no,474,no,no,cellular,25,jan,423,1,-1,0,unknown,1
4,5,31,management,single,tertiary,no,354,no,no,cellular,30,apr,502,1,9,2,success,1


In [13]:
test = pd.read_csv("C:/SIGNATE/banking_tageting/test.csv")
test.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,1,30,management,single,tertiary,no,1028,no,no,cellular,4,feb,1294,2,-1,0,unknown
1,2,39,self-employed,single,tertiary,no,426,no,no,unknown,18,jun,1029,1,-1,0,unknown
2,3,38,technician,single,tertiary,no,-572,yes,yes,unknown,5,jun,26,24,-1,0,unknown
3,4,34,technician,single,secondary,no,-476,yes,no,unknown,27,may,92,4,-1,0,unknown
4,5,37,entrepreneur,married,primary,no,62,no,no,cellular,31,jul,404,2,-1,0,unknown


In [14]:
# データ型の確認
print(train.dtypes)

id            int64
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object


In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
# 欠損値の確認
print(train.isnull().sum())

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [17]:
print(test.isnull().sum())

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64


In [19]:
# 目的変数と説明変数の分離
X = train.drop(["id", "y", "duration"], axis = 1)
y = train["y"]

In [20]:
# カテゴリ変数をダミー変数にする。
from sklearn.preprocessing import LabelEncoder

In [23]:
X_encoded = X.copy()
for col in X_encoded.select_dtypes(include = 'object').columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])

X_encoded.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome
0,39,1,1,1,0,1756,1,0,0,3,0,1,-1,0,3
1,51,2,1,0,0,1443,0,0,0,18,3,10,-1,0,3
2,36,4,2,2,0,436,0,0,0,13,0,1,595,2,0
3,63,5,1,1,0,474,0,0,0,25,4,1,-1,0,3
4,31,4,2,2,0,354,0,0,0,30,0,1,9,2,2


In [24]:
print(X_encoded.dtypes)

age          int64
job          int32
marital      int32
education    int32
default      int32
balance      int64
housing      int32
loan         int32
contact      int32
day          int64
month        int32
campaign     int64
pdays        int64
previous     int64
poutcome     int32
dtype: object


In [32]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X_encoded, y, test_size = 0.2, random_state = 42
)

smote = SMOTE(random_state=42)
X_train_resampling, y_train_resampling = smote.fit_resample(X_train, y_train)

In [33]:
import collections
collections.Counter(y_train_resampling)

Counter({1: 19137, 0: 19137})

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [37]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampling, y_train_resampling)

In [38]:
# 検証データで予測
y_pred = rf_model.predict(X_valid)
y_proba = rf_model.predict_proba(X_valid)[:, 1]

In [39]:
print("ROC-AUC:", roc_auc_score(y_valid, y_proba))
print(classification_report(y_valid, y_pred))

ROC-AUC: 0.7599310460727997
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      4817
           1       0.39      0.44      0.41       609

    accuracy                           0.86      5426
   macro avg       0.66      0.68      0.67      5426
weighted avg       0.87      0.86      0.86      5426



In [46]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 25.3 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [47]:
from lightgbm import LGBMClassifier

In [48]:
lgb_model = LGBMClassifier(random_state = 42)
lgb_model.fit(X_train_resampling, y_train_resampling)

  File "c:\Users\ryosu\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\ryosu\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ryosu\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ryosu\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Number of positive: 19137, number of negative: 19137
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 729
[LightGBM] [Info] Number of data points in the train set: 38274, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [49]:
# 検証データに対する予測
y_pred_lgb = lgb_model.predict(X_valid)
y_proba_lgb = lgb_model.predict_proba(X_valid)[:, 1]

In [50]:
# 評価指標の表示
from sklearn.metrics import classification_report, roc_auc_score

print("ROC-AUC:", roc_auc_score(y_valid, y_proba_lgb))
print(classification_report(y_valid, y_pred_lgb))

ROC-AUC: 0.7745774492569251
              precision    recall  f1-score   support

           0       0.93      0.90      0.92      4817
           1       0.38      0.50      0.43       609

    accuracy                           0.85      5426
   macro avg       0.66      0.70      0.67      5426
weighted avg       0.87      0.85      0.86      5426



In [52]:
test_df = pd.read_csv("C:/SIGNATE/banking_tageting/test.csv")

In [53]:
# test.csv にも train.csv と同じ処理を適用
test_data = test_df.drop(columns=["id", "duration"])  # duration は除外

# ラベルエンコーディング（trainと同じ手法）
test_encoded = test_data.copy()
for col in test_encoded.select_dtypes(include="object").columns:
    le = LabelEncoder()
    test_encoded[col] = le.fit_transform(test_encoded[col])


In [54]:
# 予測確率を出力（確率がそのままスコアとして提出可能）
y_submit_proba = lgb_model.predict_proba(test_encoded)[:, 1]

In [55]:
# idと確率を組み合わせて DataFrame にする
submit = pd.DataFrame({
    "id": test_df["id"],
    "y": y_submit_proba
})

# CSVファイルとして出力（インデックスなし）
submit.to_csv("submit_lightgbm.csv", index=False)


In [56]:
submit.head()

Unnamed: 0,id,y
0,1,0.289188
1,2,0.086754
2,3,0.014733
3,4,0.018529
4,5,0.255706


In [57]:
import os
print(os.getcwd())

c:\Users\ryosu\OneDrive\Desktop\講義資料\機械学習入門


In [58]:
# 1. 行数一致しているか（18083件のはず）
print("Submit rows:", submit.shape[0])
print("Test rows:  ", test_df.shape[0])

Submit rows: 18083
Test rows:   18083


In [59]:
# 2. 欠損がないか
print(submit.isnull().sum())

id    0
y     0
dtype: int64


In [60]:
# 3. idの順番は test_df と一致しているか
print("test_df ids:", test_df["id"].head(5).tolist())
print("submit ids:", submit["id"].head(5).tolist())

test_df ids: [1, 2, 3, 4, 5]
submit ids: [1, 2, 3, 4, 5]


In [61]:

# 4. カラム名とデータ型
print(submit.columns)
print(submit.dtypes)

Index(['id', 'y'], dtype='object')
id      int64
y     float64
dtype: object


In [63]:
sample_submission = pd.read_csv("C:/SIGNATE/banking_tageting/submit_sample.csv")
print(sample_submission.head())

   1  0.236
0  2  0.128
1  3  0.903
2  4  0.782
3  5  0.597
4  6  0.555


In [64]:
print(sample_submission.dtypes)

1          int64
0.236    float64
dtype: object


In [65]:
# 正しく明示的にカラムを指定して作り直す
submit = pd.DataFrame({
    "id": test_df["id"].values,
    "y": y_submit_proba
})


In [66]:
print(submit.head())
print(submit.columns)
print(submit.dtypes)
print(submit.shape)

   id         y
0   1  0.289188
1   2  0.086754
2   3  0.014733
3   4  0.018529
4   5  0.255706
Index(['id', 'y'], dtype='object')
id      int64
y     float64
dtype: object
(18083, 2)


In [67]:
submit.to_csv("submit_lightgbm_fixed.csv", index=False)

In [70]:
# id順に並び替え（ここが最重要）
test_df_sorted = test_df.sort_values("id").reset_index(drop=True)

# duration除外
test_data = test_df_sorted.drop(columns=["id", "duration"])

# ラベルエンコーディング（再度）
from sklearn.preprocessing import LabelEncoder

test_encoded = test_data.copy()
for col in test_encoded.select_dtypes(include="object").columns:
    le = LabelEncoder()
    test_encoded[col] = le.fit_transform(test_encoded[col])

# 再予測
y_submit_proba = lgb_model.predict_proba(test_encoded)[:, 1]

# 正しくidと紐づけて提出用ファイル作成
submit = pd.DataFrame({
    "id": test_df_sorted["id"].values,
    "y": y_submit_proba
})

# 書き出し
submit.to_csv("submit_lightgbm_fixed(2).csv", index=False)


In [71]:
print(submit.shape)  # → (18083, 2)
print(submit.isnull().sum())  # → 0
print(submit.head())


(18083, 2)
id    0
y     0
dtype: int64
   id         y
0   1  0.289188
1   2  0.086754
2   3  0.014733
3   4  0.018529
4   5  0.255706


In [74]:
# SIGNATEが配布したsubmit_sample.csvを読み込む
submit_sample = pd.read_csv("C:/SIGNATE/banking_tageting/submit_sample.csv")  # こちらは既にアップ済みのはず

# 念のため確認（行数とid）
assert submit_sample.shape[0] == len(y_submit_proba), "予測数とsubmit_sampleの行数が一致していません"
assert (submit_sample["id"] == test_df["id"]).all(), "submit_sampleとtest_dfのID順が一致していません"

# y列に予測確率を上書き
submit_sample["y"] = y_submit_proba

# 再出力（今度こそSIGNATE公式形式）
safe_path = "/mnt/data/submit_lightgbm_safemode.csv"
submit_sample.to_csv(safe_path, index=False, encoding="utf-8")
safe_path


AssertionError: 予測数とsubmit_sampleの行数が一致していません

In [75]:
# 1. 各データの行数を表示
print("予測件数 y_submit_proba:", len(y_submit_proba))
print("submit_sample.csv の行数:", submit_sample.shape[0])

# 2. id の先頭5つを比較
print("submit_sample id head:", submit_sample["id"].head().tolist())
print("test_df id head:", test_df["id"].head().tolist())


予測件数 y_submit_proba: 18083
submit_sample.csv の行数: 18082


KeyError: 'id'

In [76]:
print(submit_sample.columns)


Index(['1', '0.236'], dtype='object')


In [77]:
submit_sample = pd.read_csv("C:/SIGNATE/banking_tageting/submit_sample.csv", header=None)
submit_sample.columns = ["id", "y"]


In [78]:
# 念のため整列
submit_sample["id"] = test_df["id"].values  # 正しい順で代入
submit_sample["y"] = y_submit_proba         # 確率を上書き

# 再出力
submit_sample.to_csv("C:/SIGNATE/banking_tageting/submit_lightgbm_final.csv", index=False, encoding="utf-8")


In [79]:
import pandas as pd

# 再読込（万全を期す）
test = pd.read_csv("C:/SIGNATE/banking_tageting/test.csv")
submit = pd.read_csv("C:/SIGNATE/banking_tageting/submit_lightgbm_final.csv")

# 差分チェック（これで"ID 3がない"本当の理由が出る）
print("test にあるが submit にないID：", set(test["id"]) - set(submit["id"]))
print("submit にあるが test にないID：", set(submit["id"]) - set(test["id"]))
print("submit の重複ID：", submit["id"][submit["id"].duplicated()].tolist())


test にあるが submit にないID： set()
submit にあるが test にないID： set()
submit の重複ID： []
