In [14]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from lightgbm.callback import early_stopping, log_evaluation

import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ktestdata/k2_clean.csv
/kaggle/input/model/other/default/1/model.pkl
/kaggle/input/keplercompleteddata/clean_kepler.csv
/kaggle/input/ramidata/df_merged.csv
/kaggle/input/kepler/cumulative_2025.10.03_23.57.10.csv


In [15]:
df = pd.read_csv("/kaggle/input/keplercompleteddata/clean_kepler.csv")
df.head()

Unnamed: 0,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,label
0,1.0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
1,0.969,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
2,0.0,0,0,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,0
3,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0
4,1.0,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1


In [16]:
df = df.drop(["koi_disposition", "koi_pdisposition", "koi_score", "kepid", "kepoi_name", "kepler_name"], axis=1, errors="ignore")

X = df.drop("label", axis=1)
y = df["label"]



# stratify ‚Äî —á—Ç–æ–±—ã —Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –±–∞–ª–∞–Ω—Å –∫–ª–∞—Å—Å–æ–≤
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [18]:
params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "verbose": -1
}

In [19]:
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, test_data],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ],
    num_boost_round=1000
)

Training until validation scores don't improve for 50 rounds
[100]	training's binary_logloss: 0.0857364	valid_1's binary_logloss: 0.137198
[200]	training's binary_logloss: 0.0402262	valid_1's binary_logloss: 0.13177
Early stopping, best iteration is:
[196]	training's binary_logloss: 0.0415259	valid_1's binary_logloss: 0.131542


In [20]:
from sklearn.metrics import accuracy_score, f1_score

y_pred = (model.predict(X_test) > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.9597
F1 Score: 0.9509


In [21]:
"koi_score" in X_train.columns

False

In [22]:
sample = X_test.iloc[4]  
prediction = model.predict([sample])
prediction

array([0.98626281])

In [23]:
import joblib

# –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å
joblib.dump(model, 'model.pkl')

['model.pkl']

In [24]:
import numpy as np

# üîπ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–µ–π
y_pred_proba = model.predict(X_test)

# üîπ –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º –≤ DataFrame
results = pd.DataFrame({
    'id': np.arange(len(y_pred_proba)),        # —Å–æ–∑–¥–∞—ë–º id –¥–ª—è –∫–∞–∂–¥–æ–π —Å—Ç—Ä–æ–∫–∏
    'procent': y_pred_proba,                   # –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å "Planet"
    'xai': (y_pred_proba > 0.5).astype(int)    # 1 –µ—Å–ª–∏ > 0.5, –∏–Ω–∞—á–µ 0
})

# üîπ –§–∏–ª—å—Ç—Ä—É–µ–º –ø–æ –ø–æ—Ä–æ–≥—É
results_filtered = results[results['procent'] > 0.5]

# üîπ –í—ã–≤–æ–¥–∏–º –ø–µ—Ä–≤—ã–µ 50 —Å—Ç—Ä–æ–∫
print(results_filtered.head(50))

# üîπ –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ñ–∞–π–ª –¥–ª—è –æ—Ç–ø—Ä–∞–≤–∫–∏
results_filtered.to_csv('submission.csv', index=False)

      id   procent  xai
1      1  0.987664    1
2      2  0.986936    1
4      4  0.986263    1
5      5  0.989571    1
6      6  0.985875    1
9      9  0.979474    1
10    10  0.640618    1
14    14  0.957411    1
15    15  0.985559    1
19    19  0.989492    1
20    20  0.964803    1
23    23  0.915622    1
25    25  0.968456    1
27    27  0.958198    1
28    28  0.986454    1
29    29  0.979456    1
30    30  0.986174    1
32    32  0.978827    1
34    34  0.993370    1
36    36  0.983916    1
38    38  0.921675    1
39    39  0.907808    1
43    43  0.983213    1
44    44  0.903161    1
46    46  0.988622    1
48    48  0.988102    1
50    50  0.972643    1
51    51  0.941529    1
54    54  0.708072    1
59    59  0.972598    1
60    60  0.966830    1
64    64  0.983628    1
67    67  0.505578    1
69    69  0.980925    1
70    70  0.993436    1
75    75  0.982418    1
76    76  0.907940    1
77    77  0.959993    1
85    85  0.989718    1
86    86  0.974836    1
92    92  0.8930