In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.impute import KNNImputer

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np
class multimodal():
    def __init__(self, n_datasets, datasets:list[any]):
        self.models = [[LogisticRegression(penalty='elasticnet')]*n_datasets]
        self.dataset = datasets
        self.col = n_datasets
    

    def fit(self):
        for i in range(0,self.col):
            X = self.dataset[i].drop(columns = 'type')
            Y = self.dataset[i]['type']
            self.models[i].fit(X,Y)
    

    def aggregate(self,prob : np.array):
        return prob.mean()
    

    def process_data(self,x: list[any]):
       data = np.array([x])
       data = self.imputer.transform(data)
       return data
    
    
    def predict(self,data):
       data = self.process_data(data)
       ans = []
       for i in range(0,self.col):
           p = self.models[i].predict_proba(data[i])[0]
           ans.append(p)
       p_final = self.aggregate(np.array(ans))

       return p_final
       



    

In [19]:
df = pd.read_csv('cumulative_2025.09.20_00.18.02.csv')

In [20]:

X = df.drop(columns=['kepler_name','koi_score','kepid','kepoi_name','koi_pdisposition','koi_disposition','koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_fpflag_co',
 'koi_fpflag_ec','koi_tce_delivname'])

Y = df['koi_pdisposition']
x_train,x_val,y_train,y_val = train_test_split(X,Y,test_size=0.3)


In [22]:
X.columns

Index(['koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
       'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

In [17]:
import pickle
imputer = KNNImputer(n_neighbors=10)
imputer.fit(X)
filename = 'knn_imput.sav'
pickle.dump(imputer, open(filename, 'wb'))

In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, f1_score


param_grid = {
    
    'learning_rate': [0.01, 0.1],
    'depth': [6,9,11],
    'l2_leaf_reg' : [0.4, 0.7]
}
model = CatBoostClassifier(iterations=300)
scorer = make_scorer(f1_score, average='weighted')
grid_search = GridSearchCV(model, param_grid, cv=5, scoring=scorer, n_jobs=-1)
grid_search.fit(x_train, y_train)





0:	learn: 0.6873581	total: 3.08ms	remaining: 922ms
1:	learn: 0.6821585	total: 6.38ms	remaining: 951ms
2:	learn: 0.6766796	total: 9.36ms	remaining: 926ms
3:	learn: 0.6715083	total: 12.4ms	remaining: 916ms
4:	learn: 0.6664876	total: 15.8ms	remaining: 930ms
5:	learn: 0.6611890	total: 18.6ms	remaining: 909ms
6:	learn: 0.6559142	total: 21.3ms	remaining: 890ms
7:	learn: 0.6507009	total: 24.2ms	remaining: 884ms
8:	learn: 0.6453811	total: 27.7ms	remaining: 896ms
9:	learn: 0.6405627	total: 33ms	remaining: 957ms
10:	learn: 0.6353849	total: 37ms	remaining: 971ms
11:	learn: 0.6312812	total: 40.1ms	remaining: 964ms
12:	learn: 0.6271507	total: 43.6ms	remaining: 962ms
13:	learn: 0.6227203	total: 46.9ms	remaining: 959ms
14:	learn: 0.6184939	total: 49.8ms	remaining: 946ms
15:	learn: 0.6145336	total: 52.9ms	remaining: 940ms
16:	learn: 0.6101251	total: 55.8ms	remaining: 929ms
17:	learn: 0.6065944	total: 58.5ms	remaining: 917ms
18:	learn: 0.6029139	total: 60.9ms	remaining: 901ms
19:	learn: 0.5989150	total

0,1,2
,estimator,<catboost.cor...001DA4A35A630>
,param_grid,"{'depth': [6, 9, ...], 'l2_leaf_reg': [0.4, 0.7], 'learning_rate': [0.01, 0.1]}"
,scoring,make_scorer(f...rage=weighted)
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False


In [18]:
model_best = grid_search.best_estimator_
y_pred_grid = model_best.predict(x_val)

# Print classification report for the Grid Search model
print("Grid Search - Classification Report:")
print(classification_report(y_val, y_pred_grid))

model_best.save_model("catboost_model.cbm")

Grid Search - Classification Report:
                precision    recall  f1-score   support

     CANDIDATE       0.81      0.86      0.83      1450
FALSE POSITIVE       0.85      0.80      0.82      1420

      accuracy                           0.83      2870
     macro avg       0.83      0.83      0.83      2870
  weighted avg       0.83      0.83      0.83      2870



In [14]:
from predict import model_nasa

In [13]:
X.shape

(9564, 16)

In [None]:
importance = pd.Series(model_best.feature_importances_,
                         model_best.feature_names_)

fig, ax = plt.subplots(figsize=(5,5))
importance.plot.bar(ax=ax)
ax.set_title("Важность признаков")
ax.set_ylabel('Важность')
fig.tight_layout()