In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("C:\\Users\\Sarrang\\kepler\\df.csv")

In [4]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

## Splitting the dataset into the Training set and Test set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Feature Scaling

In [6]:
"""feature scaling all except'kepoi_name' and 'koi_tce_delivname'"""

"feature scaling all except'kepoi_name' and 'koi_tce_delivname'"

In [7]:
from scipy.stats import boxcox

# # List of columns to transform using Box-Cox
columns_to_transform = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']

# Apply Box-Cox transformation to specified columns in X_train
for column in columns_to_transform:
    # Add a small constant value to handle non-positive values
    X_train[column] = X_train[column] - X_train[column].min() + 1
    X_train[column], _ = boxcox(X_train[column])


### standardizing to bring into same range

In [8]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler on X_train and transform X_train
X_train = scaler.fit_transform(X_train)

# Transform X_test using the same scaler
X_test= scaler.transform(X_test)

In [9]:
from joblib import dump

In [10]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'n_estimators': [50, 100, 150, 200],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

# Create a LightGBM classifier
lgb_classifier = lgb.LGBMClassifier(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=lgb_classifier, param_grid=param_grid, 
                           cv=5, scoring='precision', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Precision:", best_score)

# Get the best model
best_lgb_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_lgb_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

dump(best_lgb_model, 'lightgbm_kepler.joblib')

[LightGBM] [Info] Number of positive: 3877, number of negative: 3774
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9269
[LightGBM] [Info] Number of data points in the train set: 7651, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506731 -> initscore=0.026926
[LightGBM] [Info] Start training from score 0.026926
Best Hyperparameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.6}
Best Precision: 0.9971152209314763
Test Accuracy: 0.9869315211709357


['lightgbm_kepler.joblib']

In [11]:
from joblib import load
light=load('lightgbm_kepler.joblib')
#retraining
light.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 3877, number of negative: 3774
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9269
[LightGBM] [Info] Number of data points in the train set: 7651, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506731 -> initscore=0.026926
[LightGBM] [Info] Start training from score 0.026926


In [12]:
y_pred=light.predict(X_test)

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       964
           1       0.98      1.00      0.99       949

    accuracy                           0.99      1913
   macro avg       0.99      0.99      0.99      1913
weighted avg       0.99      0.99      0.99      1913



In [None]:
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# df.to_csv('df.csv', index=False)
df = pd.read_csv("C:\\Users\\Sarrang\\kepler\\df.csv")

X=df.iloc[:,:-1]
y=df.iloc[:,-1]

## Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Feature Scaling

"""feature scaling all except'kepoi_name' and 'koi_tce_delivname'"""

from scipy.stats import boxcox

# # List of columns to transform using Box-Cox
columns_to_transform = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']

# Apply Box-Cox transformation to specified columns in X_train
for column in columns_to_transform:
    # Add a small constant value to handle non-positive values
    X_train[column] = X_train[column] - X_train[column].min() + 1
    X_train[column], _ = boxcox(X_train[column])


### standardizing to bring into same range

from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler on X_train and transform X_train
X_train = scaler.fit_transform(X_train)

# Transform X_test using the same scaler
X_test= scaler.transform(X_test)

from joblib import dump

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5]
}

# Create an XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_score)

# Get the best model
best_xgb_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_xgb_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

# Save the best model
dump(best_xgb_model, 'xgb_kepler.joblib')


from joblib import load
xgb=load('xgb_kepler.joblib')
#retraining
xgb.fit(X_train,y_train)

y_pred=xgb.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))



