In [591]:
import warnings

warnings.filterwarnings("ignore")

In [592]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [593]:
# df.to_csv('df.csv', index=False)
df = pd.read_csv("C:\\Users\\Sarrang\\thomas_ai\\df.csv")

In [594]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

## Splitting the dataset into the Training set and Test set

In [595]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [596]:
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier
rf_classifier = RandomForestClassifier()

# Fit the classifier on your data
rf_classifier.fit(X_train, y_train)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame to hold feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Set a threshold for feature importance (you can choose based on your preference)
threshold = 0.01

# Filter features with importance above the threshold
relevant_features = feature_importance_df[feature_importance_df['Importance'] > threshold]['Feature'].tolist()

# Drop irrelevant columns from X_train and X_test
X_train_relevant = X_train[relevant_features]
X_test_relevant = X_test[relevant_features]


## Feature Scaling

In [597]:
"""feature scaling all except'kepoi_name' and 'koi_tce_delivname'"""

"feature scaling all except'kepoi_name' and 'koi_tce_delivname'"

### Applying Box-cox transformation
The Box-Cox transformation is a statistical technique used to stabilize variance and make data more normally distributed. It is particularly useful when dealing with data that violates the assumptions of normality required by many statistical models.

In essence, the Box-Cox transformation applies a power transformation to the data, defined by the formula:


![Screenshot%202024-02-06%20202654.png](attachment:Screenshot%202024-02-06%20202654.png)
where 
y is the original data and 

λ is a parameter that determines the type of transformation applied. The optimal value of 

λ is determined through maximum likelihood estimation or other optimization techniques.

The Box-Cox transformation works well for data that follows a wide range of distributions, including skewed distributions. It is commonly used in regression analysis, time series analysis, and other statistical modeling tasks to improve the validity of assumptions and enhance the performance of models.

In [598]:
# X_train['loan_percent_income'] += 0.001  # Adding a small constant value
X_train_relevant.columns

Index(['koi_score', 'koi_fpflag_nt', 'koi_fpflag_co', 'koi_fpflag_ss',
       'koi_prad', 'koi_fpflag_ec', 'koi_steff_err1', 'koi_prad_err2',
       'koi_period', 'koi_prad_err1', 'koi_steff_err2', 'koi_depth',
       'koi_insol', 'koi_insol_err1', 'koi_model_snr',
       'koi_tce_delivname_freq', 'koi_insol_err2', 'koi_period_err1',
       'koi_period_err2'],
      dtype='object')

In [599]:
from scipy.stats import boxcox

# # List of columns to transform using Box-Cox
columns_to_transform = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']

# Apply Box-Cox transformation to specified columns in X_train
for column in columns_to_transform:
    # Add a small constant value to handle non-positive values
    X_train[column] = X_train[column] - X_train[column].min() + 1
    X_train[column], _ = boxcox(X_train[column])


### standardizing to bring into same range

In [600]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler on X_train and transform X_train
X_train = scaler.fit_transform(X_train)

# Transform X_test using the same scaler
X_test= scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_score)

# Get the best model
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_rf_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)
dump(best_rf_model, 'rf_kepler.joblib')

In [601]:
#  RandomForestClassifier(max_depth=None, min_samples_split=2, n_estimators=300))
rf=RandomForestClassifier(max_depth=None, min_samples_split=2, n_estimators=300)
rf.fit(X_train,y_train)

In [602]:
y_pred=rf.predict(X_test)

In [603]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       826
           1       0.99      0.88      0.93      1087

    accuracy                           0.93      1913
   macro avg       0.93      0.93      0.93      1913
weighted avg       0.93      0.93      0.93      1913

