In [1]:
import warnings

warnings.filterwarnings("ignore")

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [64]:
df=pd.read_excel("C:\\Users\\Sarrang\\Desktop\\kepler_data.xlsx")

In [65]:
""" dropping the columns koi_teq_err2, koi_teq_err1 because 100% of values are missing"""
df.drop(columns=['koi_teq_err2', 'koi_teq_err1'], inplace=True)

In [66]:
""" the column kepler name contains 75% missing values, dropping this column could result in lots of bias. However 
It a categorical column- meaning we'd have to impute with the mode, the mode doesnt make sense in this case because the 
kepler name is never repeated twice. therefore im making a decision to drop the column"""
df['kepler_name'].value_counts(ascending=False)
df.drop(columns=['kepler_name'], inplace=True)

In [67]:
df['koi_disposition'].value_counts()

koi_disposition
FALSE POSITIVE    4840
CANDIDATE         2366
CONFIRMED         2358
Name: count, dtype: int64

In [68]:
df['koi_disposition'] = df['koi_disposition'].replace({'CANDIDATE': 'CANDIDATE_OR_CONFIRMED', 'CONFIRMED': 'CANDIDATE_OR_CONFIRMED'})

In [69]:
df['koi_disposition'].value_counts()

koi_disposition
FALSE POSITIVE            4840
CANDIDATE_OR_CONFIRMED    4724
Name: count, dtype: int64

### imputation process

In [70]:
missing_values = df.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0].index
numerical_columns_with_missing = []
categorical_columns_with_missing = []
for column in columns_with_missing_values:
    if df[column].dtype == 'object':  # Categorical columns have dtype 'object'
        categorical_columns_with_missing.append(column)
    else:
        numerical_columns_with_missing.append(column)
print("Numerical columns with missing data:", numerical_columns_with_missing)
print("Categorical columns with missing data:", categorical_columns_with_missing)


Numerical columns with missing data: ['koi_score', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'koi_kepmag']
Categorical columns with missing data: ['koi_tce_delivname']


In [71]:
"""
OUTLIERS ARE PREVALENT IN THE NUMERICAL COLUMNS WITH MISSING VALUES, SO BETTER STRATEGY IS TO REPLACE WITH MEDIAN
"""

'\nOUTLIERS ARE PREVALENT IN THE NUMERICAL COLUMNS WITH MISSING VALUES, SO BETTER STRATEGY IS TO REPLACE WITH MEDIAN\n'

In [72]:
df.shape

(9564, 46)

In [73]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputer.fit(df[numerical_columns_with_missing])
df[numerical_columns_with_missing] = imputer.transform(df[numerical_columns_with_missing])

In [74]:
df.shape

(9564, 46)

In [75]:
# df.isnull().sum()
# df['koi_tce_delivname'].value_counts()
"""
koi_tce_delivname is categorical column with close to 3.6% values missing therefore using mode to replace 
"""
mode_value = df['koi_tce_delivname'].mode()[0]

# Fill missing values with the mode
df['koi_tce_delivname'].fillna(mode_value, inplace=True)

In [76]:
"""missing values have been addressed"""

'missing values have been addressed'

In [77]:
numeric_columns = df.select_dtypes(include=['number']).columns
purely_numeric_columns = [col for col in numeric_columns if df[col].nunique() > 10]  # Adjust the threshold as needed

### encoding

In [78]:
li_objs=[]
for i in df.columns:
    if i not in numeric_columns:
        li_objs.append(i)

In [79]:
df[li_objs]

Unnamed: 0,kepoi_name,koi_disposition,koi_pdisposition,koi_tce_delivname
0,K00752.01,CANDIDATE_OR_CONFIRMED,CANDIDATE,q1_q17_dr25_tce
1,K00752.02,CANDIDATE_OR_CONFIRMED,CANDIDATE,q1_q17_dr25_tce
2,K00753.01,CANDIDATE_OR_CONFIRMED,CANDIDATE,q1_q17_dr25_tce
3,K00754.01,FALSE POSITIVE,FALSE POSITIVE,q1_q17_dr25_tce
4,K00755.01,CANDIDATE_OR_CONFIRMED,CANDIDATE,q1_q17_dr25_tce
...,...,...,...,...
9559,K07985.01,FALSE POSITIVE,FALSE POSITIVE,q1_q17_dr25_tce
9560,K07986.01,CANDIDATE_OR_CONFIRMED,CANDIDATE,q1_q17_dr25_tce
9561,K07987.01,FALSE POSITIVE,FALSE POSITIVE,q1_q17_dr25_tce
9562,K07988.01,CANDIDATE_OR_CONFIRMED,CANDIDATE,q1_q17_dr25_tce


In [81]:
""" kepoi name and koi_tce_delivname are nominal categorical variables, where are koi disposition is a ordinal categorical
variable, however using onehotencoding nominals will result in far too many columns making model overly complex, therefore
will be using frequency encoding for"""
# Calculate the frequency of each category in the 'kepoi_name' column
kepoi_name_frequency = df['kepoi_name'].value_counts(normalize=True)

# Map the frequencies to the original 'kepoi_name' column and convert to float
df['kepoi_name_freq'] = df['kepoi_name'].map(kepoi_name_frequency)

# Convert the column to float, handling non-numeric values
df['kepoi_name_freq'] = pd.to_numeric(df['kepoi_name_freq'], errors='coerce')

# Calculate the frequency of each category in the 'koi_tce_delivname' column
koi_tce_delivname_frequency = df['koi_tce_delivname'].value_counts(normalize=True)

# Map the frequencies to the original 'koi_tce_delivname' column and convert to float
df['koi_tce_delivname_freq'] = df['koi_tce_delivname'].map(koi_tce_delivname_frequency)

# Convert the column to float, handling non-numeric values
df['koi_tce_delivname_freq'] = pd.to_numeric(df['koi_tce_delivname_freq'], errors='coerce')

# Drop the original columns
df = df.drop(columns=['kepoi_name', 'koi_tce_delivname'])

# Print the data types of the new columns
print(df[['kepoi_name_freq', 'koi_tce_delivname_freq']].dtypes)



kepoi_name_freq           float64
koi_tce_delivname_freq    float64
dtype: object


In [82]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and transform the 'koi_disposition' column
df['koi_disposition_encoded'] = label_encoder.fit_transform(df['koi_disposition'])
df['koi_disposition_encoded'] = label_encoder.fit_transform(df['koi_pdisposition'])

# Print the mapping of original classes to encoded integers
print("Mapping of original classes to encoded integers:")
for label, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)):
    print(f"{label}: {code}")

# Drop the original 'koi_disposition' column
df.drop(columns=['koi_disposition'], inplace=True)
df.drop(columns=['koi_pdisposition'], inplace=True)
# Print the first few rows of the DataFrame to see the encoded column and ensure the original column is dropped
print("\nFirst few rows of the DataFrame with encoded column:")
print(df.head())


Mapping of original classes to encoded integers:
CANDIDATE: 0
FALSE POSITIVE: 1

First few rows of the DataFrame with encoded column:
      kepid  koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  \
0  10797460      1.000              0              0              0   
1  10797460      0.969              0              0              0   
2  10811496      0.000              0              0              0   
3  10848459      0.000              0              1              0   
4  10854555      1.000              0              0              0   

   koi_fpflag_ec  koi_period  koi_period_err1  koi_period_err2  koi_time0bk  \
0              0    9.488036     2.775000e-05    -2.775000e-05   170.538750   
1              0   54.418383     2.479000e-04    -2.479000e-04   162.513840   
2              0   19.899140     1.494000e-05    -1.494000e-05   175.850252   
3              0    1.736952     2.630000e-07    -2.630000e-07   170.307565   
4              0    2.525592     3.761000e-0

In [83]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

## Splitting the dataset into the Training set and Test set

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Feature Scaling

In [85]:
"""feature scaling all except'kepoi_name' and 'koi_tce_delivname'"""

"feature scaling all except'kepoi_name' and 'koi_tce_delivname'"

### Applying Box-cox transformation
The Box-Cox transformation is a statistical technique used to stabilize variance and make data more normally distributed. It is particularly useful when dealing with data that violates the assumptions of normality required by many statistical models.

In essence, the Box-Cox transformation applies a power transformation to the data, defined by the formula:


![Screenshot%202024-02-06%20202654.png](attachment:Screenshot%202024-02-06%20202654.png)
where 
y is the original data and 

λ is a parameter that determines the type of transformation applied. The optimal value of 

λ is determined through maximum likelihood estimation or other optimization techniques.

The Box-Cox transformation works well for data that follows a wide range of distributions, including skewed distributions. It is commonly used in regression analysis, time series analysis, and other statistical modeling tasks to improve the validity of assumptions and enhance the performance of models.

In [86]:
from scipy.stats import boxcox

# # List of columns to transform using Box-Cox
columns_to_transform = ['koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag']


# Apply Box-Cox transformation to specified columns in X_train
for column in columns_to_transform:
    # Add a small constant value to handle non-positive values
    X_train[column] = X_train[column] - X_train[column].min() + 1
    X_train[column], _ = boxcox(X_train[column])


### standardizing to bring into same range

In [87]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit scaler on X_train and transform X_train
X_train = scaler.fit_transform(X_train)

# Transform X_test using the same scaler
X_test= scaler.transform(X_test)