#### Problem Statement : Create an ML algorithm to classify the planets as Candidate/False positive/Confirmed etc
#### based on the column “koi_disposition”.

In [1]:
# importing the required lib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the data 
file_path = r"C:\Users\HP\Downloads\keplers data.csv"  
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81,4.47,0.06,-0.1,0.93,0.11,-0.06,291.93,48.14,15.35
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.97,0,0,0,0,...,-81,4.47,0.06,-0.1,0.93,0.11,-0.06,291.93,48.14,15.35
2,10811496,K00753.01,0,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176,4.54,0.04,-0.18,0.87,0.23,-0.08,297.0,48.13,15.44
3,10848459,K00754.01,0,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174,4.56,0.05,-0.17,0.79,0.2,-0.07,285.53,48.29,15.6
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211,4.44,0.07,-0.21,1.05,0.33,-0.13,288.75,48.23,15.51


In [3]:
# basic information about the dataset
print("Initial Data Information:")
print(data.info())

Initial Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 49 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              9564 non-null   int64  
 1   kepoi_name         9564 non-null   object 
 2   kepler_name        9564 non-null   object 
 3   koi_disposition    9564 non-null   object 
 4   koi_pdisposition   9564 non-null   object 
 5   koi_score          9564 non-null   float64
 6   koi_fpflag_nt      9564 non-null   int64  
 7   koi_fpflag_ss      9564 non-null   int64  
 8   koi_fpflag_co      9564 non-null   int64  
 9   koi_fpflag_ec      9564 non-null   int64  
 10  koi_period         9564 non-null   float64
 11  koi_period_err1    9564 non-null   float64
 12  koi_period_err2    9564 non-null   float64
 13  koi_time0bk        9564 non-null   float64
 14  koi_time0bk_err1   9564 non-null   float64
 15  koi_time0bk_err2   9564 non-null   float64
 16

In [4]:
# locating the missing values if any
print("\nMissing Values Before Filling:")
print(data.isnull().sum())


Missing Values Before Filling:
kepid                0
kepoi_name           0
kepler_name          0
koi_disposition      0
koi_pdisposition     0
koi_score            0
koi_fpflag_nt        0
koi_fpflag_ss        0
koi_fpflag_co        0
koi_fpflag_ec        0
koi_period           0
koi_period_err1      0
koi_period_err2      0
koi_time0bk          0
koi_time0bk_err1     0
koi_time0bk_err2     0
koi_impact           0
koi_impact_err1      0
koi_impact_err2      0
koi_duration         0
koi_duration_err1    0
koi_duration_err2    0
koi_depth            0
koi_depth_err1       0
koi_depth_err2       0
koi_prad             0
koi_prad_err1        0
koi_prad_err2        0
koi_teq              0
koi_teq_err1         0
koi_teq_err2         0
koi_insol            0
koi_insol_err1       0
koi_insol_err2       0
koi_model_snr        0
koi_tce_plnt_num     0
koi_tce_delivname    0
koi_steff            0
koi_steff_err1       0
koi_steff_err2       0
koi_slogg            0
koi_slogg_err1       0
ko

In [5]:
# Fill missing values with mean for numeric and 'missing' for categorical columns
numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns


In [6]:
# Fill numeric columns with their mean
for column in numeric_cols:
    data[column].fillna(data[column].mean(), inplace=True)

In [7]:
# Fill categorical columns with a placeholder (e.g., 'missing')
for column in categorical_cols:
    data[column].fillna('missing', inplace=True)

In [8]:
# Convert all columns to numeric, coercing errors
for column in data.columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

In [9]:
# Check for any remaining missing values after conversion
print("\nMissing Values After Conversion to Numeric:")
print(data.isnull().sum())


Missing Values After Conversion to Numeric:
kepid                   0
kepoi_name           9564
kepler_name          2360
koi_disposition      9564
koi_pdisposition     9564
koi_score               0
koi_fpflag_nt           0
koi_fpflag_ss           0
koi_fpflag_co           0
koi_fpflag_ec           0
koi_period              0
koi_period_err1         0
koi_period_err2         0
koi_time0bk             0
koi_time0bk_err1        0
koi_time0bk_err2        0
koi_impact              0
koi_impact_err1         0
koi_impact_err2         0
koi_duration            0
koi_duration_err1       0
koi_duration_err2       0
koi_depth               0
koi_depth_err1          0
koi_depth_err2          0
koi_prad                0
koi_prad_err1           0
koi_prad_err2           0
koi_teq                 0
koi_teq_err1            0
koi_teq_err2            0
koi_insol               0
koi_insol_err1          0
koi_insol_err2          0
koi_model_snr           0
koi_tce_plnt_num        0
koi_tce_delivname  

In [10]:
# Fill any remaining NaNs that were caused by conversion
data.fillna(0, inplace=True)  # Or any other value you prefer

In [11]:
# Check the shape of the DataFrame after cleaning
print(f"\nShape of the DataFrame after cleaning: {data.shape}")




Shape of the DataFrame after cleaning: (9564, 49)


In [12]:
# Encode the target variable 'koi_disposition' into numeric values
if 'koi_disposition' in data.columns:
    data['koi_disposition'] = data['koi_disposition'].astype('category').cat.codes
else:
    print("Column 'koi_disposition' not found in the DataFrame.")
    exit()



In [13]:
# Define features (X) and target variable (y)
X = data.drop('koi_disposition', axis=1)  # Features: all columns except 'koi_disposition'
y = data['koi_disposition']  # Target variable: 'koi_disposition'



In [14]:
# Check sizes of X and y before splitting
print(f"Features shape: {X.shape}, Target shape: {y.shape}")



Features shape: (9564, 48), Target shape: (9564,)


In [15]:
# Ensure that X and y are not empty
if X.shape[0] == 0 or y.shape[0] == 0:
    print("Features or target variable is empty. Cannot proceed with model training.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=20)

In [16]:
# Initialize the Random Forest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training data
model.fit(X_train, y_train)

    

In [17]:
# Make predictions on the test data
y_pred = model.predict(X_test) 

In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2f}")   


Model Accuracy: 1.00


In [19]:
# Print classification report
report = classification_report(y_test, y_pred, output_dict=True)
# Convert classification report to a DataFrame for better visualization
report_df = pd.DataFrame(report).transpose()
# Display the classification report in a tabular format
print("\nClassification Report:")
print(report_df)


Classification Report:
              precision  recall  f1-score  support
0                   1.0     1.0       1.0   1913.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0   1913.0
weighted avg        1.0     1.0       1.0   1913.0
