In [53]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler as SDS
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit

In [54]:
pwd = os.getcwd()
filepath_1 = os.path.join(pwd, "Datasets\\cosmicclassifierTraining.csv")
filepath_2 = os.path.join(pwd, "Datasets\\cosmicclassifierTest.csv")
filepath_1

'D:\\ml_projects\\Cosmic_Classsifier\\Datasets\\cosmicclassifierTraining.csv'

In [55]:
cosmic_train_df = pd.read_csv(filepath_1)
cosmic_test_df = pd.read_csv(filepath_2)

In [56]:
cosmic_train_df.head()

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction
0,0.472806,,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,,Category_6,0.692907,5.0
1,4.180154,-1.157515,2.430956,-1.59585,-3.188678,-0.609434,-0.199828,Category_9,Category_9,,0.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,,Category_8,-0.677182,4.0
3,-3.122,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,,Category_11,0.109429,1.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,Category_6,Category_10,0.407941,9.0


In [57]:
cosmic_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Atmospheric Density            57016 non-null  float64
 1   Surface Temperature            56968 non-null  float64
 2   Gravity                        57016 non-null  float64
 3   Water Content                  56923 non-null  float64
 4   Mineral Abundance              57079 non-null  float64
 5   Orbital Period                 57003 non-null  float64
 6   Proximity to Star              57055 non-null  float64
 7   Magnetic Field Strength        56942 non-null  object 
 8   Radiation Levels               56979 non-null  object 
 9   Atmospheric Composition Index  57058 non-null  float64
 10  Prediction                     56961 non-null  float64
dtypes: float64(9), object(2)
memory usage: 5.0+ MB


In [58]:
print(cosmic_train_df["Prediction"].unique())

[ 5.  0.  4.  1.  9.  2. nan  3.  6.  7.  8.]


In [59]:
cosmic_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Atmospheric Density            10000 non-null  float64
 1   Surface Temperature            10000 non-null  float64
 2   Gravity                        10000 non-null  float64
 3   Water Content                  10000 non-null  float64
 4   Mineral Abundance              10000 non-null  float64
 5   Orbital Period                 10000 non-null  float64
 6   Proximity to Star              10000 non-null  float64
 7   Magnetic Field Strength        10000 non-null  object 
 8   Radiation Levels               10000 non-null  object 
 9   Atmospheric Composition Index  10000 non-null  float64
dtypes: float64(8), object(2)
memory usage: 781.4+ KB


In [60]:
simple_imputer = SimpleImputer(strategy="most_frequent")
cosmic_train_df[["Prediction", "Magnetic Field Strength", "Radiation Levels"]] = simple_imputer.fit_transform(cosmic_train_df[["Prediction", "Magnetic Field Strength", "Radiation Levels"]])

In [62]:

numeric_cols = cosmic_train_df.select_dtypes(include=['number']).columns
iter_imputer = IterativeImputer(random_state=42)
cosmic_train_df[numeric_cols] = iter_imputer.fit_transform(cosmic_train_df[numeric_cols])

In [64]:
cosmic_train_df.head(10)

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction
0,0.472806,-0.000758,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,Category_9,Category_6,0.692907,5.0
1,4.180154,-1.157515,2.430956,-1.59585,-3.188678,-0.609434,-0.199828,Category_9,Category_9,-0.000395,0.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,Category_9,Category_8,-0.677182,4.0
3,-3.122,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,Category_9,Category_11,0.109429,1.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,Category_6,Category_10,0.407941,9.0
5,0.749979,1.064353,2.510823,0.105772,1.203825,1.355501,0.000339,Category_10,Category_9,0.551453,2.0
6,-2.971646,-0.648251,-0.915859,0.255504,-0.537165,-2.072251,1.355523,Category_8,Category_7,1.876232,1.0
7,1.981474,-1.347445,-0.483752,-3.365811,2.690204,1.199946,-1.907013,Category_12,Category_5,0.443053,1.0
8,-3.306354,-0.316716,-0.431264,0.389815,-1.961216,-1.510182,0.538593,Category_8,Category_7,0.934055,1.0
9,-1.188687,1.775882,2.666702,0.434578,0.205058,-0.000999,-1.545261,Category_10,Category_9,-1.973059,4.0


In [65]:
cat_encoder = OHE(handle_unknown='ignore', sparse_output=False)
cosmic_cat = cosmic_train_df[["Magnetic Field Strength", "Radiation Levels"]]
encoded_array = cat_encoder.fit_transform(cosmic_cat)
encoded_df = pd.DataFrame(encoded_array, columns=cat_encoder.get_feature_names_out(["Magnetic Field Strength", "Radiation Levels"]), index=cosmic_train_df.index)
cosmic_train_df.drop(columns=["Magnetic Field Strength", "Radiation Levels"], inplace=True)
cosmic_train_df = pd.concat([cosmic_train_df, encoded_df], axis=1)

In [72]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
X = cosmic_train_df.drop(columns=["Prediction"])
y = cosmic_train_df["Prediction"]
for train_idx, test_idx in split.split(X, y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

In [73]:
std_scaler = SDS()
scaled_array = std_scaler.fit_transform(cosmic_train_df)
cosmic_train_df = pd.DataFrame(scaled_array, columns=cosmic_train_df.columns, index=cosmic_train_df.index)