In [1]:
!conda install scikit-learn
#!conda update scikit-learn

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Analyze and clean the dataset

In [3]:
# Read the raw exoplanet table from the local dataset folder.
raw_csv_path = "./dataset/exoplanet_data.csv"
df = pd.read_csv(raw_csv_path)

In [4]:
# Dimensions (rows, columns) of the freshly loaded frame.
df.shape

(6991, 41)

In [5]:
# Remove only the columns, and then only the rows, that are entirely null;
# anything holding at least one real value is kept.
df = (
    df
    .dropna(axis="columns", how="all")
    .dropna(axis="rows", how="all")
)

In [6]:
# Re-check the dimensions after the all-null rows/columns were dropped.
df.shape

(6991, 41)

In [7]:
# List each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

### Every column reports 6991 non-null entries for 6991 rows, so we can conclude that there are no NULL values present in the dataset

### We convert the categorical class labels in `koi_disposition` into numeric labels using label encoding

### Split the dataset into train and test sets (80% / 20%, stratified by class), and store them as .csv files in the folder 'dataset'

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [9]:
# Separate the target column from the feature columns.
y = df["koi_disposition"]
x = df.drop(columns=["koi_disposition"])

# Learn the class -> integer mapping, then apply it to the target column.
encoder = LabelEncoder()
encoder.fit(y)
y_encoded = pd.DataFrame(encoder.transform(y), columns=["koi_disposition"])

# 80/20 split, stratified on the encoded label, with a fixed seed so the
# partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_encoded,
    train_size=0.8,
    random_state=1,
    stratify=y_encoded,
)
print(x_train.shape, x_test.shape)
print(type(x_train), type(y_train))

(5592, 40) (1399, 40)
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


## Scaling the data using MinMaxScaler with feature range = [0,1]

In [10]:
# Remember the feature names: the scaler returns bare numpy arrays, and
# wrapping those in a DataFrame without `columns=` would replace every
# feature name with an integer position (0, 1, 2, ...).
feature_names = x_train.columns

# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
scaler = MinMaxScaler()
scaler.fit(x_train)

#
# Transform numpy ndarray to pandas dataframe, keeping the original
# feature column names. The rebuilt frames get a fresh 0..n-1 index.
#
x_train = pd.DataFrame(scaler.transform(x_train), columns=feature_names)
x_test = pd.DataFrame(scaler.transform(x_test), columns=feature_names)

#
# Resetting the index values of the labels: y_train / y_test keep the row
# labels they had before the split, so they are renumbered from 0 to line
# up with the rebuilt feature frames in a column-wise concat.
#
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [11]:
# Report the shape and container type of the scaled feature frames.
# Both lines compare the training examples with the test examples, so both
# take x_train / x_test as their arguments.
print("The shape of the training examples is - {0} and the shape of the test examples is - {1}".format(
    x_train.shape, x_test.shape))
print("The type of the training examples is - {0} and the type of the test examples is - {1}".format(
    type(x_train), type(x_test)))

The shape of the training examples is - (5592, 40) and the shape of the test examples is - (1399, 40)
The type of the training examples is - <class 'pandas.core.frame.DataFrame'> and the type of the test examples is - <class 'pandas.core.frame.DataFrame'>


### Concatenate the labels and the examples into a single dataframe

In [12]:
# Pair each feature frame with its label frame, then glue every pair
# together column-wise so the label becomes the last column.
train_frames = [x_train, y_train]
test_frames = [x_test, y_test]
train_data, test_data = (
    pd.concat(frames, axis=1) for frames in (train_frames, test_frames)
)

In [13]:
# Show the dimensions of the combined (features + label) frames.
shape_message = "The shape of the train data is - {0} and the shape of the test data is - {1}"
print(shape_message.format(train_data.shape, test_data.shape))

The shape of the train data is - (5592, 41) and the shape of the test data is - (1399, 41)


### Saving the training data and test data as .csv files in the folder 'dataset'

In [14]:
# Write each split to its own CSV; the row index is not written out.
for split_frame, destination in (
    (train_data, "./dataset/train_dataset.csv"),
    (test_data, "./dataset/test_dataset.csv"),
):
    split_frame.to_csv(destination, index=False)

# Exit