In [1]:
import pickle as pkl

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow import keras
from xgboost import XGBClassifier

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

## Load data and engineer our features

In [3]:
# Read the NASA Exoplanet Archive export into a DataFrame; the first 147 lines
# of the file are skipped before parsing begins.
df = pd.read_csv(
    filepath_or_buffer="planets_2020.06.23_15.26.09.csv",
    skiprows=147,
)

<IPython.core.display.Javascript object>

In [4]:
# Turn the categorical discovery method column into a numerical variable: each
# category is replaced by the fraction of all rows in which it appears.
# value_counts(normalize=True) computes every fraction in a single pass instead
# of re-filtering the whole frame once per category; dropna=False keeps the
# denominator equal to len(df) and treats missing values (if any) as their own
# category.
representation_map = (
    df["pl_discmethod"].value_counts(normalize=True, dropna=False).to_dict()
)
df["pct_discmethod"] = df["pl_discmethod"].map(representation_map)

# Binary encoding of the planet-count column: 0 for one planet, 1 for more
def multiple_planet_check(row):
    """Return 1 if ``row["pl_pnum"]`` is greater than 1, otherwise 0.

    Args:
        row: Any object indexable by the key ``"pl_pnum"`` (for example a
            DataFrame row or a dict).

    Returns:
        int: 1 when the value exceeds 1, else 0.
    """
    has_multiple = row["pl_pnum"] > 1
    return int(has_multiple)


# Feature matrix: stellar/orbital columns plus the encoded discovery method.
x = df[["st_mass", "st_teff", "st_rad", "pl_orbsmax", "pct_discmethod"]]

# Binary target: 1 when the row's pl_pnum is greater than 1, else 0.
# This yields the same values as applying multiple_planet_check to every row,
# but as one vectorized comparison instead of a Python call per row.
y = df["pl_pnum"].gt(1).astype("int64")

<IPython.core.display.Javascript object>

## Split the data into a testing and training set

In [5]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# identical on every run, so results downstream of it can be reproduced after a
# kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

<IPython.core.display.Javascript object>

## Train

In [6]:
# Fit an XGBoost classifier (library-default hyperparameters) to the training data
model = XGBClassifier()
model.fit(X_train, y_train)

# Save the fitted model to file. The context manager closes the file handle
# (flushing the pickle to disk) even if serialization raises.
with open("num_planets_model.p", "wb") as model_file:
    pkl.dump(model, model_file)

<IPython.core.display.Javascript object>

## Test

In [7]:
# Run the fitted model on the held-out features
y_pred = model.predict(X_test)
# Round every predicted value and collect the results into a plain list
predictions = list(map(round, y_pred))

<IPython.core.display.Javascript object>

In [8]:
# Score the rounded predictions against the true test labels
accuracy = accuracy_score(y_test, predictions)
# Report the score as a percentage with two decimal places
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

Accuracy: 83.43%


<IPython.core.display.Javascript object>