In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# pickle is used to save the fitted model to disk
import pickle as pkl

## Load data and engineer our features

In [2]:
# Read the NASA Exoplanet Archive export. The first 147 lines of the file are
# skipped before parsing (presumably the archive's descriptive preamble — verify
# against the downloaded file if the export is refreshed).
planets_csv = "planets_2020.06.23_15.26.09.csv"
df = pd.read_csv(planets_csv, skiprows=147)

In [3]:
# Turn the categorical discovery-method column into a numerical variable: each
# row receives the fraction of all rows that share its discovery method.
# value_counts tallies every category in one pass instead of re-filtering the
# whole frame once per category. dropna=False keeps a missing method as its own
# category, so it is assigned its real share of rows rather than 0.
representation_map = (df["pl_discmethod"].value_counts(dropna=False) / len(df)).to_dict()
df["pct_discmethod"] = df["pl_discmethod"].map(representation_map)

# Binarise the planet count: 0 for exactly one planet (or fewer), 1 for more than one
def multiple_planet_check(row):
    """Return 1 when ``row["pl_pnum"]`` is greater than 1, otherwise 0.

    ``row`` is anything indexable by the key ``"pl_pnum"`` (for example a
    DataFrame row passed in by ``DataFrame.apply(..., axis=1)``).
    """
    if row["pl_pnum"] > 1:
        return 1
    return 0


# Feature matrix: the three st_* columns plus the encoded discovery-method fraction
x = df[["st_mass", "st_teff", "st_rad", "pct_discmethod"]]
# Binary target: 1 when pl_pnum > 1, else 0 (the same rule as multiple_planet_check).
# Computed as one vectorised comparison rather than a Python call per row; the
# result is always a Series, including when df has no rows.
y = (df["pl_pnum"] > 1).astype(int)

## Split the data into a testing and training set

In [5]:
# Fixed seed so the two splits below (and therefore the reported accuracy) are
# the same on every fresh run of the notebook
RANDOM_SEED = 42

# Split the dataset: half of the rows for training, half held out
X_train, X_test_eval, y_train, y_test_eval = train_test_split(
    x, y, test_size=0.5, random_state=RANDOM_SEED
)

# Further split the held-out half evenly into a validation set and a test set
X_eval, X_test, y_eval, y_test = train_test_split(
    X_test_eval, y_test_eval, test_size=0.5, random_state=RANDOM_SEED
)

# Create the model; the validation set is passed as eval_set so training can
# stop early once the "error" metric has not improved for 50 rounds
eval_set = [(X_eval, y_eval)]
model = XGBClassifier()
# NOTE(review): newer xgboost releases expect eval_metric / early_stopping_rounds
# on the XGBClassifier constructor rather than on fit() — confirm against the
# installed xgboost version before upgrading.
model.fit(X_train, y_train, eval_metric = "error", eval_set = eval_set, early_stopping_rounds = 50, verbose = False)

# Save the model to file; the with-block closes the handle even if pickling fails
with open("num_planets_model.p", "wb") as model_file:
    pkl.dump(model, model_file)

## Train

## Test

In [6]:
# Predict on the held-out test set, then round every prediction to an integer
# and collect the results in a plain list
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [7]:
# Score the rounded predictions against the test labels and report it as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 83.29%
