In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CANDIDATE,0,0,0,0,0.25982,4.04e-07,-4.04e-07,131.85061,0.00137,...,-62,4.736,0.028,-0.035,0.526,0.035,-0.035,281.11646,43.28244,15.174
1,FALSE POSITIVE,0,1,0,0,0.299698,1.91e-07,-1.91e-07,132.017121,0.000528,...,-154,4.547,0.058,-0.071,0.782,0.09,-0.074,296.96381,50.74538,14.828
2,CANDIDATE,0,0,0,0,0.306702,7.19e-07,-7.19e-07,131.51216,0.00207,...,-184,4.512,0.095,-0.085,0.786,0.11,-0.099,289.82599,43.725231,15.229
3,FALSE POSITIVE,0,1,0,0,0.306938,3.33e-07,-3.33e-07,131.635518,0.0009,...,-138,4.58,0.044,-0.061,0.741,0.081,-0.066,287.73572,42.823421,14.409
4,FALSE POSITIVE,0,1,0,0,0.328687,4.62e-07,-4.62e-07,132.77146,0.00126,...,-160,4.535,0.048,-0.143,0.847,0.181,-0.077,288.41684,47.731091,15.316


# Select your features (columns)

In [110]:
# NEW:  Dropping "CANDIDATE" values from disposition column to try running date only on confirmed of false postives
drop_candidate = df[df['koi_disposition'] != "CANDIDATE"]
drop_candidate.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
1,FALSE POSITIVE,0,1,0,0,0.299698,1.91e-07,-1.91e-07,132.017121,0.000528,...,-154,4.547,0.058,-0.071,0.782,0.09,-0.074,296.96381,50.74538,14.828
3,FALSE POSITIVE,0,1,0,0,0.306938,3.33e-07,-3.33e-07,131.635518,0.0009,...,-138,4.58,0.044,-0.061,0.741,0.081,-0.066,287.73572,42.823421,14.409
4,FALSE POSITIVE,0,1,0,0,0.328687,4.62e-07,-4.62e-07,132.77146,0.00126,...,-160,4.535,0.048,-0.143,0.847,0.181,-0.077,288.41684,47.731091,15.316
5,FALSE POSITIVE,0,1,0,0,0.33907,4.23e-07,-4.23e-07,131.86957,0.00105,...,-152,3.846,0.56,-0.14,2.183,0.496,-1.158,296.7019,42.508652,12.771
6,CONFIRMED,0,0,0,0,0.341842,2.28e-07,-2.28e-07,131.660336,0.000545,...,-136,4.601,0.03,-0.07,0.747,0.086,-0.058,285.41061,44.412209,14.915


In [111]:
# Set features. This will also be used as your x values.

stellar_parameters = ['koi_steff', 'koi_slogg', 'koi_srad']
kic_parameters = ['ra', 'dec', 'koi_kepmag']
selected_features = drop_candidate[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 
                        'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 
                        'koi_model_snr', 'koi_tce_plnt_num', 
                        'koi_steff', 'koi_slogg', 'koi_srad', 
                        'ra', 'dec', 'koi_kepmag']]



# Create a Train Test Split

Use `koi_disposition` for the y values

In [112]:
y = pd.get_dummies(drop_candidate["koi_disposition"])
X = selected_features
print(X.shape, y.shape)

(5304, 19) (5304, 2)


In [119]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
942,0,0,0,0,170.408984,0.791,1.8594,1130.7,3.57,1569,1435.82,128.2,1,5550,4.5,0.954,288.32791,47.381592,15.342
6125,1,0,0,0,249.3136,0.174,8.73,432.6,1.95,351,3.58,8.1,1,5910,4.505,0.94,294.07483,50.39724,15.541
5492,0,0,0,0,359.87359,0.864,5.902,1051.6,2.75,390,5.45,28.9,4,4955,4.61,0.721,288.53867,40.61615,15.565
228,0,1,1,0,131.84831,0.039,1.3741,334.5,1.29,1598,1534.24,40.3,1,4842,4.616,0.726,292.5015,41.549061,15.553
1244,0,0,1,0,131.65812,0.057,1.284,42.2,1.04,2004,3820.16,11.2,1,5925,3.989,1.589,286.112,39.80323,13.095


In [120]:
# NEW Cell:  Create model usin linear regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [121]:
# NEW Cell:  Fit the model to the training date and calculate the scores for the training and testing data
model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.7402343980485975
Testing Score: 0.7338098426400593


In [None]:
# NEW Cell:


In [None]:
# NEW Cell:

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [122]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
y_scaler = MinMaxScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

# Train the Model



In [117]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: -4.065974307346926
Testing Data Score: -4.123030994308509


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)