In [2]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\todds\anaconda3\envs\pythonadv\lib\site-packages (0.0)


In [3]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [4]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CANDIDATE,0,0,0,0,0.25982,4.04e-07,-4.04e-07,131.85061,0.00137,...,-62,4.736,0.028,-0.035,0.526,0.035,-0.035,281.11646,43.28244,15.174
1,FALSE POSITIVE,0,1,0,0,0.299698,1.91e-07,-1.91e-07,132.017121,0.000528,...,-154,4.547,0.058,-0.071,0.782,0.09,-0.074,296.96381,50.74538,14.828
2,CANDIDATE,0,0,0,0,0.306702,7.19e-07,-7.19e-07,131.51216,0.00207,...,-184,4.512,0.095,-0.085,0.786,0.11,-0.099,289.82599,43.725231,15.229
3,FALSE POSITIVE,0,1,0,0,0.306938,3.33e-07,-3.33e-07,131.635518,0.0009,...,-138,4.58,0.044,-0.061,0.741,0.081,-0.066,287.73572,42.823421,14.409
4,FALSE POSITIVE,0,1,0,0,0.328687,4.62e-07,-4.62e-07,132.77146,0.00126,...,-160,4.535,0.048,-0.143,0.847,0.181,-0.077,288.41684,47.731091,15.316


# Select your features (columns)

In [6]:
# NEW:  Dropping "CANDIDATE" values from disposition column to try running date only on confirmed of false postives
drop_candidate = df[df['koi_disposition'] != "CANDIDATE"]
drop_candidate.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
1,FALSE POSITIVE,0,1,0,0,0.299698,1.91e-07,-1.91e-07,132.017121,0.000528,...,-154,4.547,0.058,-0.071,0.782,0.09,-0.074,296.96381,50.74538,14.828
3,FALSE POSITIVE,0,1,0,0,0.306938,3.33e-07,-3.33e-07,131.635518,0.0009,...,-138,4.58,0.044,-0.061,0.741,0.081,-0.066,287.73572,42.823421,14.409
4,FALSE POSITIVE,0,1,0,0,0.328687,4.62e-07,-4.62e-07,132.77146,0.00126,...,-160,4.535,0.048,-0.143,0.847,0.181,-0.077,288.41684,47.731091,15.316
5,FALSE POSITIVE,0,1,0,0,0.33907,4.23e-07,-4.23e-07,131.86957,0.00105,...,-152,3.846,0.56,-0.14,2.183,0.496,-1.158,296.7019,42.508652,12.771
6,CONFIRMED,0,0,0,0,0.341842,2.28e-07,-2.28e-07,131.660336,0.000545,...,-136,4.601,0.03,-0.07,0.747,0.086,-0.058,285.41061,44.412209,14.915


In [7]:
# Further clean above by dropping error measurement columns
drop_columns = ['koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1', 'koi_time0bk_err2',
                'koi_impact_err1', 'koi_impact_err2', 'koi_steff_err1', 'koi_steff_err2', 
                'koi_duration_err1', 'koi_duration_err2', 'koi_slogg_err1', 'koi_slogg_err2', 
                'koi_srad_err1', 'koi_srad_err2', 'koi_depth_err1', 'koi_depth_err2', 'koi_prad_err1', 
                'koi_prad_err2', 'koi_insol_err1', 'koi_insol_err2']

# Check resulting table
clean_df = drop_candidate.drop(drop_columns, axis=1)
clean_df.head(2)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,...,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
1,FALSE POSITIVE,0,1,0,0,0.299698,132.017121,0.964,0.8473,346.9,...,2246,6005.79,53.6,1,5180,4.547,0.782,296.96381,50.74538,14.828
3,FALSE POSITIVE,0,1,0,0,0.306938,131.635518,0.005,0.5925,119.2,...,2142,5019.53,29.3,1,5088,4.58,0.741,287.73572,42.823421,14.409


In [8]:
# Assign X (data) and y (target); starting with full dataset

X = clean_df.drop('koi_disposition', axis=1)
y = clean_df['koi_disposition']


# Binary code y-data using pd.get_dummies
y = pd.get_dummies(y)

# Confirm data shapes
print(X.shape, y.shape)


(5304, 20) (5304, 2)


In [9]:
# Separeate features into their respective categories; run different "X1, X2, ..." scenarios through 
# the testing and training to identify best sets of data

disposition_params = ['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec']
transit_params = ['koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 
                  'koi_teq', 'koi_insol']
threshold_params = ['koi_model_snr', 'koi_tce_plnt_num']
stellar_params = ['koi_steff', 'koi_slogg', 'koi_srad']
kic_params = ['ra', 'dec', 'koi_kepmag']

# Form smaller data sets by referencing previous "X_" dataset and dropping additional colunns
X1 = X.drop(disposition_params, axis=1)
X2 = X1.drop(transit_params, axis=1)
X3 = X2.drop(threshold_params, axis=1)
X4 = X3.drop(stellar_params, axis=1)
print(X1.shape, X2.shape, X3.shape, X4.shape)


(5304, 16) (5304, 8) (5304, 6) (5304, 3)


In [10]:
# Review column headers
X4.columns

Index(['ra', 'dec', 'koi_kepmag'], dtype='object')

In [11]:
# Split data into training and testing models

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
# Call the Logistic Regression Model

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

LinearRegression()

In [13]:
# Fit the model

model.fit(X_train, y_train)


LinearRegression()

In [14]:
# Validate the model using hte test data

print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.7507894237307152
Testing Data Score: 0.7309605744455857


In [15]:
# Loop through different scenarios to test data combinations:

regress_set = [X, X1, X2, X3, X4]

for item in regress_set:
    X_train, X_test, y_train, y_test = train_test_split(item, y, random_state=1)
    model.fit(X_train, y_train)
    print(f"Data columms include: {item.columns}")
    print(f"Training Data Score: {model.score(X_train, y_train)}")
    print(f"Testing Data Score: {model.score(X_test, y_test)}")
    print('--------------------')
    

Data columms include: Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
       'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')
Training Data Score: 0.7507894237307152
Testing Data Score: 0.7309605744455857
--------------------
Data columms include: Index(['koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth',
       'koi_prad', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_slogg', 'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')
Training Data Score: 0.3141664569780041
Testing Data Score: 0.2815802837867136
--------------------
Data columms include: Index(['koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_slogg',
       'koi_srad', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')
Training Dat

In [16]:
# Check predidcted against actual values

predictions = model.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10]}")

First 10 Predictions:   [[0.3622222  0.6377778 ]
 [0.25706129 0.74293871]
 [0.35188042 0.64811958]
 [0.50538432 0.49461568]
 [0.2390077  0.7609923 ]
 [0.21694875 0.78305125]
 [0.31748538 0.68251462]
 [0.2668863  0.7331137 ]
 [0.38577883 0.61422117]
 [0.24387941 0.75612059]]
First 10 Actual labels:       CONFIRMED  FALSE POSITIVE
5221          1               0
5580          1               0
5292          1               0
4785          0               1
3700          0               1
6218          0               1
6720          0               1
5445          0               1
2207          1               0
1183          0               1


# Save the Model

In [17]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'todd_schanzlin_linear_regresssion.sav'
joblib.dump(model, filename)

['todd_schanzlin_linear_regresssion.sav']