I originally fit a Lasso model, but then realized that Lasso is a regression technique, whereas this is a classification problem. I'm keeping it in the notebook for future reference, however, because I believe `lasso_path` can be used for classification.

In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
import pandas as pd

 ## Read the CSV and Perform Basic Data Cleaning     

In [None]:
# Load the exoplanet table, discard columns in which every value is null,
# then discard any row that still contains a null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

In [None]:
# One-hot encode the frame (by default pandas expands the object/category
# columns and leaves numeric columns untouched).
df = pd.get_dummies(data=df)
df.head()

## Select your features (columns)     

In [None]:
# The one-hot koi_disposition_* columns are the targets (y);
# every remaining column is a feature (X).
target_columns = [
    'koi_disposition_CANDIDATE',
    'koi_disposition_CONFIRMED',
    'koi_disposition_FALSE POSITIVE',
]
y = df[target_columns]
X = df.drop(columns=target_columns)
print(X.shape, y.shape)

## Create a Train Test Split     
*Use koi_disposition for the y values*

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set using sklearn's default split size; the fixed seed
# keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Pre-processing     
*Scale the data using the MinMaxScaler and perform some feature selection*

In [None]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Features: learn the per-column min/max from the training split only,
# then apply that same mapping to the test split.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Targets: same procedure.
# NOTE(review): y looks like one-hot 0/1 columns, in which case this scaling
# is probably a no-op -- confirm before relying on it.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

## Train the Model     
---------------

In [None]:
from sklearn.linear_model import Lasso

# Fit an L1-regularised linear regression on the scaled training split.
lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train_scaled)
lasso

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate on the held-out split: R2 comes from the estimator's own score(),
# MSE from comparing explicit predictions with the scaled targets.
predictions = lasso.predict(X_test_scaled)
r2 = lasso.score(X_test_scaled, y_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
# R2 on each split. Score against the *scaled* targets: those are what the
# model was fitted on, and what the MSE/R2 evaluation cell compares against.
print(f"Training Data Score: {lasso.score(X_train_scaled, y_train_scaled)}")
print(f"Testing Data Score: {lasso.score(X_test_scaled, y_test_scaled)}")

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters. selection='random' updates a randomly chosen
# coefficient each iteration, so random_state is pinned to keep the search
# reproducible from run to run (it has no effect when selection='cyclic').
param_grid = {
    'alpha': [.0001],
    'tol': [.001],
    'selection': ['random', 'cyclic'],
    'random_state': [42],
}  # 0.521678474811022 -- presumably the best score from an earlier run; TODO confirm
grid = GridSearchCV(lasso, param_grid, verbose=3)

In [None]:
# Run the grid search: cross-validated fits for every parameter combination
# on the scaled training split.
grid.fit(X=X_train_scaled, y=y_train_scaled)

In [None]:
# Report the winning parameter combination and its mean cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")