In [1]:
import joblib
import numpy as np
import pandas as pd
# Import packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Sklearn Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error, precision_score, confusion_matrix, accuracy_score

# Visualizes all the columns
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:

array = joblib.load('Dataset.joblib')

# if it does't work use the line below
# array = joblib.load('Dataset1.joblib')


In [3]:
len(array)

3768

In [4]:
df = pd.DataFrame(array)

In [5]:
#df.to_csv("Exoplanet_Flux_Dataset.csv",index=False)

In [5]:
df.shape

(3768, 53256)

In [6]:
max_length = df.shape[1] - 1

In [7]:
max_length

53255

In [8]:
df['EC'] = df[1].apply(lambda x: 1 if x == 'CONFIRMED' else 1 if x == 'CANDIDATE' else 0 )

In [9]:
features = df.drop(columns=[0,1,'EC'])
target = df.EC

In [10]:
medians=features.median(axis=1)

In [11]:
df_dict = features.to_dict(orient='index')
    

In [12]:
for x in range(0,len(df_dict)):
    for y in range(2,max_length+1):
        df_dict[x][y] = df_dict[x][y] /medians[x] -1
    

In [13]:

for x in range(0,len(df_dict)):
    for y in range(2,max_length+1):
        if str(df_dict[x][y]) == "nan":
            df_dict[x][y] = -999
        else:
            df_dict[x][y] = float(df_dict[x][y])


In [14]:
features = pd.DataFrame.from_dict(df_dict,orient='index')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=1, test_size=.25)

In [16]:
# Evaluation function

def evaluation(y_true, y_pred):
    
# Print Accuracy, Recall, F1 Score, and Precision metrics.
    print('Evaluation Metrics:')
    print('Accuracy: ' + str(metrics.accuracy_score(y_test, y_pred)))
    print('Recall: ' + str(metrics.recall_score(y_test, y_pred)))
    print('F1 Score: ' + str(metrics.f1_score(y_test, y_pred)))
    print('Precision: ' + str(metrics.precision_score(y_test, y_pred)))
    
# Print Confusion Matrix
    print('\nConfusion Matrix:')
    print(' TN,  FP, FN, TP')
    print(confusion_matrix(y_true, y_pred).ravel())
    
# Function Prints best parameters for GridSearchCV
def print_results(results):
    print('Best Parameters: {}\n'.format(results.best_params_))

In [17]:
tree = DecisionTreeClassifier()

# Fitting Model to the train set
tree.fit(X_train, y_train)

# Predicting on the test set
y_pred = tree.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.7016985138004246
Recall: 0.7988422575976846
F1 Score: 0.7971119133574007
Precision: 0.7953890489913544

Confusion Matrix:
 TN,  FP, FN, TP
[109 142 139 552]


In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm.notebook import tqdm
from time import sleep
parameter_n_estimators = [500]
for i in tqdm(parameter_n_estimators):
    # Instantiate model
    forest = RandomForestClassifier(n_estimators=i, criterion='gini')
    # Fitting Model to the train set
    forest.fit(X_train, y_train)
    # Predicting on the test set
    y_pred = forest.predict(X_test)

    # Evaluating model
    evaluation(y_test, y_pred)
    print('Tree: %s ' % (i))

  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation Metrics:
Accuracy: 0.7579617834394905
Recall: 0.9479015918958031
F1 Score: 0.8517555266579974
Precision: 0.7733175914994097

Confusion Matrix:
 TN,  FP, FN, TP
[ 59 192  36 655]
Tree: 500 


In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

gradient_booster = GradientBoostingClassifier(learning_rate=0.1)
gradient_booster.get_params()

gradient_booster.fit(X_train,y_train)
y_pred = gradient_booster.predict(X_test)
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.7574468085106383
Recall: 0.9572039942938659
F1 Score: 0.8547770700636943
Precision: 0.7721518987341772

Confusion Matrix:
 TN,  FP, FN, TP
[ 41 198  30 671]


In [22]:
import lightkurve as lk

T_name = 'TIC 145241359'
search_result = lk.search_lightcurve(T_name)
data = []
for x in range(0,17):
    try:
        lc = search_result[x].download() 
        y = lc.flux
        for i in range(1,len(y),10):
            try:
                data.append(float(y[i].value))
            except:
                pass
    except :
        pass

arr2 = pd.DataFrame(data)
medians= arr2.median(axis=0)
arr3 =[]
for x in range(0,max_length-1):
    try:
        arr3.append((arr2[0][x] / medians)-1)
    except:
        arr3.append(-999)

for x in range(0,max_length-1):
    temp = str(arr3[x])
    if (temp == 'nan') or (temp == '0   NaN\ndtype: float64') :
        arr3[x] = -999

In [23]:
test = forest.predict([arr3])

In [26]:
print(test)

[0]
