In [None]:
!pip install xgboost

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt


from datetime import datetime

import xgboost as xgb
import math

In [None]:
def drop_columns(dataframe,to_keep):
    """
    Return a new frame containing only the columns of ``dataframe`` listed in
    ``to_keep``, with the index reset to 0..n-1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter. It is not modified.
    to_keep : collection of column labels
        Columns to retain; labels that are not present in ``dataframe`` are
        simply ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained columns in their original order.
    """
    # Inspect the frame that was passed in, not a notebook-level global.
    unwanted = [col for col in dataframe.columns if col not in to_keep]
    # drop=True discards the old index instead of materialising it as an
    # extra "index" column that would then have to be removed again.
    return dataframe.drop(columns=unwanted).reset_index(drop=True)

def encode_dates(x):
    """
    Rewrite a whitespace-separated date whose second token is an English
    month name (e.g. "2021 March 20") as "first/month-number/third", with the
    month encoded as 1..12 (e.g. "2021/3/20").
    """
    month_names = ("January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November",
                   "December")
    month_number = {name: pos for pos, name in enumerate(month_names, start=1)}

    tokens = x.split()
    # The month is resolved before the third token is touched.
    tokens[1] = str(month_number[tokens[1]])
    return '/'.join((tokens[0], tokens[1], tokens[2]))

In [None]:
# Read the eclipse CSV into a DataFrame and preview its first rows.
df = pd.read_csv("../input/solar-eclipses/solar.csv")
df.head()

In [None]:
# Many columns are not useful to us, so we keep only the columns listed below:
to_keep = ['Calendar Date','Eclipse Type']
df = drop_columns(df,to_keep)

In [None]:
def date_to_jd(date):
    """
    Convert a calendar date to a Julian Day number.

    Parameters
    ----------
    date : sequence of three numbers
        ``(year, month, day)``. Any three-item sequence (list, tuple, ...)
        is accepted.

    Returns
    -------
    float or None
        The Julian Day for dates on or after 1582-10-15 (Gregorian branch) or
        on or before 1582-10-04 (Julian branch); ``None`` for 5-14 October
        1582, which is covered by neither branch.
    """
    year, month, day = date
    # Compare as a tuple so the check does not depend on the caller passing
    # a list specifically (list-vs-tuple comparison raises TypeError).
    ymd = (year, month, day)

    if ymd >= (1582, 10, 15):
        # Gregorian date
        return 367*year - (7*(year+int((month+9)/12)))//4 - (3*(int((year+(month-9)/7)/100)+1))//4+(275*month)//9+day+1721028.5

    if ymd <= (1582, 10, 4):
        # Julian date
        return 367*year - (7*(year+5001+int((month-9)/7)))//4+(275*month)//9+day+1729776.5

    # 5-14 October 1582: no conversion is defined here.
    return None

def jd_to_date(jd):
    """
    Convert a Julian Day number to a ``(year, month, day)`` tuple.

    The fractional part of the day is computed but truncated away in the
    returned day value.
    """
    shifted = jd + 0.5
    whole = int(shifted)
    frac = shifted % 1

    # From day number 2299161 onwards a century-based correction is applied.
    if whole >= 2299161:
        centuries = int((whole - 1867216.25) / 36524.25)
        a = whole + 1 + centuries - centuries // 4
    else:
        a = whole

    b = a + 1524
    c = int((b - 122.1) / 365.25)
    days_in_c = int(365.25 * c)
    e = int((b - days_in_c) / 30.6001)

    day = b - days_in_c - int(30.6001 * e) + frac
    month = e - 1 if e < 14 else e - 13
    year = c - 4716 if month > 2 else c - 4715

    return year, month, int(day)

In [None]:
# Show every eclipse type and collect the ones whose label starts with "P".
# NOTE(review): the original note said hybrid and partial eclipses are not of
# interest, but only labels starting with "P" are selected here - confirm.
print("Every type:",df['Eclipse Type'].unique())

types_drop = [label for label in df['Eclipse Type'].unique() if label[0] == "P"]

print("Types to drop:",types_drop)

In [None]:
print("Every type:",df['Eclipse Type'].unique())
# Remove, in place, every row whose eclipse type is listed in types_drop.
df.drop(df.index[df['Eclipse Type'].isin(types_drop)], inplace=True)

print("Types after drop:",df['Eclipse Type'].unique())

In [None]:
# Build a 'Time' column holding each date as a Julian Day.
# NOTE(review): the original note talked about combining the calendar date
# with the eclipse time, but only 'Calendar Date' is used here.

# First rewrite the textual dates as "year/month/day" with a numeric month ...
df['Calendar Date'] = df['Calendar Date'].apply(encode_dates)

# ... then split each one into integers and convert it to a Julian Day.
df['Time'] = df['Calendar Date'].apply(
    lambda stamp: date_to_jd(list(map(int, stamp.split('/'))))
)


In [None]:
# Keep only the eclipse type and the Julian Day 'Time' column, then preview.
to_keep = ['Eclipse Type','Time']
df = drop_columns(df,to_keep)
df.head()

In [None]:
# Compute the gaps (in days) between consecutive rows with pandas' .diff().
# .diff() yields NaN only for the first row, so only that leading entry is
# skipped; the last difference is a valid value and is kept.
difference = df['Time'].diff().to_list()[1:]
plt.hist(difference,bins=100)
plt.xlabel('Time (days)')
plt.show()

In [None]:
# Split the data so the model is trained on eclipses up to 20.03.2021 and the
# predictions are checked against the eclipses after 20.03.2021.
day = date_to_jd([2021, 3, 20])

cols = df.columns.to_list()

# Select rows by the 'Time' column by name rather than by column position.
# Rows whose 'Time' is not <= day (including missing values) go to "after".
is_before = df['Time'] <= day
before_df = df.loc[is_before, cols].reset_index(drop=True)
after_df = df.loc[~is_before, cols].reset_index(drop=True)

# Row lists kept under their original names for any cell that still reads them.
before_list = before_df.values.tolist()
after_list = after_df.values.tolist()

before_df.tail()

In [None]:
# Rebind before_list / after_list to plain lists of the 'Time' values of each split.
before_list = before_df['Time'].tolist()
after_list = after_df['Time'].tolist()

In [None]:
# difference[i] is the gap between rows i and i+1 of df, so the first
# len(before_list)-1 gaps lie entirely inside the "before" split; everything
# from there on goes to the "after" part. Values are truncated to int.
before_diff = list(map(int, difference[:len(before_list) - 1]))
after_diff = list(map(int, difference[len(before_list) - 1:]))

In [None]:
# Window length: the number of "before" differences integer-divided by 82.
L = len(before_diff) // 82

# Each sample is a run of L consecutive differences; its target is the
# difference that immediately follows that run.
X = np.array([before_diff[start:start + L]
              for start in range(len(before_diff) - L)])
y = np.array(before_diff[L:])

In [None]:
# Hold out the last fraction p of the samples for validation and train on the
# rest (no shuffling: the split follows the order of the samples).
p = 0.15

# X and y are built in lockstep, so one cut position serves both.
split_at = int((1 - p) * len(X))

X_train, X_val = X[:split_at], X[split_at:]
y_train, y_val = y[:split_at], y[split_at:]

In [None]:
# Fit a random forest of 1000 trees and report the share of validation
# targets that are predicted exactly.
lrn = RandomForestClassifier(1000)
lrn.fit(X_train, y_train)

pred = lrn.predict(X_val)
k = sum(int(truth) == int(guess) for truth, guess in zip(y_val, pred))
print("Acuratete RandomForestClassifier {:.2f}%".format((k*100/len(y_val))))

In [None]:
# Fit a gradient-boosted classifier and report the share of validation
# targets that are predicted exactly.
# The regression objective "reg:linear" (a deprecated alias) does not belong
# on a classifier, so the classifier is left to choose its own objective.
# NOTE(review): recent xgboost releases may require class labels encoded as
# 0..n_classes-1 - verify against the installed version.
lrn = xgb.XGBClassifier(learning_rate = 0.05, max_depth = 10)
lrn.fit(X_train, y_train)

k = 0
pred = lrn.predict(X_val)
for i in range(len(y_val)):
    if int(y_val[i]) == int(pred[i]):
        k+=1
print("Acuratete pentru XGB {:.2f}%".format(k*100/len(y_val)))

In [None]:
# After these 2 models, we can conclude that XGBClassifier is the better solution.
# Train the model again on all the data (X, y).
lrn.fit(X, y)   

In [None]:
N_future = 10   #number of predictions into the future

# Prediction part: every new prediction is appended to the feature window,
# which is then used to predict the next value.

# Seed window: the last training sample shifted left by one position, with
# its own target placed in the final slot.
x_valid = np.array(X[-1])
x_valid = np.roll(x_valid,-1)
x_valid[-1] = y[-1]
y_pred = []

for i in range(N_future):

    # predict() expects a 2-D array, hence the extra list around the window.
    yy = lrn.predict(np.array([x_valid]))
    y_pred.append(yy[0])

    # Slide the window and store the scalar prediction in the freed slot.
    x_valid = np.roll(x_valid,-1)
    x_valid[-1] = yy[0]


print("Predicite:      ", y_pred)
print("Adevarate: ", after_diff[:N_future])