In [385]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, PowerTransformer, PolynomialFeatures, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectFromModel
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn import set_config

In [386]:
# Energy Intensity

URL_0 = 'https://api.filgreen.d.interplanetary.one/models/model?id=0'

data_0 = pd.read_json(URL_0)

# Energy consumption rate

URL_2 = 'https://api.filgreen.d.interplanetary.one/models/model?id=2'

data_2 = pd.read_json(URL_2)

# Energy used to seal data 

URL_3 = 'https://api.filgreen.d.interplanetary.one/models/model?id=3'

data_3 = pd.read_json(URL_3)

# Energy used to store data 

URL_4 = 'https://api.filgreen.d.interplanetary.one/models/model?id=4'

data_4 = pd.read_json(URL_4)

# Cumulative Energy Use

URL_5 = 'https://api.filgreen.d.interplanetary.one/models/model?id=5'

data_5 = pd.read_json(URL_5)

# Data Storage capacity added per day

URL_7 = 'https://api.filgreen.d.interplanetary.one/models/model?id=7'

data_7 = pd.read_json(URL_7)

In [387]:
df_0 = pd.json_normalize(data_0['data'][1]['data'])
df_2 = pd.json_normalize(data_2['data'][1]['data'])
df_3 = pd.json_normalize(data_3['data'][1]['data'])
df_4 = pd.json_normalize(data_4['data'][0]['data'])
df_5 = pd.json_normalize(data_5['data'][1]['data'])
df_7 = pd.json_normalize(data_7['data'][0]['data'])

In [388]:
df = pd.DataFrame()

df['Date'] = pd.to_datetime(df_0[68:-9].start_date, utc=True)
df['year'] = df.Date.dt.year
df['month'] = df.Date.dt.month
#df.Date = df.Date.dt.strftime('%Y-%m-%d')
df['Energy Intensity'] = df_0[68:-9].value
df['Energy consumption rate'] = df_2[68:-9].value
df['Energy used to seal data'] = df_3[68:-9].value
df['Energy used to seal data'] = df_4[68:-9].value
df['Cumulative Energy Use'] = df_5[68:-9].value
df['Data Storage capacity added per day'] = df_7[68:-9].value

In [389]:
binance_data = pd.read_csv("/kaggle/input/binance-filusdt/FILUSDT-1d.csv")

In [390]:
binance_data.open_time = binance_data.open_time.apply(lambda x: datetime.utcfromtimestamp(x/1000))

In [391]:
binance_data_columns = ['open', 'high', 'low', 'close', 'volume',
       'qav', 'num_trades', 'taker_base_vol', 'taker_quote_vol', 'Date']

In [392]:
binance_data['Date'] = pd.to_datetime(binance_data['open_time'], utc=True)

In [393]:
df = df.merge(binance_data[binance_data_columns][17:-9], on='Date')

In [394]:
final_df = df.groupby(['year', 'month']).agg([np.mean, np.median, np.sum])
final_df = final_df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

In [395]:
final_df = final_df.append(final_df[-1:], ignore_index=True)

In [396]:
for x in final_df.columns:
    if x == ('Data Storage capacity added per day', 'median'):
        pass
    final_df[x] = final_df[x].shift(1)

In [397]:
final_df = final_df.dropna(axis=0)

In [398]:
for col in final_df.columns:
    final_df[x] = final_df[x].astype(float)

# Training

In [399]:
ceiling = int(len(final_df)*0.8)
train, test = final_df[:ceiling], final_df[ceiling:]

In [400]:
cols = [#('Data Storage capacity added per day', 'sum'),
            (                               'open',   'mean'),
            (                               'open', 'median'),
            (                               'open',    'sum'),
            (                               'high',   'mean'),
            (                               'high', 'median'),
            (                               'high',    'sum'),
            (                                'low',   'mean'),
            (                                'low', 'median'),
            (                                'low',    'sum'),
            (                              'close',   'mean'),
            (                              'close', 'median'),
            (                              'close',    'sum'),
            (                             'volume',   'mean'),
            (                             'volume', 'median'),
            (                             'volume',    'sum'),
            (                                'qav',   'mean'),
            (                                'qav', 'median'),
            (                                'qav',    'sum'),
            (                         'num_trades',   'mean'),
            (                         'num_trades', 'median'),
            (                         'num_trades',    'sum'),
            (                     'taker_base_vol',   'mean'),
            (                     'taker_base_vol', 'median'),
            (                     'taker_base_vol',    'sum'),
            (                    'taker_quote_vol',   'mean'),
            (                    'taker_quote_vol', 'median'),
            (                    'taker_quote_vol',    'sum')]

In [401]:
clf_xg = Pipeline(steps=[
                    ('imputer', SimpleImputer()),
                    ('pre', RobustScaler()),
                    ('poly', PolynomialFeatures(2)),
                    ('selection', SelectFromModel(estimator=RandomForestRegressor(n_estimators=150, random_state=1))),
                    ('model', RandomForestRegressor(n_estimators=450, random_state=1))
                    ])

In [402]:
set_config(display="diagram")
clf_xg

In [408]:
clf_xg.fit(final_df[cols][:-1].values, final_df[('Data Storage capacity added per day', 'median')][:-1].values)

In [410]:
clf_xg.predict(final_df[cols][-1:].values)[0]

20432085538.684444