First we import all the libraries and modules we need for this project.

In [0]:
import mlflow
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt

from numpy import savetxt

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

Then we import the data from the CoinMarketCap API. We had to create an account there in order to get an API key. We used this key to obtain 5000 rows of the latest listings data available on the API. We couldn't load data in real time or more rows because we use a free CoinMarketCap account.

In [0]:
# Fetch the latest cryptocurrency listings from the CoinMarketCap Pro API
# using the `requests` library.

import json
import os

from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects

url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest'
parameters = {
  'start':'1',
  'limit':'5000',
  'convert':'USD'
}
headers = {
  # The standard HTTP content-negotiation header is `Accept` (not `Accepts`).
  'Accept': 'application/json',
  # Read the key from the environment so the secret is never stored in the notebook.
  # Set CMC_PRO_API_KEY before running this cell; a missing variable raises KeyError.
  'X-CMC_PRO_API_KEY': os.environ['CMC_PRO_API_KEY'],
}

session = Session()
session.headers.update(headers)

try:
  # requests does not time out unless told to; without this argument a stalled
  # connection would hang the cell and the Timeout handler below could never fire.
  response = session.get(url, params=parameters, timeout=30)
  data = json.loads(response.text)
  print(data)
except (ConnectionError, Timeout, TooManyRedirects) as e:
  print(e)
 

{'status': {'timestamp': '2021-12-31T16:34:14.424Z', 'error_code': 0, 'error_message': None, 'elapsed': 469, 'credit_count': 25, 'notice': None, 'total_count': 8700}, 'data': [{'id': 1, 'name': 'Bitcoin', 'symbol': 'BTC', 'slug': 'bitcoin', 'num_market_pairs': 8922, 'date_added': '2013-04-28T00:00:00.000Z', 'tags': ['mineable', 'pow', 'sha-256', 'store-of-value', 'state-channel', 'coinbase-ventures-portfolio', 'three-arrows-capital-portfolio', 'polychain-capital-portfolio', 'binance-labs-portfolio', 'blockchain-capital-portfolio', 'boostvc-portfolio', 'cms-holdings-portfolio', 'dcg-portfolio', 'dragonfly-capital-portfolio', 'electric-capital-portfolio', 'fabric-ventures-portfolio', 'framework-ventures-portfolio', 'galaxy-digital-portfolio', 'huobi-capital-portfolio', 'alameda-research-portfolio', 'a16z-portfolio', '1confirmation-portfolio', 'winklevoss-capital-portfolio', 'usv-portfolio', 'placeholder-ventures-portfolio', 'pantera-capital-portfolio', 'multicoin-capital-portfolio', 'par

So now we have a lot of data on the 5000 biggest cryptocurrencies. We now have to clean and preprocess the data in order to use a model.

First, we decided to use another format for our data. Indeed, the API returns JSON, but it is easier to manipulate a dataframe when cleaning the dataset. Then we display the data:

In [0]:
# Each element of data["data"] is one cryptocurrency record, so it becomes one row.
listings = data["data"]
df = pd.DataFrame(listings)
df.head()

Unnamed: 0,id,name,symbol,slug,num_market_pairs,date_added,tags,max_supply,circulating_supply,total_supply,platform,cmc_rank,last_updated,quote
0,1,Bitcoin,BTC,bitcoin,8922,2013-04-28T00:00:00.000Z,"[mineable, pow, sha-256, store-of-value, state...",21000000.0,18915920.0,18915920.0,,1,2021-12-31T16:33:00.000Z,"{'USD': {'price': 48050.793124232, 'volume_24h..."
1,1027,Ethereum,ETH,ethereum,5365,2015-08-07T00:00:00.000Z,"[mineable, pow, smart-contracts, ethereum-ecos...",,118968100.0,118968100.0,,2,2021-12-31T16:33:00.000Z,"{'USD': {'price': 3787.087165677843, 'volume_2..."
2,1839,Binance Coin,BNB,binance-coin,601,2017-07-25T00:00:00.000Z,"[marketplace, centralized-exchange, payments, ...",166801148.0,166801100.0,166801100.0,,3,2021-12-31T16:33:00.000Z,"{'USD': {'price': 522.5580228219984, 'volume_2..."
3,825,Tether,USDT,tether,25057,2015-02-25T00:00:00.000Z,"[payments, stablecoin, asset-backed-stablecoin...",,78336880000.0,81758990000.0,"{'id': 1027, 'name': 'Ethereum', 'symbol': 'ET...",4,2021-12-31T16:34:00.000Z,"{'USD': {'price': 1.0000419981345485, 'volume_..."
4,5426,Solana,SOL,solana,215,2020-04-10T00:00:00.000Z,"[pos, platform, solana-ecosystem, cms-holdings...",,309483800.0,511616900.0,,5,2021-12-31T16:34:00.000Z,"{'USD': {'price': 175.92551007747838, 'volume_..."


In [0]:
# Display the (rows, columns) dimensions of the listings frame.
df.shape

Out[54]: (5000, 14)

We chose to delete some columns that weren't useful for training our model.

In [0]:
# Discard the columns that are not needed for training the model.
unused_columns = ['slug', 'date_added', 'platform', 'cmc_rank']
df = df.drop(columns=unused_columns)

We modified the quote column, which contained nested JSON data, to keep only its USD entry.

In [0]:
# Each raw quote is a dict keyed by currency code; confirm the type, then keep only the USD entry.
first_quote = df['quote'][0]
print(type(first_quote))
df['quote'] = df['quote'].map(lambda quote: quote['USD'])

<class 'dict'>


In [0]:
# Preview the first rows of the frame to check the transformed quote column.
df.head()

Unnamed: 0,id,name,symbol,num_market_pairs,tags,max_supply,circulating_supply,total_supply,last_updated,quote
0,1,Bitcoin,BTC,8922,"[mineable, pow, sha-256, store-of-value, state...",21000000.0,18915920.0,18915920.0,2021-12-31T16:33:00.000Z,"{'price': 48050.793124232, 'volume_24h': 32612..."
1,1027,Ethereum,ETH,5365,"[mineable, pow, smart-contracts, ethereum-ecos...",,118968100.0,118968100.0,2021-12-31T16:33:00.000Z,"{'price': 3787.087165677843, 'volume_24h': 130..."
2,1839,Binance Coin,BNB,601,"[marketplace, centralized-exchange, payments, ...",166801148.0,166801100.0,166801100.0,2021-12-31T16:33:00.000Z,"{'price': 522.5580228219984, 'volume_24h': 197..."
3,825,Tether,USDT,25057,"[payments, stablecoin, asset-backed-stablecoin...",,78336880000.0,81758990000.0,2021-12-31T16:34:00.000Z,"{'price': 1.0000419981345485, 'volume_24h': 52..."
4,5426,Solana,SOL,215,"[pos, platform, solana-ecosystem, cms-holdings...",,309483800.0,511616900.0,2021-12-31T16:34:00.000Z,"{'price': 175.92551007747838, 'volume_24h': 13..."


In [0]:
# Inspect the fields available in the quote of the first row.
df['quote'][0]

Out[58]: {'price': 48050.793124232,
 'volume_24h': 32612533526.108036,
 'volume_change_24h': 15.8324,
 'percent_change_1h': 0.11496364,
 'percent_change_24h': 1.6687892,
 'percent_change_7d': -6.02435806,
 'percent_change_30d': -18.18529221,
 'percent_change_60d': -21.70909481,
 'percent_change_90d': 0.38380333,
 'market_cap': 908925198928.4882,
 'market_cap_dominance': 40.3371,
 'fully_diluted_market_cap': 1009066655608.87,
 'last_updated': '2021-12-31T16:33:00.000Z'}

We create two columns (volume_change_24h and percent_change_7d) from the quote data.
In the code below, percent_change_7d is the data we will predict.
volume_change_24h is used as one of the features that help us train the model.

In [0]:
# Pull the 24h volume change out of each quote dict into its own column.
df['volume_change_24h'] = df['quote'].map(lambda quote: quote['volume_change_24h'])

In [0]:
# Pull the 7-day percent change out of each quote dict into its own column.
df['percent_change_7d'] = df['quote'].map(lambda quote: quote['percent_change_7d'])

In [0]:
# max_supply has missing values (e.g. for Ethereum, Tether and Solana in the
# earlier preview) and is not used as a feature, so drop it.
df = df.drop(columns=['max_supply'])

Now we have the updated dataset

In [0]:
# Preview the first rows, including the new volume_change_24h and percent_change_7d columns.
df.head()

Unnamed: 0,id,name,symbol,num_market_pairs,tags,circulating_supply,total_supply,last_updated,quote,volume_change_24h,percent_change_7d
0,1,Bitcoin,BTC,8922,"[mineable, pow, sha-256, store-of-value, state...",18915920.0,18915920.0,2021-12-31T16:33:00.000Z,"{'price': 48050.793124232, 'volume_24h': 32612...",15.8324,-6.024358
1,1027,Ethereum,ETH,5365,"[mineable, pow, smart-contracts, ethereum-ecos...",118968100.0,118968100.0,2021-12-31T16:33:00.000Z,"{'price': 3787.087165677843, 'volume_24h': 130...",-12.2644,-7.570361
2,1839,Binance Coin,BNB,601,"[marketplace, centralized-exchange, payments, ...",166801100.0,166801100.0,2021-12-31T16:33:00.000Z,"{'price': 522.5580228219984, 'volume_24h': 197...",-4.5957,-4.815197
3,825,Tether,USDT,25057,"[payments, stablecoin, asset-backed-stablecoin...",78336880000.0,81758990000.0,2021-12-31T16:34:00.000Z,"{'price': 1.0000419981345485, 'volume_24h': 52...",-9.7481,-0.025268
4,5426,Solana,SOL,215,"[pos, platform, solana-ecosystem, cms-holdings...",309483800.0,511616900.0,2021-12-31T16:34:00.000Z,"{'price': 175.92551007747838, 'volume_24h': 13...",-28.2281,-10.545477


Then, we use MLflow to keep a history of our model's results and save our predictions:

In [0]:
# Features: every remaining column except identifiers, text, timestamps,
# the nested quote dict and the target itself.
non_feature_columns = ['id', 'name', 'symbol', 'tags', 'last_updated', 'quote', 'percent_change_7d']
X = df.drop(columns=non_feature_columns)
# Target: the percent_change_7d value extracted from the quote.
y = df['percent_change_7d']
# NOTE(review): rows with missing feature/target values are not filtered here —
# confirm the estimator used downstream accepts them.
# Fix the seed so the train/test split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [0]:
# Enable autolog()
# mlflow.sklearn.autolog() requires mlflow 1.11.0 or above.
mlflow.sklearn.autolog()

# With autolog() enabled, all model parameters, a model score, and the fitted model are automatically logged.
with mlflow.start_run():

  # Set the model parameters.
  n_estimators = 100
  max_depth = 5
  max_features = 3

  # Create and train model.
  # random_state is fixed so repeated runs build the same forest and produce comparable metrics.
  rf = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    random_state=42,
  )
  rf.fit(X_train, y_train)

  # Use the model to make predictions on the test dataset.
  predictions = rf.predict(X_test)

  # Evaluate the predictions on the held-out data and record the score in the
  # same MLflow run, so each run in the history has a comparable test metric.
  test_mse = mean_squared_error(y_test, predictions)
  mlflow.log_metric("test_mse", test_mse)
  print(f"Test MSE: {test_mse:.4f}")

