In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px
from backend.app.fetch_data_yf import CSV, data_dir

In [None]:
class CryptoEvaluationPipeline(object):
    def __init__(self, data_file: str, clustered_data_file: str, representative_coins: list, window_days=[1, 7, 10]):
        """
        Initialise the evaluation pipeline.
        
        Args:
            data_file (str) : path the the top 30 cryptos dataset
            clustered_data_file (str) : path to the PCA clustered dataset
            windw_days (list): time horizon in days
        """
        self.data_df: pd.DataFrame = pd.read_csv(data_file, sep=',', encoding='utf8', index_col=0, parse_dates=True)
        self.clustered_data_file: pd.DataFrame = pd.read_csv(clustered_data_file, sep=',', encoding='utf8', index_col=0, parse_dates=True)
        self.representative_coins: dict = {}
        self.time_horizon: list = window_days
        self.num_pos_neg_corr_coins = 4
        # create a mapping of clusters and the representative coins
        for coin in representative_coins:
            cluster = self.clustered_data_file.at[coin, 'cluster']
            self.representative_coins[cluster] = coin
        
        print(self.representative_coins)
        
        # build the correlatino map, from representative coins, for analysis
        self.correlation_map = self._build_correlation_map()
        
    def _analyse_correlations(self):
        """
        Analyse correlations between selected coins and all cryptos in the dataset
        
        Args
            data (pandas.DataFrame): dataset of all coins and their closing prices
            crypto_cluster_representatives (list):
                                    list of selected coins
        Returns
            DataFrame: Correlation matrix
        """
        # get column names for representatives - should match but best to extract from the dataset
        representative_col_names = [col for col in self.data_df.columns if any(rep.lower() in col.lower() for rep in self.representative_coins.values())]
        print("representative coins found in dataset:\n",representative_col_names)
        # calculate the correlation matrix for the entire close dataset
        corr_matrix = self.data_df.corr()
        # extract the correlations for the selected coins
        representatives_correlation_matrix = corr_matrix[representative_col_names]
        # plot correlation heatmap
        # fig = px.imshow(
        #     representatives_correlation_matrix,
        #     labels=dict(color="correlation"),
        #     color_continuous_scale="Inferno_r",
        #     text_auto=True,
        #     aspect='auto',
        #     height=1000,
        # )
        # fig.update_xaxes(side='top')
        # fig.show()
        return representatives_correlation_matrix
        
    def _build_correlation_map(self):
        results = []
        corr_matrix = self._analyse_correlations()
        # iterate through each column in the matrix
        for repr_col in corr_matrix.columns:
            # get the correlation for this column, dropping the self-corr column
            representative_corr = corr_matrix[repr_col].drop(repr_col)
            # get the list of the top 4 positive correlations
            top_positives: pd.Series = representative_corr.nlargest(self.num_pos_neg_corr_coins)
            # get the list of the top 4 negative correlations
            top_negatives: pd.Series = representative_corr.nsmallest(self.num_pos_neg_corr_coins)
            # add to dict, a dict of the postive and negative corrs for a given column (coin)

            tmp_df = pd.DataFrame(np.append(top_positives.values, top_negatives.values),
                                  index=np.append(top_positives.index, top_negatives.index),
                                  columns=[repr_col])
            results.append(tmp_df.T)
        return results
   
selected_coins = ['BNB-USD', 'BTC-USD', 'WETH-USD']
foo = CryptoEvaluationPipeline(CSV, data_dir+"/top_30_cryptos_past_year_pca_cluster.csv", selected_coins)

for corr_extremes in foo.correlation_map:
    print()
    print(corr_extremes.to_markdown())
        

{0: 'BNB-USD', 1: 'BTC-USD', 2: 'WETH-USD'}
representative coins found in dataset:
 ['BNB-USD', 'BTC-USD', 'WBTC-USD', 'WETH-USD']

|         |   BTC-USD |   WBTC-USD |   DOGE-USD |   HBAR-USD |    PI-USD |   TON-USD |   USDT-USD |    USDC-USD |
|:--------|----------:|-----------:|-----------:|-----------:|----------:|----------:|-----------:|------------:|
| BNB-USD |   0.82948 |   0.829018 |   0.806936 |   0.786174 | -0.193981 | -0.159604 | -0.0985299 | -0.00642017 |

|         |   WBTC-USD |   LEO-USD |   DOGE-USD |   XLM-USD |   TON-USD |    PI-USD |   USDC-USD |   USDT-USD |
|:--------|-----------:|----------:|-----------:|----------:|----------:|----------:|-----------:|-----------:|
| BTC-USD |   0.999971 |  0.954715 |   0.931514 |  0.930809 | -0.297925 | -0.201206 | -0.0540657 |  0.0495566 |

|          |   BTC-USD |   LEO-USD |   DOGE-USD |   XLM-USD |   TON-USD |    PI-USD |   USDC-USD |   USDT-USD |
|:---------|----------:|----------:|-----------:|----------:|----------:|---

  self.clustered_data_file: pd.DataFrame = pd.read_csv(clustered_data_file, sep=',', encoding='utf8', index_col=0, parse_dates=True)
