In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cached feature table from parquet and preview its first rows.
ENCODED_PATH = '../cache/encoded.parquet'
encoded = pd.read_parquet(ENCODED_PATH)
encoded.head()

In [None]:
# Display the dtype of every column in `encoded`.
encoded.dtypes

In [None]:
# Remove the 'Date' and 'Source' columns from the working frame.
# errors='ignore' makes this cell safe to re-run: `encoded` is reassigned
# here, so a second execution would otherwise raise KeyError because the
# columns have already been dropped.
encoded = encoded.drop(columns=['Date', 'Source'], errors='ignore')

In [None]:
# Names of every int64/float64 column in `encoded`.
# The pct_change_* target columns are not filtered out of this list: the
# correlation cells look them up by name in the correlation matrix, so they
# have to be present here.
numeric_dtypes = ['int64', 'float64']
numerical_columns = list(encoded.select_dtypes(include=numeric_dtypes).columns)


In [None]:
# Pearson correlation between every pair of numerical columns. For each
# pct_change_* target, take its column of the matrix, drop the target's own
# entry, and rank the remaining columns from highest to lowest correlation.
correlations = encoded[numerical_columns].corr(method='pearson')
correlations_30min, correlations_15min, correlations_24h = (
    correlations[target].drop(index=target).sort_values(ascending=False)
    for target in ('pct_change_30min', 'pct_change_15min', 'pct_change_24h')
)

# Print the three rankings (15 min, 30 min, 24 h) separated by a dashed rule.
separator = "--------------------------------"
print(correlations_15min, separator, correlations_30min, separator, correlations_24h, sep="\n")

In [None]:
# Draw the full Pearson correlation matrix as an annotated heatmap
# (two-decimal cell labels, diverging 'coolwarm' palette).
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Pearson correlation matrix')
plt.show()

In [None]:
# Spearman (rank) correlation between every pair of numerical columns. For
# each pct_change_* target, take its column of the matrix, drop the target's
# own entry, and rank the remaining columns from highest to lowest correlation.
# NOTE: correlations_15min / _30min / _24h are reassigned here, so after this
# cell runs they hold the Spearman rankings rather than the Pearson ones.
spearman_correlations = encoded[numerical_columns].corr(method='spearman')
correlations_30min, correlations_15min, correlations_24h = (
    spearman_correlations[target].drop(index=target).sort_values(ascending=False)
    for target in ('pct_change_30min', 'pct_change_15min', 'pct_change_24h')
)

# Print the three rankings (15 min, 30 min, 24 h) separated by a dashed rule.
separator = "--------------------------------"
print(correlations_15min, separator, correlations_30min, separator, correlations_24h, sep="\n")

In [None]:
# Draw the full Spearman correlation matrix as an annotated heatmap
# (two-decimal cell labels, diverging 'coolwarm' palette).
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(spearman_correlations, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Spearman correlation matrix')
plt.show()

In [None]:
# For every numerical column of `encoded`, draw one figure with three
# side-by-side scatter plots: the column on the x-axis against each
# pct_change_* target on the y-axis.
# numerical_columns is used unfiltered, so any target column it contains is
# also plotted against the targets (including itself).
target_columns = ['pct_change_15min', 'pct_change_30min', 'pct_change_24h']

for col in numerical_columns:
    fig, axes = plt.subplots(1, len(target_columns), figsize=(18, 5))

    for ax, target in zip(axes, target_columns):
        sns.scatterplot(ax=ax, x=col, y=target, data=encoded)
        ax.set_title(f'{col} vs {target}')

    fig.tight_layout()
    plt.show()
    # One figure is created per column; close it explicitly so figures do not
    # accumulate in memory on backends that keep them open after show().
    plt.close(fig)

In [None]:
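# NOTE(review): disabled single-target variant of the Spearman computation. It reads `merged`,
# which is not defined in the cells shown here — confirm before re-enabling.
# Compute correlations between numerical columns and the numerical target variable (spearman)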
# spearman_correlations = merged[numerical_columns + ['pct_change_30min']].corr(method='spearman')
# target_spearman_correlations = spearman_correlations['pct_change_30min'].drop('pct_change_30min').sort_values(ascending=False)
# print(target_spearman_correlations)

In [None]:
# Plot every numerical column of the merged dataframe as a scatter plot, and every categorical
# column as a box plot, against the target variable
# for col in numerical_columns:
#     sns.scatterplot(x=col, y='pct_change_30min', data=merged)
#     plt.title(f'{col} vs pct_change_30min')
#     plt.show()
# for col in categorical_columns:
#     sns.boxplot(x=col, y='pct_change_30min', data=merged)
#     plt.title(f'{col} vs pct_change_30min')
#     plt.show()