In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial.distance import cityblock
from scipy.stats import entropy
from eodhd import APIClient
from datetime import datetime, timedelta

In [2]:
# Initialize API client with your API key
api = APIClient("67f380ad6e7193.54825103")

# Define the overall period (using UNIX timestamps)
# 2017-01-01 00:00:00 and 2023-12-31 00:00:00 in UNIX time
overall_start = 1483228800  # 2017-01-01
overall_end = 1704067200    # 2024-01-01 (end of 2023)

# Maximum period allowed by the API in seconds (120 days)
max_period = 120 * 24 * 3600  # 120 days in seconds

# Container to hold fetched data chunks
data_chunks = []

# Loop over the full period in 120-day chunks
current_start = overall_start
while current_start < overall_end:
    current_end = min(current_start + max_period, overall_end)
    print(f"Fetching data from {datetime.utcfromtimestamp(current_start)} to {datetime.utcfromtimestamp(current_end)}")
    
    try:
        # Fetch data for the current chunk
        chunk = api.get_intraday_historical_data(
            symbol="XAUUSD",
            interval="1m",
            from_unix_time=current_start,
            to_unix_time=current_end
        )
        # Check if the API returned an error or empty result
        if isinstance(chunk, dict) and 'errors' in chunk:
            print("Error fetching data for this period:", chunk['errors'])
        elif not chunk:
            print("No data returned for this period.")
        else:
            # Convert the chunk to a DataFrame if it contains the expected data
            df_chunk = pd.DataFrame(chunk)
            if 'date' in df_chunk.columns:
                data_chunks.append(df_chunk)
            else:
                print("Chunk missing 'date' column. Skipping this chunk.")
    except Exception as e:
        print(f"Exception occurred: {e}")
    
    # Move to the next period
    current_start = current_end

# Combine all valid chunks into one DataFrame
if data_chunks:
    df_minute = pd.concat(data_chunks)
    # Ensure proper datetime formatting and sorting if 'date' column exists
    df_minute['date'] = pd.to_datetime(df_minute['date'])
    df_minute.set_index('date', inplace=True)
    df_minute.sort_index(inplace=True)
else:
    raise ValueError("No valid data fetched. Check API limits or parameters.")

# Resample 1-minute data to 30-minute intervals using OHLC aggregation
ohlc_data = df_minute.resample('30T').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last'
}).dropna()

# Save to CSV
ohlc_data.to_csv('ohlc_data.csv', index=False)
ohlc_data

  print(f"Fetching data from {datetime.utcfromtimestamp(current_start)} to {datetime.utcfromtimestamp(current_end)}")


Fetching data from 2017-01-01 00:00:00 to 2017-05-01 00:00:00


Fetching data from 2017-05-01 00:00:00 to 2017-08-29 00:00:00


Fetching data from 2017-08-29 00:00:00 to 2017-12-27 00:00:00


Fetching data from 2017-12-27 00:00:00 to 2018-04-26 00:00:00


Fetching data from 2018-04-26 00:00:00 to 2018-08-24 00:00:00


Fetching data from 2018-08-24 00:00:00 to 2018-12-22 00:00:00


Fetching data from 2018-12-22 00:00:00 to 2019-04-21 00:00:00


Fetching data from 2019-04-21 00:00:00 to 2019-08-19 00:00:00


Fetching data from 2019-08-19 00:00:00 to 2019-12-17 00:00:00


Fetching data from 2019-12-17 00:00:00 to 2020-04-15 00:00:00


Fetching data from 2020-04-15 00:00:00 to 2020-08-13 00:00:00


Fetching data from 2020-08-13 00:00:00 to 2020-12-11 00:00:00


Fetching data from 2020-12-11 00:00:00 to 2021-04-10 00:00:00


Fetching data from 2021-04-10 00:00:00 to 2021-08-08 00:00:00


Fetching data from 2021-08-08 00:00:00 to 2021-12-06 00:00:00


Fetching data from 2021-12-06 00:00:00 to 2022-04-05 00:00:00


Fetching data from 2022-04-05 00:00:00 to 2022-08-03 00:00:00


Fetching data from 2022-08-03 00:00:00 to 2022-12-01 00:00:00


Fetching data from 2022-12-01 00:00:00 to 2023-03-31 00:00:00


Fetching data from 2023-03-31 00:00:00 to 2023-07-29 00:00:00


Fetching data from 2023-07-29 00:00:00 to 2023-11-26 00:00:00


Fetching data from 2023-11-26 00:00:00 to 2024-01-01 00:00:00


KeyError: 'date'

Check below

In [None]:
def extract_patterns(data, window=8, threshold=15):
    """
    Extract patterns from OHLC data.
    Each pattern corresponds to an 8-period (4-hour) window with features:
    H-L, C-O, H-O, O-L.
    """
    features = []
    labels = []
    
    for i in range(len(data) - window - 4):  # Ensure future price comparison is possible
        window_data = data.iloc[i:i + window]
        future_price = data['Close'].iloc[i + window + 4]
        
        # Features: H-L, C-O, H-O, O-L for each period in the window
        feature_vector = np.concatenate([
            (window_data['High'] - window_data['Low']).values,
            (window_data['Close'] - window_data['Open']).values,
            (window_data['High'] - window_data['Open']).values,
            (window_data['Open'] - window_data['Low']).values,
        ])
        
        # Label based on future price change (+/- threshold)
        price_change = future_price - window_data['Close'].iloc[-1]
        label = 1 if price_change > threshold else -1 if price_change < -threshold else 0
        
        if label != 0:  # Ignore neutral patterns
            features.append(feature_vector)
            labels.append(label)
    
    return np.array(features), np.array(labels)

features, labels = extract_patterns(ohlc_data)

In [None]:
def calculate_entropy_and_scores(features, labels, alpha=0.8):
    """
    Compute local entropy, information gain, and profitability scores.
    """
    global_entropy = entropy(np.bincount((labels > 0).astype(int)))  # Shannon entropy
    
    scores = []
    for i, feature in enumerate(features):
        neighbors = [cityblock(feature, f) for f in features]  # Manhattan distances
        k_neighbors_idx = np.argsort(neighbors)[:50]           # Top-50 nearest neighbors
        
        neighbor_labels = labels[k_neighbors_idx]
        p_buy = np.mean(neighbor_labels == labels[i])          # Purity measure
        
        local_entropy = -p_buy * np.log(p_buy) - (1-p_buy) * np.log(1-p_buy) if p_buy not in [0, 1] else 0
        
        pnl_score = np.mean(neighbor_labels == labels[i])      # Simplified profitability score
        
        info_gain = global_entropy - local_entropy             # Information gain
        combined_score = alpha * info_gain + (1-alpha) * pnl_score
        
        scores.append((combined_score, feature, labels[i]))
    
    return sorted(scores, key=lambda x: x[0], reverse=True)

scored_patterns = calculate_entropy_and_scores(features, labels)

In [None]:
def filter_patterns(scored_patterns, theta=5):
    """
    Filter Buy and Sell patterns using distance-based criteria.
    """
    buy_patterns = []
    sell_patterns = []
    
    for score, feature, label in scored_patterns:
        if label == 1:  # Buy pattern
            if all(cityblock(feature, b) >= theta for b in sell_patterns):
                buy_patterns.append(feature)
        elif label == -1:  # Sell pattern
            if all(cityblock(feature, s) >= theta for s in buy_patterns):
                sell_patterns.append(feature)
    
    return buy_patterns, sell_patterns

filtered_buy, filtered_sell = filter_patterns(scored_patterns)

In [None]:
def plot_distance_histograms(raw_buy, raw_sell, filtered_buy, filtered_sell):
    plt.figure(figsize=(12, 6))
    
    raw_distances = [cityblock(buy_pattern, sell_pattern) for buy_pattern in raw_buy for sell_pattern in raw_sell]
    plt.subplot(121)
    plt.hist(raw_distances, bins=50)
    plt.title("Raw Pattern Distances")
    
    filtered_distances = [cityblock(buy_pattern, sell_pattern) for buy_pattern in filtered_buy for sell_pattern in filtered_sell]
    plt.subplot(122)
    plt.hist(filtered_distances, bins=50)
    plt.title("Filtered Pattern Distances")
    
    plt.tight_layout()

plot_distance_histograms(features[labels==1], features[labels==-1], filtered_buy, filtered_sell)

In [None]:
def plot_volatility(data):
    data['Year'] = pd.to_datetime(data.index).year
    volatility_by_year = data.groupby('Year')['Open'].std()
    
    plt.boxplot(volatility_by_year.values)
    plt.title("Volatility Distribution by Year")

plot_volatility(ohlc_data)

In [None]:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

def plot_pca(features_raw, features_filtered):
    pca_raw = PCA(n_components=2).fit_transform(features_raw)
    pca_filtered = PCA(n_components=2).fit_transform(features_filtered)

# Add KMeans/GMM cluster visualizations here...