In [2]:
import pandas as pd
import numpy as np 
import multiprocessing as mp
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor
import os
import datetime
import time
import sys
from typing import List



读取数据（取最小行数）

In [None]:
datalist = []
min_rows = float('inf') 

for filename in os.listdir("/home/yichuan/ywc/meta-labeling/cryptocurrency"):
    if filename.endswith('.csv'):
        file_path = os.path.join("/home/yichuan/ywc/meta-labeling/cryptocurrency", filename)
        df = pd.read_csv(file_path)
        df = df.drop(columns=["Date"])
        datalist.append(df.values)
        min_rows = min(min_rows, df.shape[0])      

data_list = [data[:min_rows] for data in datalist]

print(min_rows,len(data_list))

1878 4


处理最后一列

In [4]:
def logVolume(X: np.ndarray):
    volume = X[:, -1]  
    volume_log = np.log(volume + 1)  

    mean_log = np.mean(volume_log)  
    std_log = np.std(volume_log)  

    volume_std = (volume_log - mean_log) / std_log  

    X[:, -1] = volume_std  

    return X


for data in data_list:
    data=logVolume(data)    

print(data_list[1])

[[ 0.21791101  0.221791    0.214866    0.21748801  0.21748801 -2.28482287]
 [ 0.218256    0.21906801  0.20525999  0.20648301  0.20648301 -2.33154956]
 [ 0.205948    0.21445601  0.205459    0.21043     0.21043    -2.37803102]
 ...
 [ 0.36812201  0.368543    0.35613599  0.35653099  0.35653099 -0.58243597]
 [ 0.35652399  0.357225    0.338911    0.34284601  0.34284601 -0.51650078]
 [ 0.34283099  0.34531301  0.33533299  0.34501699  0.34501699 -0.80060086]]


创建标签函数

In [5]:

days=10

ptsl = [0.05, 0.05]  
return_min = 0.005
def triple_barrier(close: np.ndarray, days=10, pts=[0.05, 0.05] ):
    bin = np.zeros(close.size, dtype=int)  
    for i in range(close.size):
        for d in range(days):
            index = min(i + d + 1, close.size - 1)  
            if close[index] >= close[i] * (1 + ptsl[0]):  
                bin[i] = 1
                break
            elif close[index] <= close[i] * (1 - ptsl[1]):  
                bin[i] = -1
                break
    
    return bin




In [6]:
binmat=np.full((min_rows, 4), 3)
for i in range(4):
    binmat[:,i]=triple_barrier(data_list[i][:,3])

label=np.mean(binmat, axis=1)
print(label.shape)

(1878,)


合并

In [7]:
# 确保所有二维数组具有相同的形状
shapes = {data.shape for data in data_list}
if len(shapes) > 1:
    raise ValueError("所有 CSV 文件中的数据形状必须一致")
    
dataset=np.array(data_list)
print(dataset.shape)
dataset = dataset.transpose(1, 0, 2).reshape(1878, -1)
print(dataset.shape)

(4, 1878, 6)
(1878, 24)


In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [9]:
targets=(label + 1)/2
targets=np.where(targets > 0.5, 1, 0)
X_train, X_test, y_train, y_test = train_test_split(dataset, targets, test_size=0.2, random_state=42)

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
def asset_exposure_constraint(weights, max_exposure=0.2):
    """Ensure no asset weight exceeds max_exposure."""
    return all(abs(w) <= max_exposure for w in weights)

def sector_diversification_constraint(weights, sector_mapping, min_sectors=3):
    """Ensure allocation spans at least min_sectors."""
    # unique_sectors = set(sector_mapping[np.nonzero(weights)])
    # return len(unique_sectors) >= min_sectors
    return True

def risk_tolerance_constraint(weights, cov_matrix, max_risk=0.05):
    """Ensure portfolio variance is within max_risk."""
    portfolio_variance = np.dot(weights.T, np.dot(cov_matrix, weights))
    return portfolio_variance <= max_risk

def apply_constraints(weights, sector_mapping, cov_matrix, exposure=0.2, sectors=3, risk=0.05):
    """Check all constraints."""
    return (
        asset_exposure_constraint(weights, exposure) and
        sector_diversification_constraint(weights, sector_mapping, sectors) and
        risk_tolerance_constraint(weights, cov_matrix, risk)
    )

In [12]:
# Primary model using XGBoost
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    objective='binary:logistic',
    eval_metric='logloss'
)
model.fit(X_train, y_train)

y_pred_primary = model.predict(X_test)
primary_signals = np.where(y_pred_primary > 0.5, 1, 0)


In [13]:
print(primary_signals)
print(len(primary_signals))
print(X_test.shape)

[1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 0 1 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0
 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 0
 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0
 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1 0 1
 0 1 1 0 1 0]
376
(376, 24)


In [14]:
# Create a sample portfolio based on signals
def construct_portfolio(signals, asset_returns, sector_mapping, cov_matrix):
    """Construct a portfolio based on signals and constraints."""
    weights = np.zeros(len(signals))
    for i, signal in enumerate(signals):
        if signal == 1:
            # Placeholder: Assign random initial weights to activated signals
            weights[i] = np.random.uniform(0.01, 0.1)

    # Normalize weights
    weights /= np.sum(weights)

    # Apply constraints
    if apply_constraints(weights, sector_mapping, cov_matrix):
        return weights
    else:
        print("Constraints not satisfied. Adjust weights.")
        return np.zeros_like(weights)

In [15]:
print(X_train.shape)
print(y_test.shape)

(1502, 24)
(376,)


In [16]:
# Assume asset returns, sector mapping, and covariance matrix as placeholders
sector_mapping = np.random.randint(1, 5, size=dataset.shape[1])
cov_matrix = np.cov(X_test.T, rowvar=False)
asset_returns = np.random.normal(0.01, 0.02, size=dataset.shape[1])

print(cov_matrix.shape)

(376, 376)


In [40]:
final_portfolio_weights = construct_portfolio(primary_signals, asset_returns, sector_mapping, cov_matrix)
# Output portfolio weights
print("Final Portfolio Weights:\n", final_portfolio_weights)
print(final_portfolio_weights.shape, np.sum(final_portfolio_weights))

Final Portfolio Weights:
 [0.00068031 0.00192611 0.00149309 0.00195334 0.00294488 0.00222184
 0.0040369  0.00110553 0.00083872 0.00230944 0.00187249 0.00362808
 0.00047518 0.00281967 0.00263548 0.00053906 0.00349345 0.00307582
 0.00236444 0.00276705 0.         0.00325941 0.00070166 0.00404094
 0.         0.00208616 0.00267303 0.00389284 0.00137226 0.00207303
 0.00408057 0.00163531 0.00281927 0.         0.00249416 0.00229038
 0.00123476 0.00395871 0.00195374 0.00149998 0.00327844 0.0033919
 0.00199469 0.         0.00123714 0.00202243 0.00199716 0.
 0.         0.00417236 0.00402034 0.00284357 0.00309126 0.00205606
 0.00206663 0.00263195 0.         0.00129788 0.00229598 0.
 0.         0.00149862 0.00080799 0.00079115 0.00248606 0.00241641
 0.00363469 0.00201482 0.00123717 0.00095109 0.00114721 0.00164904
 0.00073522 0.00160664 0.0016509  0.00239636 0.00291934 0.00046301
 0.00387485 0.00256337 0.00406958 0.00400874 0.00198528 0.00385523
 0.00173083 0.00050906 0.00200858 0.00112787 0.002236

In [20]:
# Step 3: Develop secondary model (meta-labeling) 二级模型，即meta-labeling
# Create meta-labels for secondary model training
meta_labels = primary_signals == y_test

X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(
    X_test, meta_labels, test_size=0.5, random_state=42
)
X_test_meta = pd.DataFrame(X_test_meta)


# Train and test secondary model (meta-labeling) 训练模型并生成二级信号
logistic = LogisticRegression()
logistic.fit(X_train_meta, y_train_meta)
y_pred_secondary = logistic.predict(X_test_meta)

# Step 4: Filtering, denoising, and refining signals 进行信号过滤，生成最终信号
refined_signals = primary_signals[X_test_meta.index]
refined_signals = refined_signals[y_pred_secondary == 1]



In [21]:
#print(y_pred_secondary)
print(y_pred_secondary.shape)
#print(primary_signals[X_test_meta.index])
#print("Refined_signals:\n", refined_signals)
print("Length of Refined_signals:\n", len(refined_signals))

(188,)
Length of Refined_signals:
 176
