In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the ML libraries used
# below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
!pip install peft
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
from datetime import datetime
import os
import sys
import peft

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the first 15% of the supplemental training CSV.
temporary_dataset = load_dataset('csv', data_files='/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv', split='train[:15%]')

# Hold out 10% of that slice for evaluation: roughly 13.5% of all rows end up in
# the training set and roughly 1.5% in the evaluation set.
# A fixed seed keeps the split identical across kernel restarts.
# NOTE(review): train_test_split shuffles by default, so evaluation rows are
# interleaved in time with training rows; consider shuffle=False for a
# chronological hold-out.
split_dataset = temporary_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset['train'].to_pandas()
eval_dataset = split_dataset['test'].to_pandas()

# The history-window builder walks backwards by position, so both frames must be
# ordered by timestamp.
train_dataset.sort_values(by='timestamp', inplace=True)
eval_dataset.sort_values(by='timestamp', inplace=True)

# Build the prompt history: every row from the 15 minutes up to (and including)
# the row at `index`, rendered as one string.
def get_previous_15_minutes_data_combined(df, index):
    """Render the rows within 900 seconds before row `index` as a single text block.

    Starting at position `index`, rows are visited backwards (by position) for as
    long as their 'timestamp' is >= the timestamp of row `index` minus 900.
    The walk stops at the first row that fails that test, so `df` is expected to
    be sorted by 'timestamp'.

    Args:
        df: DataFrame with columns timestamp, Asset_ID, Count, Open, High, Low,
            Close, Volume and VWAP.
        index: Positional index of the row whose history is wanted.

    Returns:
        The formatted rows joined by newlines, oldest first, newest last.
    """
    window_start = df.iloc[index]['timestamp'] - 900  # 900 seconds = 15 minutes * 60
    blocks = []
    for pos in range(index, -1, -1):
        record = df.iloc[pos]
        # Stop at the first row that is not inside the window.
        if not (record['timestamp'] >= window_start):
            break
        blocks.append(f"""
        timestamp: {record['timestamp']}
        Asset_ID: {record['Asset_ID']}
        Count: {record['Count']}
        Open: {record['Open']}
        High: {record['High']}
        Low: {record['Low']}
        Close: {record['Close']}
        Volume: {record['Volume']}
        VWAP: {record['VWAP']}""")
    # Rows were collected newest-first; flip so the most recent row comes last.
    blocks.reverse()
    return '\n'.join(blocks)

# Attach the rendered 15-minute history to every row.
train_dataset['past_data'] = [get_previous_15_minutes_data_combined(train_dataset, i) for i in range(len(train_dataset))]
eval_dataset['past_data'] = [get_previous_15_minutes_data_combined(eval_dataset, i) for i in range(len(eval_dataset))]

# Persist the processed frames in the working directory (on Kaggle the notebook's
# current directory is /kaggle/working, which is where the next cell reads them from).
# The former extra copies under /kaggle/input were removed: that tree is mounted
# read-only, so writing there raises OSError after the real outputs are already saved.
train_dataset.to_csv('train_dataset_combined.csv', index=False)
eval_dataset.to_csv('eval_dataset_combined.csv', index=False)

In [None]:
from datasets import load_dataset

# Reload the processed CSVs written by the previous cell.
train_dataset_combined = load_dataset('csv', data_files='/kaggle/working/train_dataset_combined.csv', split='train')
eval_dataset_combined = load_dataset('csv', data_files='/kaggle/working/eval_dataset_combined.csv', split='train')


base_model = "Qwen/Qwen1.5-0.5B-Chat"
# Other checkpoints that were considered:
# "Qwen/Qwen1.5-7B-Chat"
# "codellama/CodeLlama-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(base_model) #use_fast=False

# NOTE(review): `add_eos_token` is a Llama-tokenizer switch; Qwen's tokenizer
# presumably ignores it, so EOS has to be appended explicitly when tokenizing — confirm.
tokenizer.add_eos_token = True
# Keep the pad token that ships with the checkpoint. Forcing pad_token_id = 0
# (a Llama convention) would pad with an ordinary vocabulary token here.
# Only fall back to EOS when the tokenizer defines no pad token at all.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

def tokenize(prompt):
    """Tokenize one full training prompt for causal-LM fine-tuning.

    The prompt is encoded with the module-level `tokenizer`, truncated to 2048
    tokens and left unpadded (the data collator pads per batch). An EOS token is
    appended explicitly when there is room for it, because setting
    `tokenizer.add_eos_token` is not honoured by every tokenizer class; without a
    trailing EOS the model never learns where an answer ends.

    Args:
        prompt: The complete prompt text, including the expected answer.

    Returns:
        The tokenizer output (lists, not tensors) with an added "labels" entry
        that is a copy of "input_ids".
    """
    max_length = 2048
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    input_ids = result["input_ids"]
    eos_id = tokenizer.eos_token_id
    # Append EOS only if it is missing and the sequence is not already full.
    if (
        eos_id is not None
        and len(input_ids) < max_length
        and (not input_ids or input_ids[-1] != eos_id)
    ):
        input_ids.append(eos_id)
        if "attention_mask" in result:
            result["attention_mask"].append(1)

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point, max_length=2048, reserve_tokens=16):
    """Build the instruction prompt for one row and tokenize it.

    The prompt is: fixed system header + the row's 'past_data' history + the
    question for the row's 'Asset_ID' + the row's 'Target' as the answer.

    `tokenize` truncates on the right, and the answer sits at the very end of the
    prompt, so an over-long history would cut off the instruction and the Target.
    To prevent that, the history is shortened *from its oldest end* until header,
    history and tail fit into `max_length` tokens. Prompts that already fit are
    produced exactly as before.

    Args:
        data_point: Mapping with 'past_data', 'Asset_ID' and 'Target'.
        max_length: Token budget for the whole prompt; should match the limit
            used inside `tokenize`.
        reserve_tokens: Safety margin for the EOS token and for token-boundary
            changes when the pieces are re-encoded as one string.

    Returns:
        Whatever `tokenize` returns for the assembled prompt.
    """
    # NOTE(review): rows whose Target is missing would be rendered as the literal
    # text of the missing value; consider filtering them out before mapping.
    header = (
        "[INST]\n"
        "<<SYS>>You are a cryptocurrency market prediction model. "
        "Here is the data for all crypto assets over the past 15 minutes. \n"
        "You need to provide your prediction for the Target: "
        "the residual log-returns for the asset over a 15-minute horizon. <</SYS>>\n"
    )
    tail = (
        " \n"
        f"Please directly provide the Target value for Asset ID {data_point['Asset_ID']} "
        "as a 16-digit decimal number here:\n"
        f"[/INST] {data_point['Target']}"
    )
    history = f"{data_point['past_data']}"

    def _token_ids(text):
        # Raw ids without any automatically added special tokens.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    budget = max_length - len(_token_ids(header)) - len(_token_ids(tail)) - reserve_tokens
    history_ids = _token_ids(history)
    if len(history_ids) > budget:
        # Keep the most recent part of the history (it is ordered oldest -> newest).
        if budget > 0:
            history = tokenizer.decode(history_ids[-budget:], clean_up_tokenization_spaces=False)
        else:
            history = ""

    prompt = header + history + tail

    return tokenize(prompt)

# Build and tokenize the prompt for every row.
tokenized_train_dataset = train_dataset_combined.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset_combined.map(generate_and_tokenize_prompt)

# Save the tokenized datasets under /kaggle/working.
# /kaggle/input is mounted read-only, so save_to_disk cannot write there.
tokenized_train_dataset.save_to_disk('/kaggle/working/tokenized_train_dataset')
tokenized_val_dataset.save_to_disk('/kaggle/working/tokenized_eval_dataset')

In [None]:
from datasets import load_dataset

base_model = "Qwen/Qwen1.5-0.5B-Chat"

# Load the base model with 8-bit weights; device_map="auto" lets the loader
# decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(base_model)

# NOTE(review): `add_eos_token` is a Llama-tokenizer switch; Qwen's tokenizer
# presumably ignores it — confirm.
tokenizer.add_eos_token = True
# Keep the checkpoint's own pad token instead of forcing id 0 (a Llama
# convention that maps to an ordinary vocabulary token here).
# Fall back to EOS only when no pad token is defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_from_disk

# Load the pre-tokenized training data from an attached Kaggle input dataset.
# NOTE(review): only the training data is loaded here; no validation set is read.
tokenized_train_dataset = load_from_disk('/kaggle/input/tokenized-data/tokenized_train_dataset')

In [None]:
# NOTE(review): this cell wraps `model` with PEFT in place, so running it twice
# without reloading the base model stacks a second adapter.
model.train()  # put model back into training mode
model = prepare_model_for_kbit_training(model)
# NOTE(review): the CUDA allocator presumably reads this variable when it is first
# used; the model is already loaded at this point, so the setting may have no
# effect unless it is moved before the first CUDA allocation — confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:64'

# LoRA adapters on the four attention projections.
config = LoraConfig(
    r=2,
    lora_alpha=1,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# Effective batch of 4 built from micro-batches of 2.
batch_size = 4
per_device_train_batch_size = 2
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "Qwen1.5_0.5B"

# Evaluation is disabled by simply not configuring it ("no" is the default).
# The explicit `evaluation_strategy="no"` argument was dropped because that
# keyword was renamed to `eval_strategy` in newer transformers releases.
training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=1000,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=50,
        optim="adamw_torch",
        save_strategy="steps",
        save_steps=200,
        output_dir=output_dir,
        load_best_model_at_end=False,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        report_to="none", # if use_wandb else "none",
        run_name="none", # if use_wandb else None,
    )

# load_from_disk may return a DatasetDict (a dict of splits) or a single Dataset,
# depending on what was saved; pick the training split in both cases.
if isinstance(tokenized_train_dataset, dict):
    train_split = tokenized_train_dataset['train']
else:
    train_split = tokenized_train_dataset

trainer = Trainer(
    model=model,
    train_dataset=train_split,
    # eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

# The generation KV cache is switched off for training.
model.config.use_cache = False

In [None]:
# Shell escape: print the `nvidia-smi` report for the attached GPU(s).
!nvidia-smi

In [None]:
import torch
# Prints True when PyTorch can see a CUDA device.
print(torch.cuda.is_available())

In [None]:
# Run the fine-tuning loop inside a CUDA autocast region.
# NOTE(review): TrainingArguments already sets fp16=True, so this wrapper is
# presumably a workaround for dtype mismatches with the 8-bit base model — confirm
# before removing it.
with torch.autocast("cuda"): 
    trainer.train() # resume_from_checkpoint=True

In [None]:
# Save the PEFT-wrapped model into `output_dir`.
# NOTE(review): for a PEFT model this presumably writes only the adapter weights
# and config, not the base model; the tokenizer is not saved here — confirm.
model.save_pretrained(output_dir)

In [None]:
eval_prompt = """<<SYS>>You are a cryptocurrency market prediction model. 
Here is the data for all crypto assets over the past 15 minutes. 
You need to provide your prediction for the Target: the residual log-returns for the asset over a 15-minute horizon. <</SYS>>
        timestamp: 1632250500.0
        Asset_ID: 1.0
        Count: 1570.0
        Open: 42171.53870984572
        High: 42260.91
        Low: 42118.12
        Close: 42225.05142857142
        Volume: 52.19336229000001
        VWAP: 42188.517797656845

        timestamp: 1632250500.0
        Asset_ID: 6.0
        Count: 1902.0
        Open: 2893.2914285714287
        High: 2902.25
        Low: 2892.01
        Close: 2899.514444555715
        Volume: 850.1264780349434
        VWAP: 2896.0527779071203

        timestamp: 1632250620.0
        Asset_ID: 7.0
        Count: 186.0
        Open: 47.747475
        High: 47.807
        Low: 47.67
        Close: 47.77135
        Volume: 2879.0300397144624
        VWAP: 47.73745138029064

        timestamp: 1632250740.0
        Asset_ID: 5.0
        Count: 124.0
        Open: 4.065250656
        High: 4.0673
        Low: 4.06
        Close: 4.064010655999999
        Volume: 17347.45647162
        VWAP: 4.064224495708667

        timestamp: 1632250740.0
        Asset_ID: 11.0
        Count: 21.0
        Open: 224.07666666666668
        High: 224.2
        Low: 223.89
        Close: 224.08666666666667
        Volume: 46.2578
        VWAP: 224.084255247494

        timestamp: 1632250800.0
        Asset_ID: 11.0
        Count: 73.0
        Open: 224.27
        High: 225.13
        Low: 223.8
        Close: 224.662
        Volume: 228.33446411000003
        VWAP: 224.5039483188813

        timestamp: 1632250800.0
        Asset_ID: 1.0
        Count: 2380.0
        Open: 42199.38166666667
        High: 42335.3
        Low: 42138.78
        Close: 42281.62602137998
        Volume: 101.17363563
        VWAP: 42252.82199303657

        timestamp: 1632250860.0
        Asset_ID: 8.0
        Count: 24.0
        Open: 1.24825
        High: 1.2673
        Low: 1.2296
        Close: 1.2476
        Volume: 6547.0
        VWAP: 1.247754112178982

        timestamp: 1632250920.0
        Asset_ID: 2.0
        Count: 94.0
        Open: 527.5366666666667
        High: 528.59
        Low: 527.1
        Close: 527.9033333333333
        Volume: 58.44496282
        VWAP: 527.7006529720935

        timestamp: 1632250980.0
        Asset_ID: 9.0
        Count: 212.0
        Open: 155.90942857142858
        High: 156.0
        Low: 155.7
        Close: 155.87214285714285
        Volume: 704.5380151100003
        VWAP: 155.87034150527393

        timestamp: 1632250980.0
        Asset_ID: 12.0
        Count: 107.0
        Open: 0.2757244
        High: 0.2759
        Low: 0.2754
        Close: 0.2758082
        Volume: 100891.74719796
        VWAP: 0.2757073237070868

        timestamp: 1632251040.0
        Asset_ID: 1.0
        Count: 2370.0
        Open: 42294.46833333333
        High: 42409.0
        Low: 42267.65
        Close: 42326.49333333334
        Volume: 89.68992306000001
        VWAP: 42341.984300104246

        timestamp: 1632251220.0
        Asset_ID: 4.0
        Count: 119.0
        Open: 0.2077798
        High: 0.2079
        Low: 0.2076800000000001
        Close: 0.2077793999999999
        Volume: 152330.73169994
        VWAP: 0.2077837517561581

        timestamp: 1632251280.0
        Asset_ID: 2.0
        Count: 120.0
        Open: 527.4759999999999
        High: 528.03
        Low: 527.1
        Close: 527.298
        Volume: 97.53898623000002
        VWAP: 527.5612333435394

        timestamp: 1632251280.0
        Asset_ID: 1.0
        Count: 1688.0
        Open: 42235.24
        High: 42314.96474771
        Low: 42215.64
        Close: 42234.76
        Volume: 90.97526268
        VWAP: 42249.95927619723

        timestamp: 1632251340.0
        Asset_ID: 5.0
        Count: 88.0
        Open: 4.0703000000000005
        High: 4.0710000000000015
        Low: 4.0641
        Close: 4.066224999999998
        Volume: 17253.326
        VWAP: 4.067114441887444

        timestamp: 1632251340.0
        Asset_ID: 10.0
        Count: 16.0
        Open: 2430.5780666666665
        High: 2432.0
        Low: 2428.2607
        Close: 2430.3582666666666
        Volume: 2.18906
        VWAP: 2430.315220735361

        timestamp: 1632251340.0
        Asset_ID: 12.0
        Count: 135.0
        Open: 0.2750182857142857
        High: 0.275128
        Low: 0.274618
        Close: 0.2748572857142857
        Volume: 86517.7602185
        VWAP: 0.2749318823852612

        timestamp: 1632251340.0
        Asset_ID: 1.0
        Count: 1029.0
        Open: 42234.53333333333
        High: 42263.0
        Low: 42177.83
        Close: 42213.85
        Volume: 19.93701845
        VWAP: 42221.74071271973

        timestamp: 1632251400.0
        Asset_ID: 5.0
        Count: 114.0
        Open: 4.06646
        High: 4.0674
        Low: 4.059
        Close: 4.061259999999999
        Volume: 12601.34122551
        VWAP: 4.064781885240469
Please directly provide the Target value for Asset ID 4 as a 16-digit decimal number here:
[/INST] """
# The fine-tuning template opens with "[INST]\n", but the prompt literal above
# starts directly at "<<SYS>>". Add the missing opener so the model sees the same
# format it was trained on (no-op if the prompt already has it).
if not eval_prompt.startswith("[INST]"):
    eval_prompt = "[INST]\n" + eval_prompt

tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side='left')
# Put the inputs on whatever device the model was placed on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

# Run inference
model.eval()
with torch.no_grad():
    output = model.generate(**model_input, max_new_tokens=200)[0]
    # `output` holds prompt + continuation; print only the newly generated part
    # so the predicted value is not buried under the echoed prompt.
    prompt_length = model_input["input_ids"].shape[1]
    print(tokenizer.decode(output[prompt_length:], skip_special_tokens=True))

In [None]:
eval_prompt = """<<SYS>>You are a cryptocurrency market prediction model. Here is the data for all crypto assets over the past 15 minutes. 
You need to provide your prediction for the Target: the residual log-returns for the asset over a 15-minute horizon. <</SYS>>
        timestamp: 1632250500.0
        Asset_ID: 1.0
        Count: 1570.0
        Open: 42171.53870984572
        High: 42260.91
        Low: 42118.12
        Close: 42225.05142857142
        Volume: 52.19336229000001
        VWAP: 42188.517797656845

        timestamp: 1632250500.0
        Asset_ID: 6.0
        Count: 1902.0
        Open: 2893.2914285714287
        High: 2902.25
        Low: 2892.01
        Close: 2899.514444555715
        Volume: 850.1264780349434
        VWAP: 2896.0527779071203

        timestamp: 1632250620.0
        Asset_ID: 7.0
        Count: 186.0
        Open: 47.747475
        High: 47.807
        Low: 47.67
        Close: 47.77135
        Volume: 2879.0300397144624
        VWAP: 47.73745138029064

        timestamp: 1632250740.0
        Asset_ID: 5.0
        Count: 124.0
        Open: 4.065250656
        High: 4.0673
        Low: 4.06
        Close: 4.064010655999999
        Volume: 17347.45647162
        VWAP: 4.064224495708667

        timestamp: 1632250740.0
        Asset_ID: 11.0
        Count: 21.0
        Open: 224.07666666666668
        High: 224.2
        Low: 223.89
        Close: 224.08666666666667
        Volume: 46.2578
        VWAP: 224.084255247494

        timestamp: 1632250800.0
        Asset_ID: 11.0
        Count: 73.0
        Open: 224.27
        High: 225.13
        Low: 223.8
        Close: 224.662
        Volume: 228.33446411000003
        VWAP: 224.5039483188813

        timestamp: 1632250800.0
        Asset_ID: 1.0
        Count: 2380.0
        Open: 42199.38166666667
        High: 42335.3
        Low: 42138.78
        Close: 42281.62602137998
        Volume: 101.17363563
        VWAP: 42252.82199303657

        timestamp: 1632250860.0
        Asset_ID: 8.0
        Count: 24.0
        Open: 1.24825
        High: 1.2673
        Low: 1.2296
        Close: 1.2476
        Volume: 6547.0
        VWAP: 1.247754112178982

        timestamp: 1632250920.0
        Asset_ID: 2.0
        Count: 94.0
        Open: 527.5366666666667
        High: 528.59
        Low: 527.1
        Close: 527.9033333333333
        Volume: 58.44496282
        VWAP: 527.7006529720935

        timestamp: 1632250980.0
        Asset_ID: 9.0
        Count: 212.0
        Open: 155.90942857142858
        High: 156.0
        Low: 155.7
        Close: 155.87214285714285
        Volume: 704.5380151100003
        VWAP: 155.87034150527393

        timestamp: 1632250980.0
        Asset_ID: 12.0
        Count: 107.0
        Open: 0.2757244
        High: 0.2759
        Low: 0.2754
        Close: 0.2758082
        Volume: 100891.74719796
        VWAP: 0.2757073237070868

        timestamp: 1632251040.0
        Asset_ID: 1.0
        Count: 2370.0
        Open: 42294.46833333333
        High: 42409.0
        Low: 42267.65
        Close: 42326.49333333334
        Volume: 89.68992306000001
        VWAP: 42341.984300104246

        timestamp: 1632251220.0
        Asset_ID: 4.0
        Count: 119.0
        Open: 0.2077798
        High: 0.2079
        Low: 0.2076800000000001
        Close: 0.2077793999999999
        Volume: 152330.73169994
        VWAP: 0.2077837517561581

        timestamp: 1632251280.0
        Asset_ID: 2.0
        Count: 120.0
        Open: 527.4759999999999
        High: 528.03
        Low: 527.1
        Close: 527.298
        Volume: 97.53898623000002
        VWAP: 527.5612333435394

        timestamp: 1632251280.0
        Asset_ID: 1.0
        Count: 1688.0
        Open: 42235.24
        High: 42314.96474771
        Low: 42215.64
        Close: 42234.76
        Volume: 90.97526268
        VWAP: 42249.95927619723

        timestamp: 1632251340.0
        Asset_ID: 5.0
        Count: 88.0
        Open: 4.0703000000000005
        High: 4.0710000000000015
        Low: 4.0641
        Close: 4.066224999999998
        Volume: 17253.326
        VWAP: 4.067114441887444

        timestamp: 1632251340.0
        Asset_ID: 10.0
        Count: 16.0
        Open: 2430.5780666666665
        High: 2432.0
        Low: 2428.2607
        Close: 2430.3582666666666
        Volume: 2.18906
        VWAP: 2430.315220735361

        timestamp: 1632251340.0
        Asset_ID: 12.0
        Count: 135.0
        Open: 0.2750182857142857
        High: 0.275128
        Low: 0.274618
        Close: 0.2748572857142857
        Volume: 86517.7602185
        VWAP: 0.2749318823852612

        timestamp: 1632251340.0
        Asset_ID: 1.0
        Count: 1029.0
        Open: 42234.53333333333
        High: 42263.0
        Low: 42177.83
        Close: 42213.85
        Volume: 19.93701845
        VWAP: 42221.74071271973

        timestamp: 1632251400.0
        Asset_ID: 5.0
        Count: 114.0
        Open: 4.06646
        High: 4.0674
        Low: 4.059
        Close: 4.061259999999999
        Volume: 12601.34122551
        VWAP: 4.064781885240469
Please directly provide the Target value for Asset ID 4 as a 16-digit decimal number here:
[/INST] """
# The fine-tuning template opens with "[INST]\n", but the prompt literal above
# starts directly at "<<SYS>>". Add the missing opener so the model sees the same
# format it was trained on (no-op if the prompt already has it).
if not eval_prompt.startswith("[INST]"):
    eval_prompt = "[INST]\n" + eval_prompt

tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side='left')
# Put the inputs on whatever device the model was placed on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

# Run inference
model.eval()
with torch.no_grad():
    output = model.generate(**model_input, max_new_tokens=200)[0]
    # `output` holds prompt + continuation; print only the newly generated part
    # so the predicted value is not buried under the echoed prompt.
    prompt_length = model_input["input_ids"].shape[1]
    print(tokenizer.decode(output[prompt_length:], skip_special_tokens=True))

In [None]:
eval_prompt = """<<SYS>>You are a cryptocurrency market prediction model. Here is the data for all crypto assets over the past 15 minutes. 
You need to provide your prediction for the Target: the residual log-returns for the asset over a 15-minute horizon. <</SYS>>
        timestamp: 1632250500.0
        Asset_ID: 1.0
        Count: 1570.0
        Open: 42171.53870984572
        High: 42260.91
        Low: 42118.12
        Close: 42225.05142857142
        Volume: 52.19336229000001
        VWAP: 42188.517797656845

        timestamp: 1632250500.0
        Asset_ID: 6.0
        Count: 1902.0
        Open: 2893.2914285714287
        High: 2902.25
        Low: 2892.01
        Close: 2899.514444555715
        Volume: 850.1264780349434
        VWAP: 2896.0527779071203

        timestamp: 1632250620.0
        Asset_ID: 7.0
        Count: 186.0
        Open: 47.747475
        High: 47.807
        Low: 47.67
        Close: 47.77135
        Volume: 2879.0300397144624
        VWAP: 47.73745138029064

        timestamp: 1632250740.0
        Asset_ID: 5.0
        Count: 124.0
        Open: 4.065250656
        High: 4.0673
        Low: 4.06
        Close: 4.064010655999999
        Volume: 17347.45647162
        VWAP: 4.064224495708667

        timestamp: 1632250740.0
        Asset_ID: 11.0
        Count: 21.0
        Open: 224.07666666666668
        High: 224.2
        Low: 223.89
        Close: 224.08666666666667
        Volume: 46.2578
        VWAP: 224.084255247494

        timestamp: 1632250800.0
        Asset_ID: 11.0
        Count: 73.0
        Open: 224.27
        High: 225.13
        Low: 223.8
        Close: 224.662
        Volume: 228.33446411000003
        VWAP: 224.5039483188813

        timestamp: 1632250800.0
        Asset_ID: 1.0
        Count: 2380.0
        Open: 42199.38166666667
        High: 42335.3
        Low: 42138.78
        Close: 42281.62602137998
        Volume: 101.17363563
        VWAP: 42252.82199303657

        timestamp: 1632250860.0
        Asset_ID: 8.0
        Count: 24.0
        Open: 1.24825
        High: 1.2673
        Low: 1.2296
        Close: 1.2476
        Volume: 6547.0
        VWAP: 1.247754112178982

        timestamp: 1632250920.0
        Asset_ID: 2.0
        Count: 94.0
        Open: 527.5366666666667
        High: 528.59
        Low: 527.1
        Close: 527.9033333333333
        Volume: 58.44496282
        VWAP: 527.7006529720935

        timestamp: 1632250980.0
        Asset_ID: 9.0
        Count: 212.0
        Open: 155.90942857142858
        High: 156.0
        Low: 155.7
        Close: 155.87214285714285
        Volume: 704.5380151100003
        VWAP: 155.87034150527393

        timestamp: 1632250980.0
        Asset_ID: 12.0
        Count: 107.0
        Open: 0.2757244
        High: 0.2759
        Low: 0.2754
        Close: 0.2758082
        Volume: 100891.74719796
        VWAP: 0.2757073237070868

        timestamp: 1632251040.0
        Asset_ID: 1.0
        Count: 2370.0
        Open: 42294.46833333333
        High: 42409.0
        Low: 42267.65
        Close: 42326.49333333334
        Volume: 89.68992306000001
        VWAP: 42341.984300104246

        timestamp: 1632251220.0
        Asset_ID: 4.0
        Count: 119.0
        Open: 0.2077798
        High: 0.2079
        Low: 0.2076800000000001
        Close: 0.2077793999999999
        Volume: 152330.73169994
        VWAP: 0.2077837517561581

        timestamp: 1632251280.0
        Asset_ID: 2.0
        Count: 120.0
        Open: 527.4759999999999
        High: 528.03
        Low: 527.1
        Close: 527.298
        Volume: 97.53898623000002
        VWAP: 527.5612333435394

        timestamp: 1632251280.0
        Asset_ID: 1.0
        Count: 1688.0
        Open: 42235.24
        High: 42314.96474771
        Low: 42215.64
        Close: 42234.76
        Volume: 90.97526268
        VWAP: 42249.95927619723

        timestamp: 1632251340.0
        Asset_ID: 5.0
        Count: 88.0
        Open: 4.0703000000000005
        High: 4.0710000000000015
        Low: 4.0641
        Close: 4.066224999999998
        Volume: 17253.326
        VWAP: 4.067114441887444

        timestamp: 1632251340.0
        Asset_ID: 10.0
        Count: 16.0
        Open: 2430.5780666666665
        High: 2432.0
        Low: 2428.2607
        Close: 2430.3582666666666
        Volume: 2.18906
        VWAP: 2430.315220735361

        timestamp: 1632251340.0
        Asset_ID: 12.0
        Count: 135.0
        Open: 0.2750182857142857
        High: 0.275128
        Low: 0.274618
        Close: 0.2748572857142857
        Volume: 86517.7602185
        VWAP: 0.2749318823852612

        timestamp: 1632251340.0
        Asset_ID: 1.0
        Count: 1029.0
        Open: 42234.53333333333
        High: 42263.0
        Low: 42177.83
        Close: 42213.85
        Volume: 19.93701845
        VWAP: 42221.74071271973

        timestamp: 1632251400.0
        Asset_ID: 5.0
        Count: 114.0
        Open: 4.06646
        High: 4.0674
        Low: 4.059
        Close: 4.061259999999999
        Volume: 12601.34122551
        VWAP: 4.064781885240469
Please directly provide the Target value for Asset ID 4 as a 16-digit decimal number here:
[/INST] """
# The fine-tuning template opens with "[INST]\n", but the prompt literal above
# starts directly at "<<SYS>>". Add the missing opener so the model sees the same
# format it was trained on (no-op if the prompt already has it).
if not eval_prompt.startswith("[INST]"):
    eval_prompt = "[INST]\n" + eval_prompt

tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side='left')
# Put the inputs on whatever device the model was placed on instead of assuming "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

# Run inference
model.eval()
with torch.no_grad():
    output = model.generate(**model_input, max_new_tokens=200)[0]
    # `output` holds prompt + continuation; print only the newly generated part
    # so the predicted value is not buried under the echoed prompt.
    prompt_length = model_input["input_ids"].shape[1]
    print(tokenizer.decode(output[prompt_length:], skip_special_tokens=True))

In [None]:
import pandas as pd

# Load the CSV file from the attached input dataset.
df = pd.read_csv('/kaggle/input/test-data/test_dataset_combined.csv')

# Eyeball the first rows to confirm the 'Asset_ID' and 'Target' columns are present.
print(df.head(10))

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

# Load a fresh slice of the CSV: the rows between 15% and 16% of the file.
dataset_slice = load_dataset('csv', data_files='/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv', split='train[15%:16%]')

# Split that slice 90/10; the fixed seed makes the split reproducible.
split_dataset = dataset_slice.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset['train'].to_pandas()
eval_dataset = split_dataset['test'].to_pandas()

# Order both frames by timestamp, then renumber the index so that row labels and
# row positions agree (code below mixes label-based and position-based access).
train_dataset.sort_values(by='timestamp', inplace=True)
eval_dataset.sort_values(by='timestamp', inplace=True)
train_dataset.reset_index(drop=True, inplace=True)
eval_dataset.reset_index(drop=True, inplace=True)


def get_previous_15_minutes_data_combined(df, index):
    """Render the rows within 900 seconds before row `index` as a single text block.

    Starting at position `index`, rows are visited backwards (by position) for as
    long as their 'timestamp' is >= the timestamp of row `index` minus 900.
    The walk stops at the first row that fails that test, so `df` is expected to
    be sorted by 'timestamp'.

    Args:
        df: DataFrame with columns timestamp, Asset_ID, Count, Open, High, Low,
            Close, Volume and VWAP.
        index: Positional index of the row whose history is wanted.

    Returns:
        The formatted rows joined by newlines, oldest first, newest last.
    """
    window_start = df.iloc[index]['timestamp'] - 900  # 900 seconds = 15 minutes * 60
    blocks = []
    for pos in range(index, -1, -1):
        record = df.iloc[pos]
        # Stop at the first row that is not inside the window.
        if not (record['timestamp'] >= window_start):
            break
        blocks.append(f"""
        timestamp: {record['timestamp']}
        Asset_ID: {record['Asset_ID']}
        Count: {record['Count']}
        Open: {record['Open']}
        High: {record['High']}
        Low: {record['Low']}
        Close: {record['Close']}
        Volume: {record['Volume']}
        VWAP: {record['VWAP']}""")
    # Rows were collected newest-first; flip so the most recent row comes last.
    blocks.reverse()
    return '\n'.join(blocks)

def process_and_save_dataset(dataset, batch_size, file_name):
    """Add a 'past_data' column batch by batch and stream the result to a CSV.

    For every row (by position) the 15-minute history string is built with
    `get_previous_15_minutes_data_combined` and stored in `dataset['past_data']`
    (the frame is modified in place). After each batch the finished rows are
    written to `file_name + '.csv'`, so partial progress survives an interruption.

    Changes from the earlier version:
      * values are assigned by position (`iloc`), matching the positional index
        used to compute them; label-based `.loc` put them on the wrong rows
        whenever the index was not 0..n-1 in order (e.g. after `sort_values`);
      * histories are rendered from a snapshot without the 'past_data' column, so
        the text format of a row does not change once that column exists;
      * the first batch overwrites the output file instead of appending to a
        file left over from a previous run;
      * the progress bar advances by the real batch length;
      * errors are reported and then re-raised instead of being swallowed.

    Args:
        dataset: DataFrame sorted by 'timestamp'.
        batch_size: Number of rows processed and flushed per step.
        file_name: Output path without the '.csv' suffix.

    Raises:
        Exception: whatever the history builder or the CSV writer raises.
    """
    total_rows = len(dataset)
    output_path = file_name + '.csv'

    # Snapshot of the input columns only; used for rendering the histories.
    features = dataset.drop(columns=['past_data'], errors='ignore')

    if 'past_data' not in dataset.columns:
        dataset['past_data'] = None
    past_col = dataset.columns.get_loc('past_data')

    progress_bar = tqdm(total=total_rows)
    try:
        for start in range(0, total_rows, batch_size):
            stop = min(start + batch_size, total_rows)
            dataset.iloc[start:stop, past_col] = [
                get_previous_15_minutes_data_combined(features, pos)
                for pos in range(start, stop)
            ]
            # Flush the finished batch: first batch creates the file with a header,
            # later batches append without one.
            first_batch = (start == 0)
            dataset.iloc[start:stop].to_csv(
                output_path,
                index=False,
                mode='w' if first_batch else 'a',
                header=first_batch,
            )
            progress_bar.update(stop - start)
    except Exception as e:
        print(f"Error processing data: {e}")
        raise
    finally:
        progress_bar.close()

# Process the evaluation split (not the training split) in batches of 1000 rows
# and write it to 'test_dataset.csv'.
process_and_save_dataset(eval_dataset, 1000, 'test_dataset')

In [None]:
import pandas as pd

# Load the CSV file produced by the processing cell above.
df = pd.read_csv('/kaggle/working/test_dataset.csv')

# Eyeball the first rows to confirm the 'Asset_ID' and 'Target' columns are present.
print(df.head(10))