In [6]:
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import ast

In [2]:
# Load the per-date stock selections. Only the 'Top_25_percent_positive'
# column is kept; each cell in it is a stringified list of stock codes.
file_path = 'Fun_predictions.csv'

selection_raw = pd.read_csv(file_path)
selection_raw['Date'] = pd.to_datetime(selection_raw['Date'])

# Index by date without mutating in place, then narrow to the one column used downstream.
stock_selection = selection_raw.set_index('Date')[['Top_25_percent_positive']]

print(stock_selection.head())

                                      Top_25_percent_positive
Date                                                         
2020-03-31  ['000002', '000617', '000625', '000776', '0009...
2020-06-30  ['000001', '000617', '000776', '000807', '0009...
2020-09-30  ['000002', '000157', '000333', '000408', '0006...
2020-12-31  ['000001', '000425', '000651', '000768', '0007...
2021-03-31  ['000063', '000333', '000408', '000596', '0006...


In [3]:
# Load the pickled mapping of stock code -> feature DataFrame.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# with a trusted MomentumFeature.pkl.
with open('./MomentumFeature.pkl', 'rb') as pkl_in:
    tech_data = pickle.load(pkl_in)

# Normalise every frame's index to datetimes; unparseable labels become NaT
# because of errors='coerce'.
for code, frame in tech_data.items():
    frame.index = pd.to_datetime(frame.index, errors='coerce')

In [4]:
# Downsample every stock's frame to weekly frequency, replacing the entry in
# tech_data. Feature columns are averaged over the week; 'ret' is summed.
for code in list(tech_data):
    source_frame = tech_data[code]
    source_frame.index = pd.to_datetime(source_frame.index, errors='coerce')

    # Frames without a 'ret' column are simply averaged.
    if 'ret' not in source_frame.columns:
        tech_data[code] = source_frame.resample('W').mean()
        continue

    weekly_features = source_frame.drop(columns=['ret']).resample('W').mean()
    # Note: sum() yields 0.0 (not NaN) for a week that contains no rows.
    weekly_ret = source_frame['ret'].resample('W').sum()

    tech_data[code] = pd.concat([weekly_features, weekly_ret], axis=1)

# Preview the first stock's weekly frame.
tech_data[next(iter(tech_data.keys()))].head()

Unnamed: 0_level_0,Adj_close_price,TRIX,RSI,PPO,MOM,MACD,MACDSIGNAL,MACDHIST,CMO,APO,ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2001-01-07,3.066104,,,,,,,,,,-0.022285
2001-01-14,3.087379,,,,,,,,,,0.041385
2001-01-21,3.115887,,57.56583,,0.097864,,,,15.13166,,0.012089
2001-01-28,3.186945,,57.56583,,0.142966,,,,15.13166,,0.0
2001-02-04,3.186945,,57.56583,,0.091482,,,,15.13166,,0.0


In [7]:
# Walk-forward prediction.
# For every selection date in `stock_selection`, fit one XGBoost regressor per
# selected stock on that stock's weekly rows strictly before the date, then
# predict the rows from the date up to (but excluding) the next selection date.
# Result: predictions_dict[date] is a DataFrame indexed by week with one
# column of predicted next-week 'ret' per stock (empty if nothing was predicted).
predictions_dict = {}

# Exclusive upper bound of the prediction window for the final selection date.
last_training_date = pd.to_datetime("2024-09-30")

# Stocks with fewer usable training rows than this are skipped.
MIN_TRAIN_ROWS = 3
# Columns that are not model inputs: the price level and the target itself.
NON_FEATURE_COLS = ['Adj_close_price', 'ret']

for i, (date, row) in enumerate(stock_selection.iterrows()):

    # The selection column stores a list literal as text; parse it safely.
    top_stocks = ast.literal_eval(row['Top_25_percent_positive'])

    next_date = stock_selection.index[i + 1] if i + 1 < len(stock_selection) else last_training_date
    print("Begin "+str(date))

    # Per-stock prediction frames for this date; combined once after the loop
    # instead of re-merging (and re-copying) the growing frame for every stock.
    stock_frames = []

    for stock in top_stocks:

        # The weekly-resampling step tolerates frames without 'ret'; such
        # stocks (or stocks with no data at all) cannot be trained on, so
        # skip them visibly instead of aborting the whole run with a KeyError.
        if stock not in tech_data or 'ret' not in tech_data[stock].columns:
            print(f"  skipping {stock}: no data with a 'ret' column")
            continue

        # Build the supervised frame: features of week t paired with the
        # return of week t+1. Rows with any missing value are dropped, which
        # also removes the final row (it has no next-week return).
        stock_data = tech_data[stock].copy()
        stock_data['ret'] = stock_data['ret'].shift(-1)
        stock_data = stock_data.dropna()

        training_set = stock_data[stock_data.index < date]
        X_train = training_set.drop(columns=NON_FEATURE_COLS)
        y_train = training_set['ret']

        if len(X_train) < MIN_TRAIN_ROWS:
            continue

        prediction_set = stock_data[(stock_data.index >= date) & (stock_data.index < next_date)]
        X_test = prediction_set.drop(columns=NON_FEATURE_COLS)

        if X_test.empty:
            continue

        # Scaling statistics come from the training rows only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        xgb_model = XGBRegressor(objective='reg:squarederror')
        xgb_model.fit(X_train_scaled, y_train)

        X_test_scaled = scaler.transform(X_test)

        predictions = xgb_model.predict(X_test_scaled)

        stock_frames.append(pd.DataFrame(predictions, index=X_test.index, columns=[stock]))

    if stock_frames:
        # Outer-align on the week index (sorted), one column per stock.
        predictions_dict[date] = pd.concat(stock_frames, axis=1, sort=True)
    else:
        predictions_dict[date] = pd.DataFrame()

Begin 2020-03-31 00:00:00
Begin 2020-06-30 00:00:00
Begin 2020-09-30 00:00:00
Begin 2020-12-31 00:00:00
Begin 2021-03-31 00:00:00
Begin 2021-06-30 00:00:00
Begin 2021-09-30 00:00:00
Begin 2021-12-31 00:00:00
Begin 2022-03-31 00:00:00
Begin 2022-06-30 00:00:00
Begin 2022-09-30 00:00:00
Begin 2022-12-31 00:00:00
Begin 2023-03-31 00:00:00
Begin 2023-06-30 00:00:00
Begin 2023-09-30 00:00:00
Begin 2023-12-31 00:00:00
Begin 2024-03-31 00:00:00
Begin 2024-06-30 00:00:00


In [8]:
# Destination for the per-date prediction frames.
output_path = './TechPrediction.pkl'

# Serialise predictions_dict to disk with pickle.
with open(output_path, 'wb') as pkl_out:
    pickle.dump(predictions_dict, pkl_out)

# Echo the path as the cell output.
output_path

'./TechPrediction.pkl'

In [10]:
# Destination for the weekly-resampled per-stock frames.
output_path = './TechDataWeekly.pkl'

# Serialise tech_data (as it stands after weekly resampling) to disk with pickle.
with open(output_path, 'wb') as pkl_out:
    pickle.dump(tech_data, pkl_out)

# Echo the path as the cell output.
output_path

'./TechDataWeekly.pkl'

In [9]:
# Display the frame stored for stock code "000001" as a sanity check.
tech_data["000001"]

Unnamed: 0_level_0,Adj_close_price,TRIX,RSI,PPO,MOM,MACD,MACDSIGNAL,MACDHIST,CMO,APO,ret
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2001-01-07,3.066104,,,,,,,,,,-0.022285
2001-01-14,3.087379,,,,,,,,,,0.041385
2001-01-21,3.115887,,57.565830,,0.097864,,,,15.131660,,0.012089
2001-01-28,3.186945,,57.565830,,0.142966,,,,15.131660,,0.000000
2001-02-04,3.186945,,57.565830,,0.091482,,,,15.131660,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
2024-10-20,11.994000,0.129355,65.126737,7.477524,2.357331,0.542665,0.329525,0.213140,30.253473,0.784350,0.026938
2024-10-27,11.784000,0.226592,59.930992,9.820081,0.888416,0.461739,0.417006,0.044733,19.861984,1.058811,-0.027791
2024-11-03,11.462000,0.286447,53.042808,6.398686,-0.490545,0.284353,0.382656,-0.098303,6.085616,0.708315,-0.024202
2024-11-10,11.658000,0.304148,56.286066,2.112392,-0.294000,0.177041,0.289197,-0.112156,12.572133,0.239902,0.025055
