In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from xgboost import XGBRegressor



In [2]:
import pickle

# Read the two pickled inputs; each is iterated below as a mapping from a
# stock key to an object with a `.index` (presumably a DataFrame per stock).
# NOTE: unpickling executes arbitrary code -- only load files you trust.
with open('./MomentumFeature.pkl', 'rb') as tech_file:
    tech_data = pickle.load(tech_file)
with open('./FundamentalData.pkl', 'rb') as fun_file:
    fun_data = pickle.load(fun_file)

# Convert every per-stock index to datetimes. With errors='coerce',
# labels that cannot be parsed become NaT instead of raising.
for frames_by_stock in (tech_data, fun_data):
    for stock in frames_by_stock:
        frames_by_stock[stock].index = pd.to_datetime(
            frames_by_stock[stock].index, errors='coerce'
        )

In [3]:
# Rows dated strictly before this timestamp form the training window.
cutoff_date = pd.to_datetime("2020-01-01")

# Filled by the training loop: stocks skipped for lack of usable rows, and
# the fitted pipeline plus mean CV score for every stock that was trained.
stocks_to_remove, model_dict = [], {}

def scale_non_nan(X):
    """Standardize each column of ``X`` using only its non-NaN entries.

    A copy of ``X`` is returned. For every column with more than three
    non-NaN values, those values are replaced by the output of
    ``StandardScaler.fit_transform`` fitted on that same column; NaNs stay
    where they are. Columns with three or fewer non-NaN values are returned
    unchanged.

    NOTE(review): the scaler is re-fitted on whatever ``X`` is passed in, so
    nothing learned from one call is reused by the next.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features (``.columns``, ``.loc`` and ``.copy`` are used).

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``X``; the input itself is not modified.
    """
    result = X.copy()
    standardizer = StandardScaler()

    for name in X.columns:
        present = X[name].notna()

        # Leave the column untouched unless it has more than 3 observed values.
        if present.sum() <= 3:
            continue

        # Fit on the observed values only and write them back in place of
        # the originals; the NaN positions are never touched.
        observed = X[[name]].loc[present].values.reshape(-1, 1)
        result.loc[present, name] = standardizer.fit_transform(observed).flatten()

    return result

# Adapter handed to FunctionTransformer in the pipeline.
def non_nan_scaler(X):
    """Return ``scale_non_nan(X)``: a NaN-preserving, column-wise standardized copy of ``X``."""
    scaled = scale_non_nan(X)
    return scaled

class NonNanStandardScaler(TransformerMixin, BaseEstimator):
    """NaN-preserving column standardizer that remembers its training statistics.

    ``fit`` learns one ``StandardScaler`` per column from that column's
    non-NaN values; columns with ``min_non_nan`` or fewer observed values are
    left unscaled. ``transform`` then applies the *stored* statistics, so
    cross-validation test folds and later ``predict`` inputs are scaled
    exactly like the training data, whatever their size.

    This replaces ``FunctionTransformer(non_nan_scaler)``, which re-fitted the
    scaler on every batch it saw: held-out folds were standardized with their
    own mean/std, and batches with three or fewer rows were not scaled at all
    even though the model had been trained on scaled features.

    Pickled pipelines reference this class by name, so it must be defined
    (or importable) wherever they are unpickled.

    Parameters
    ----------
    min_non_nan : int, default=3
        A column is scaled only if it has strictly more than this many
        non-NaN values at fit time.
    """

    def __init__(self, min_non_nan=3):
        self.min_non_nan = min_non_nan

    def fit(self, X, y=None):
        """Learn per-column scaling statistics from the non-NaN values of ``X``."""
        self.scalers_ = {}
        for col in X.columns:
            observed = X[col].dropna()
            if len(observed) > self.min_non_nan:
                self.scalers_[col] = StandardScaler().fit(
                    observed.to_numpy().reshape(-1, 1)
                )
        return self

    def transform(self, X):
        """Return a copy of ``X`` with fitted columns standardized; NaNs are kept."""
        X_scaled = X.copy()
        for col, scaler in self.scalers_.items():
            mask = X[col].notna()
            # StandardScaler.transform rejects empty input, so skip all-NaN columns.
            if mask.any():
                X_scaled.loc[mask, col] = scaler.transform(
                    X.loc[mask, col].to_numpy().reshape(-1, 1)
                ).ravel()
        return X_scaled


# Train one model per stock on its pre-cutoff history.
for stock, data in list(fun_data.items()):
    # Keep rows before the cutoff that have a return, then make the target the
    # NEXT remaining row's return. The last row has no next value and is dropped.
    # `assign` works on a copy, so the frames stored in `fun_data` are not modified
    # and the position of 'ret' among the columns is preserved.
    data_before_2020 = (
        data[data.index < cutoff_date]
        .dropna(subset=['ret'])
        .assign(ret=lambda frame: frame['ret'].shift(-1))
        .dropna(subset=['ret'])
    )

    # Discard rows where 3 or more of the first five columns are missing.
    first_5_cols = data_before_2020.columns[:5]
    too_sparse = data_before_2020[first_5_cols].isna().sum(axis=1) >= 3
    data_before_2020 = data_before_2020[~too_sparse]

    # Fewer than 12 usable rows: skip this stock and mark it for removal.
    if len(data_before_2020) < 12:
        print("Stock Delete: " + str(stock))
        stocks_to_remove.append(stock)
        continue

    print("Train Stock: " + str(stock))
    y_train = data_before_2020['ret']
    # Features and target come from the same frame, so they are already aligned
    # row for row. Re-selecting with `.loc[y_train.index]` would multiply rows
    # whenever an index label repeats and break that alignment.
    X_train = data_before_2020.drop(columns=['ret', 'Adj_close_price'])

    # Scaling is learned on the rows passed to `fit` and reused for prediction.
    pipeline = Pipeline([
        ('scaler', NonNanStandardScaler()),
        ('xgb', XGBRegressor(objective='reg:squarederror'))
    ])

    # 3-fold cross-validation; scores are negated MSE (closer to 0 is better).
    # NOTE(review): an integer `cv` gives contiguous, unshuffled folds, so on
    # chronological data some folds validate on rows that precede training rows.
    # Consider sklearn.model_selection.TimeSeriesSplit -- confirm intent.
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
    avg_cv_score = np.mean(cv_scores)

    # Final model: refit on the whole training window.
    pipeline.fit(X_train, y_train)

    # Store the fitted pipeline and its mean CV score for this stock.
    model_dict[stock] = {
        'pipeline': pipeline,
        'cv_score': avg_cv_score
    }

# Remove stocks with insufficient data
for stock in stocks_to_remove:
    del fun_data[stock]

Train Stock: 000001
Train Stock: 000002
Train Stock: 000063
Train Stock: 000100
Train Stock: 000157
Train Stock: 000166
Train Stock: 000301
Train Stock: 000333
Train Stock: 000338
Train Stock: 000408
Train Stock: 000425
Train Stock: 000538
Train Stock: 000568
Train Stock: 000596
Train Stock: 000617
Train Stock: 000625
Train Stock: 000651
Train Stock: 000661
Train Stock: 000708
Train Stock: 000725
Train Stock: 000733
Train Stock: 000768
Train Stock: 000776
Train Stock: 000786
Train Stock: 000792
Train Stock: 000800
Train Stock: 000807
Train Stock: 000858
Train Stock: 000876
Train Stock: 000895
Train Stock: 000938
Train Stock: 000963
Train Stock: 000977
Train Stock: 000983
Train Stock: 000999
Stock Delete: 001289
Stock Delete: 001965
Train Stock: 001979
Train Stock: 002001
Train Stock: 002007
Train Stock: 002027
Train Stock: 002049
Train Stock: 002050
Train Stock: 002074
Train Stock: 002129
Train Stock: 002142
Train Stock: 002179
Train Stock: 002180
Train Stock: 002230
Train Stock: 00223

In [None]:
# Read back a previously saved model dictionary.
# NOTE: unpickling executes arbitrary code -- only load files you trust.
# NOTE(review): no cell visible in this section writes 'model_dict.pkl';
# confirm where the file is produced before relying on this cell.
with open('model_dict.pkl', 'rb') as model_file:
    loaded_model_dict = pickle.load(model_file)

In [5]:
# Persist `fun_data` (after the training cell has deleted the stocks listed
# in `stocks_to_remove`) for use outside this notebook.
with open('FunDataCleaned.pkl', 'wb') as cleaned_file:
    pickle.dump(fun_data, cleaned_file)