# DL project 
## Prediction or Share price based on previous share price and ESG factor - NON DL model
CSI 300 company

In [None]:
# pip install statsmodels

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

## 1 define the loop

In [2]:
def training_loop_nonDL(X_train, y_train,
                  X_test, y_test):
    test_loss = 0

    import sklearn.metrics as metrics
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import GridSearchCV
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.metrics import mean_squared_error

    pipeline = Pipeline([
        ('regressor', HistGradientBoostingRegressor(random_state=42))])

    param_grid = [{
                    'regressor__max_depth':  np.array([3, 5]),
                    'regressor__l2_regularization': np.array([0, 1])
                }]

    grid = GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        scoring='neg_mean_squared_error',
                        cv=5,
                        verbose=2,
                        n_jobs=-1)

    grid_result = grid.fit(X_train, y_train.ravel())
    pipelineFinal = grid_result.best_estimator_

    scores = cross_val_score(pipelineFinal,
                            X_train,
                            y_train.ravel(),
                            scoring='neg_mean_squared_error',
                            cv=5,
                            verbose=2,
                            n_jobs=-1)


    #print("Cross-validation score is {score:.2f}, standard deviation is {err:.2f}" .format(score=scores.mean(), err=scores.std()))


    test_preds = pipelineFinal.predict(X_test)
    test_loss = mean_squared_error(test_preds, y_test)
    '''
    for epoch in range(n_epochs):
        lstm.train()
        outputs = lstm.forward(X_train) # forward pass
        optimiser.zero_grad() # calculate the gradient, manually setting to 0
        # obtain the loss function
        loss = loss_fn(outputs, y_train)
        loss.backward() # calculates the loss of the loss function
        optimiser.step() # improve from loss, i.e backprop
      
        # test loss
        lstm.eval()
        test_preds = lstm(X_test)
        test_loss = loss_fn(test_preds, y_test)
        if epoch % 100 == 0:
            print("Epoch: %d, train loss: %1.5f, test loss: %1.5f" % (epoch, 
                                                                      loss.item(), 
                                                                      test_loss.item()))
       '''         

    #return test_loss.item()
    return test_loss, pipelineFinal

## get training data

In [3]:
ESG = False
input_size = 6 # number of features (without ESG 6 with 19)
if ESG:
    input_size = 19

''' not used in NonDLmodel
lstm = LSTM(num_classes, 
        input_size, 
        hidden_size, 
        num_layers)
loss_fn = torch.nn.MSELoss()    # mean-squared error for regression
optimiser = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
'''

df2019 = pd.read_excel('data_feature_csi_2019.xlsx', index_col = 'Company_code', parse_dates=True)
df2020 = pd.read_excel('data_feature_csi_2020.xlsx', index_col = 'Company_code', parse_dates=True)
df2021 = pd.read_excel('data_feature_csi_2021.xlsx', index_col = 'Company_code', parse_dates=True)
df2022 = pd.read_excel('data_feature_csi_2022.xlsx', index_col = 'Company_code', parse_dates=True)

  df2019 = pd.read_excel('data_feature_csi_2019.xlsx', index_col = 'Company_code', parse_dates=True)
  df2020 = pd.read_excel('data_feature_csi_2020.xlsx', index_col = 'Company_code', parse_dates=True)
  df2021 = pd.read_excel('data_feature_csi_2021.xlsx', index_col = 'Company_code', parse_dates=True)
  df2022 = pd.read_excel('data_feature_csi_2022.xlsx', index_col = 'Company_code', parse_dates=True)


## define training iteration

In [8]:
def TrainingIteration(plotting, f, filename):


    preview = 241 # How many previous days to use for prediction
    predict = 1 # how many values in the future to predict
    futureStep = 241 # how far in the future to predict

    total = preview+predict



    df = pd.read_csv(f, index_col = 'Date', parse_dates=True)

    #Remove years without ESG Data
    df = df[~(df.index.year == 2018)]
    df = df[~(df.index.year == 2019)]
    X, y = df.drop(columns=['Close']), df.Close.values

    # Add ESG Data --------------------------------------

    #Get Amount of data per year
    X.loc[ X.index.year == 2018].sum(axis=1).count()
    X.loc[ X.index.year == 2019].sum(axis=1).count()
    count2020 = X.loc[ X.index.year == 2020].sum(axis=1).count()
    count2021 =X.loc[ X.index.year == 2021].sum(axis=1).count()
    count2022 =X.loc[ X.index.year == 2022].sum(axis=1).count()
    count2023 =X.loc[ X.index.year == 2023].sum(axis=1).count()

    # create Stacks of ESG values to be added to training data.

    companyID = filename.split('_',1)[0]
    print(companyID)

    ESG2019 = df2019.loc[df2019.index == companyID]
    dub = np.tile(ESG2019.values, (count2020, 1))
    ESG2019_df = pd.DataFrame(dub, columns=ESG2019.columns)

    ESG2020 = df2020.loc[df2020.index == companyID]
    dub = np.tile(ESG2020.values, (count2021, 1))
    ESG2020_df = pd.DataFrame(dub, columns=ESG2020.columns)

    ESG2021 = df2021.loc[df2021.index == companyID]
    dub = np.tile(ESG2021.values, (count2022, 1))
    ESG2021_df = pd.DataFrame(dub, columns=ESG2021.columns)

    ESG2022 = df2022.loc[df2022.index == companyID]
    dub = np.tile(ESG2022.values, (count2023, 1))
    ESG2022_df = pd.DataFrame(dub, columns=ESG2022.columns)


    ESGData = pd.concat([ESG2019_df, ESG2020_df,ESG2021_df,ESG2022_df], ignore_index=True)
    
    ESGData= ESGData.reset_index(drop=True)
    X = X.reset_index(drop=True)
    
    X_ESG = pd.concat([X, ESGData], axis=1)
    print(X.shape)
    print(ESGData.shape)
    print(X_ESG.shape)

    X_ESG = X_ESG.drop('Industry_code', axis=1)
    X_ESG = X_ESG.fillna(0)
    # Add ESG Data --------------------------------------

    # NON ESG version
    #if ESG:
    #    X = X_ESG

    # Data Preprocessing ----------------------------------------------------------------
    mm = MinMaxScaler()
    ss = StandardScaler()

    #extra step to remove really all strings
    X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))
    X = X.fillna(0)

    # Basic Scaling of input 
    X_trans = ss.fit_transform(X)
    y_trans = mm.fit_transform(y.reshape(-1, 1))
    
    # spliting the trainings data into sequences for training
    def split_sequences(input_sequences, output_sequence, n_steps_in, n_steps_out):
        X, y = list(), list() # instantiate X and y
        for i in range(len(input_sequences)):
            # find the end of the input, output sequence
            end_ix = i + n_steps_in
            out_end_ix = end_ix + n_steps_out - 1
            # check if we are beyond the dataset
            if out_end_ix +futureStep > len(input_sequences): break
        
            # gather input and output of the pattern
            #seq_x, seq_y = input_sequences[i:end_ix], output_sequence[end_ix-1:out_end_ix, -1]
            seq_x, seq_y = input_sequences[i:end_ix], output_sequence[futureStep+end_ix-1:futureStep+out_end_ix, -1]
            X.append(seq_x), y.append(seq_y)
        return np.array(X), np.array(y)

    X_ss, y_mm = split_sequences(X_trans, y_trans, preview, predict)

    print(X_ss.shape, y_mm.shape)

    total_samples = len(X)
    train_test_cutoff = total_samples-total #round(0.3 * total_samples)

    #split between train and testing sets
    testSamplesCount = round(0.3 * total_samples)

    X_train = X_ss[:-testSamplesCount]
    X_test = X_ss[-testSamplesCount:]

    y_train = y_mm[:-testSamplesCount]
    y_test = y_mm[-testSamplesCount:] 

    #print("Training Shape:", X_train.shape, y_train.shape)
    #print("Testing Shape:", X_test.shape, y_test.shape) 

    X_train_tensors = Variable(torch.Tensor(X_train))
    X_test_tensors = Variable(torch.Tensor(X_test))

    y_train_tensors = Variable(torch.Tensor(y_train))
    y_test_tensors = Variable(torch.Tensor(y_test))

    selected_timestep = 0
    # to adapte for non DL learning
    #X_train_2d = X_train[:, selected_timestep, :]
    #X_test_2d = X_train[:, selected_timestep, :]
    #y_train_2d = X_train[:, selected_timestep, :]
    #y_test_2d = X_train[:, selected_timestep, :]

    #print("Training Shape:", X_train.shape, y_train.shape)
    #print("Testing Shape:", X_test.shape, y_test.shape) 

    X_train_tensors = Variable(torch.Tensor(X_train))
    X_test_tensors = Variable(torch.Tensor(X_test))

    y_train_tensors = Variable(torch.Tensor(y_train))
    y_test_tensors = Variable(torch.Tensor(y_test))





    try:
        X_train_tensors_final = torch.reshape(X_train_tensors,   
                                    (X_train_tensors.shape[0], preview, 
                                    X_train_tensors.shape[2]))
        X_test_tensors_final = torch.reshape(X_test_tensors,  
                                            (X_test_tensors.shape[0], preview, 
                                            X_test_tensors.shape[2])) 
    
    except:
    #    print("skipped invalid data")
            return -1

    print("Training Shape:", X_train_tensors_final.shape, y_train_tensors.shape)
    print("Testing Shape:", X_test_tensors_final.shape, y_test_tensors.shape) 

    # Training ------------------------------------------------------------------------------
    '''
    finalTestLoss = training_loop(n_epochs=n_epochs,
        lstm=lstm,
        optimiser=optimiser,
        loss_fn=loss_fn,
        X_train=X_train_tensors_final,
        y_train=y_train_tensors,
        X_test=X_test_tensors_final,
        y_test=y_test_tensors)
    '''


    finalTestLoss, best_model = training_loop_nonDL(
        X_train=X_train_tensors_final[:, 0, :],
        y_train=y_train_tensors[:, 0],
        X_test=X_test_tensors_final[:, 0, :],
        y_test=y_test_tensors[:, 0])


    # Ploting ------------------------------------------------------------------------------
    
    test_predict = best_model.predict(X_test_tensors_final[-1, 0, :].unsqueeze(0)) # get the last sample
    #test_predict = test_predict.detach().numpy()
    #test_predict = mm.inverse_transform(test_predict)
    #test_predict = test_predict[0].tolist()

    if plotting:
        """ 
        df_X_ss = ss.transform(X) # old transformers
        df_y_mm = mm.transform(df.Close.values.reshape(-1, 1)) # old transformers
        # split the sequence
        df_X_ss, df_y_mm = split_sequences(df_X_ss, df_y_mm, preview, predict)
        # converting to tensors
        df_X_ss = Variable(torch.Tensor(df_X_ss))
        df_y_mm = Variable(torch.Tensor(df_y_mm))
        # reshaping the dataset
        df_X_ss = torch.reshape(df_X_ss, (df_X_ss.shape[0], preview, df_X_ss.shape[2]))

        train_predict = lstm(df_X_ss) # forward pass
        data_predict = train_predict.data.numpy() # numpy conversion
        dataY_plot = df_y_mm.data.numpy()

        data_predict = mm.inverse_transform(data_predict) # reverse transformation
        dataY_plot = mm.inverse_transform(dataY_plot)
        true, preds = [], []
        for i in range(len(dataY_plot)):
            true.append(dataY_plot[i][0])
        for i in range(len(data_predict)):
            preds.append(data_predict[i][0])
        plt.figure(figsize=(10,6)) #plotting
        plt.axvline(x=train_test_cutoff, c='r', linestyle='--') # size of the training set

        plt.plot(true, label='Actual Data') # actual plot
        plt.plot(preds, label='Predicted Data') # predicted plot
        plt.title('Time-Series Prediction')
        plt.legend()
        plt.savefig("whole_plot.png", dpi=300)
        plt.show() 
        """

        # Ploting 2 ------------------------------------------------------------------------------

        test_target = y_test_tensors[-1].detach().numpy() # last sample again
        test_target = mm.inverse_transform(test_target.reshape(1, -1))
        test_target = test_target[0].tolist()

        plt.plot(test_target, label="Actual Data")
        plt.plot(test_predict, label="nonDL Predictions")
        plt.savefig("NonDL_plot.png", dpi=300)
        plt.show()

        # Ploting 2 ------------------------------------------------------------------------------

        plt.figure(figsize=(10,6)) #plotting
        a = [x for x in range(0, len(y))]
        #a = [x for x in df.index]
        plt.plot(a, y[0:], label='Actual data')
        c = [x for x in range(len(y)-predict, len(y))]
        #plt.plot(c, test_predict, label=f'One-shot multi-step prediction ({predict} days)')
        #plt.axvline(x=len(y)-predict, c='r', linestyle='--')
        plt.axvline(x=train_test_cutoff, c='r', linestyle='--')

        plt.axvline(x=total_samples-count2023, c='b', linestyle='--')
        plt.axvline(x=total_samples-count2023-count2022, c='b', linestyle='--')
        plt.axvline(x=total_samples-count2023-count2022-count2021, c='b', linestyle='--')
        plt.axvline(x=total_samples-count2023-count2022-count2021-count2020, c='b', linestyle='--')
        plt.axvline(x=total_samples, c='b', linestyle='--')

        plt.scatter(len(y), test_predict[0], color='red', marker='x', label='LongTermPrediction')

        plt.legend()
        plt.show()

    return finalTestLoss

# Main

In [11]:
def main():
    safe = True
    iteration = 0
    finalloss = []
    for filename in os.listdir('CSI300_historical_Data'):
        iteration += 1
        print(f"iteration: {iteration}")
        f = os.path.join('CSI300_historical_Data', filename)
        print('read file:' + f)
        if iteration == 15:
            break
        
        loss = TrainingIteration(False, f, filename)
        if loss:
            finalloss.append(loss)

    print("Final Statistic")
    print(finalloss)
    print("Avarage Loss")
    print((sum(finalloss) / len(finalloss)))
    print("Finished")

    #if safe:
        #torch.save(lstm.state_dict(), "Model")
    #    joblib.dump(best_model, 'model_nonDL.pkl')


if __name__ == "__main__":
    main()


iteration: 1
read file:CSI300_historical_Data/300750.SZ_historical_data_231116.csv
300750.SZ
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s


  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=5; total time=

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 2
read file:CSI300_historical_Data/600741.SH_historical_data_231116.csv
600741.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regr

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 3
read file:CSI300_historical_Data/601699.SH_historical_data_231116.csv
601699.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regr

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 4
read file:CSI300_historical_Data/601766.SH_historical_data_231116.csv
601766.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits


  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=1, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


000002.SZ
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.1s
iteration: 6
read file:CSI300_historical_Data/002920.SZ_historical_data_231116.csv
002920.SZ
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s[CV] END regre

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 7
read file:CSI300_historical_Data/603986.SH_historical_data_231116.csv
603986.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.0s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
iteration: 8
read file:CSI300_historical_Data/600009.SH_historical_data_231116.csv
600009.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


iteration: 9
read file:CSI300_historical_Data/601100.SH_historical_data_231116.csv
601100.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=5; total time=   0.1

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 10
read file:CSI300_historical_Data/601919.SH_historical_data_231116.csv
601919.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END reg

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 11
read file:CSI300_historical_Data/600884.SH_historical_data_231116.csv
600884.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END reg

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 12
read file:CSI300_historical_Data/601872.SH_historical_data_231116.csv
601872.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END reg

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 13
read file:CSI300_historical_Data/600436.SH_historical_data_231116.csv
600436.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END reg

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
  X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
[CV] END .................................................... total time=   0.1s
iteration: 14
read file:CSI300_historical_Data/603799.SH_historical_data_231116.csv
603799.SH
(939, 6)
(939, 14)
(939, 20)
(458, 241, 6) (458, 1)
Training Shape: torch.Size([176, 241, 6]) torch.Size([176, 1])
Testing Shape: torch.Size([282, 241, 6]) torch.Size([282, 1])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END regressor__l2_regularization=0, regressor__max_depth=3; total time=   0.1s
[CV] END reg

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


# debugging

In [None]:

for filename in os.listdir('CSI300_historical_Data'):
    f = os.path.join('CSI300_historical_Data', filename)

preview = 241 # How many previous days to use for prediction
predict = 1 # how many values in the future to predict
futureStep = 241 # how far in the future to predict

total = preview+predict

df = pd.read_csv(f, index_col = 'Date', parse_dates=True)

#Remove years without ESG Data
df = df[~(df.index.year == 2018)]
df = df[~(df.index.year == 2019)]
X, y = df.drop(columns=['Close']), df.Close.values

# Add ESG Data --------------------------------------

#Get Amount of data per year
X.loc[ X.index.year == 2018].sum(axis=1).count()
X.loc[ X.index.year == 2019].sum(axis=1).count()
count2020 = X.loc[ X.index.year == 2020].sum(axis=1).count()
count2021 =X.loc[ X.index.year == 2021].sum(axis=1).count()
count2022 =X.loc[ X.index.year == 2022].sum(axis=1).count()
count2023 =X.loc[ X.index.year == 2023].sum(axis=1).count()

# create Stacks of ESG values to be added to training data.

companyID = filename.split('_',1)[0]
print(companyID)

ESG2019 = df2019.loc[df2019.index == companyID]
dub = np.tile(ESG2019.values, (count2020, 1))
ESG2019_df = pd.DataFrame(dub, columns=ESG2019.columns)

ESG2020 = df2020.loc[df2020.index == companyID]
dub = np.tile(ESG2020.values, (count2021, 1))
ESG2020_df = pd.DataFrame(dub, columns=ESG2020.columns)

ESG2021 = df2021.loc[df2021.index == companyID]
dub = np.tile(ESG2021.values, (count2022, 1))
ESG2021_df = pd.DataFrame(dub, columns=ESG2021.columns)

ESG2022 = df2022.loc[df2022.index == companyID]
dub = np.tile(ESG2022.values, (count2023, 1))
ESG2022_df = pd.DataFrame(dub, columns=ESG2022.columns)


ESGData = pd.concat([ESG2019_df, ESG2020_df,ESG2021_df,ESG2022_df], ignore_index=True)

ESGData= ESGData.reset_index(drop=True)
X = X.reset_index(drop=True)

X_ESG = pd.concat([X, ESGData], axis=1)
print(X.shape)
print(ESGData.shape)
print(X_ESG.shape)

X_ESG = X_ESG.drop('Industry_code', axis=1)
X_ESG = X_ESG.fillna(0)
# Add ESG Data --------------------------------------

if ESG:
    X = X_ESG

# Data Preprocessing ----------------------------------------------------------------
mm = MinMaxScaler()
ss = StandardScaler()

#extra step to remove really all strings
X = X.applymap(lambda x: pd.to_numeric(x, errors='coerce'))
X = X.fillna(0)

# Basic Scaling of input 
X_trans = ss.fit_transform(X)
y_trans = mm.fit_transform(y.reshape(-1, 1))

# spliting the trainings data into sequences for training
def split_sequences(input_sequences, output_sequence, n_steps_in, n_steps_out):
    X, y = list(), list() # instantiate X and y
    for i in range(len(input_sequences)):
        # find the end of the input, output sequence
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out - 1
        # check if we are beyond the dataset
        if out_end_ix +futureStep > len(input_sequences): break
    
        # gather input and output of the pattern
        #seq_x, seq_y = input_sequences[i:end_ix], output_sequence[end_ix-1:out_end_ix, -1]
        seq_x, seq_y = input_sequences[i:end_ix], output_sequence[futureStep+end_ix-1:futureStep+out_end_ix, -1]
        X.append(seq_x), y.append(seq_y)
    return np.array(X), np.array(y)

X_ss, y_mm = split_sequences(X_trans, y_trans, preview, predict)

print(X_ss.shape, y_mm.shape)

total_samples = len(X)
train_test_cutoff = total_samples-total #round(0.3 * total_samples)

#split between train and testing sets
testSamplesCount = round(0.3 * total_samples)


selected_timestep = 0


X_train = X_ss[:-testSamplesCount]
X_test = X_ss[-testSamplesCount:]

y_train = y_mm[:-testSamplesCount]
y_test = y_mm[-testSamplesCount:] 


X_train_2d = X_train[:, selected_timestep, :]
X_test_2d = X_train[:, selected_timestep, :]
y_train_2d = X_train[:, selected_timestep, :]
y_test_2d = X_train[:, selected_timestep, :]

#print("Training Shape:", X_train.shape, y_train.shape)
#print("Testing Shape:", X_test.shape, y_test.shape) 

X_train_tensors = Variable(torch.Tensor(X_train_2d))
X_test_tensors = Variable(torch.Tensor(X_test_2d))

y_train_tensors = Variable(torch.Tensor(y_train_2d))
y_test_tensors = Variable(torch.Tensor(y_test_2d))


loss, best_model = TrainingIteration(False, f, filename)


In [None]:
X_train_tensors.shape

In [None]:
X_ss.shape

In [None]:
y_mm.shape

In [None]:

X_train_tensors.shape