In [4]:
## Get the daily CSI 300 constituents and all the constituents that once were in the CSI 300
import pandas as pd
from datetime import timedelta

# Load the data.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is on the original author's machine.
csi_300_20150505 = pd.read_csv(r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\CSI300_Constituents_info\csi300_20150505.csv")
constituent_changes = pd.read_csv(r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\CSI300_Constituents_info\constituent_changes.csv")

# Parse the date columns so they compare equal to the Timestamps generated below.
csi_300_20150505['Date'] = pd.to_datetime(csi_300_20150505['Date'])
constituent_changes['Date'] = pd.to_datetime(constituent_changes['Date'])

# Calendar range covered by the daily lists (both ends inclusive).
start_date = pd.to_datetime('2015-05-05')
end_date = pd.to_datetime('2024-06-25')

# Index the change log by date once, instead of re-filtering the whole frame
# for every calendar day. Row order inside each day is preserved, so a
# Joiner/Leaver pair for the same RIC on the same day is applied in file order.
changes_by_date = {
    change_date: list(zip(day_changes['Change'], day_changes['Constituent RIC']))
    for change_date, day_changes in constituent_changes.groupby('Date')
}

# Maps each calendar date to the constituent list recorded for that date.
constituents_dict = {}

# Start with the initial constituents.
initial_constituents = set(csi_300_20150505['RIC'])
current_constituents = initial_constituents.copy()

# Generate one list per calendar day (weekends and holidays included).
for current_date in pd.date_range(start_date, end_date, freq='D'):
    # The snapshot is taken BEFORE applying the day's changes, so a change
    # dated D first shows up in the list recorded for D + 1.
    # Sorting makes the saved file reproducible: plain set iteration order
    # changes from run to run. key=str keeps the sort safe if a RIC is missing.
    constituents_dict[current_date] = sorted(current_constituents, key=str)

    # Apply changes dated today; any label other than Joiner/Leaver is ignored.
    for change_type, ric in changes_by_date.get(current_date, ()):
        if change_type == 'Joiner':
            current_constituents.add(ric)
        elif change_type == 'Leaver':
            current_constituents.discard(ric)

# Convert the dictionary to a DataFrame with one row per date and the RIC list in one cell.
grouped_result_df = pd.DataFrame(list(constituents_dict.items()),
                                 columns=['Date', 'RICs'])

# Save to CSV
output_grouped_path = r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\CSI300_Constituents_info\daily_csi_300_constituents_grouped.csv"
grouped_result_df.to_csv(output_grouped_path, index=False)

print(f"Grouped Daily CSI 300 Constituents saved to {output_grouped_path}")

## Get all the RICs that were ever part of the CSI 300
import os

all_rics = set()
for rics in constituents_dict.values():
    all_rics.update(rics)

print(f"Total number of unique RICs in the CSI 300: {len(all_rics)}")

# Never commit access tokens to a notebook: read the token from the
# environment instead (None when HF_TOKEN is not set).
# NOTE(review): the token that used to be hardcoded here is in the file's
# history and must be treated as leaked - revoke it and issue a new one.
hf_token = os.environ.get('HF_TOKEN')

## Save all the RICs to a file, changing every .SS suffix to .SH
# Sorted so the output file is identical from run to run.
all_rics = sorted(ric.replace('.SS', '.SH') for ric in all_rics)
output_rics_path = r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\CSI300_Constituents_info\csi_300_all_rics.csv"
pd.DataFrame(all_rics, columns=['RIC']).to_csv(output_rics_path, index=False)

Grouped Daily CSI 300 Constituents saved to C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\CSI300_Constituents_info\daily_csi_300_constituents_grouped.csv
Total number of unique RICs in the CSI 300: 638


In [50]:
## Technical Indicators
import talib as ta
import os

file_path = r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\RawData\Securities\CSI300_All"

# Results are written to a folder relative to the current working directory;
# create it up front so the first to_csv call cannot fail on a missing folder.
output_dir = 'Processed Data'
os.makedirs(output_dir, exist_ok=True)

# Columns every indicator below depends on.
required_columns = ['收盘价(元)', '成交量(股)', '最高价(元)', '最低价(元)']
# Columns whose blanks are replaced with 0 (blank because trading was suspended).
zero_fill_columns = ['成交量(股)', '成交金额(元)', '涨跌幅(%)', '涨跌(元)', '换手率(%)', '均价(元)']

## Read all the files in the folder
files = os.listdir(file_path)

for file in files:
    print(file)
    data = pd.read_csv(os.path.join(file_path, file), encoding='gbk')

    ## Check that the required columns exist and that none of them is entirely empty
    has_required = all(col in data.columns for col in required_columns)
    if has_required and not data[required_columns].isna().all().any():
        ## Fill the blank values with 0 because of the suspension of trading
        data[zero_fill_columns] = data[zero_fill_columns].fillna(0)

        # TA-Lib only accepts double-precision input, so work on float copies
        # (the volume column may have been parsed as integers).
        close = data['收盘价(元)'].astype(float)
        high = data['最高价(元)'].astype(float)
        low = data['最低价(元)'].astype(float)
        volume = data['成交量(股)'].astype(float)

        ## Calculate the technical indicators
        data['MA(5)'] = ta.SMA(close, timeperiod=5)
        data['MA(30)'] = ta.SMA(close, timeperiod=30)
        data['MA(60)'] = ta.SMA(close, timeperiod=60)
        data['EMA(5)'] = ta.EMA(close, timeperiod=5)
        data['EMA(30)'] = ta.EMA(close, timeperiod=30)
        data['EMA(60)'] = ta.EMA(close, timeperiod=60)
        # Each MACD parameter set gets its own signal/histogram columns.
        # Previously the (12,26,9) and (30,60,30) variants both wrote to
        # 'MACD_SIGNAL' / 'MACD_HIST', so the (12,26,9) values were overwritten.
        data['MACD(6,15,6)'], data['MACD_SIGNAL(6,15,6)'], data['MACD_HIST(6,15,6)'] = ta.MACD(close, fastperiod=6, slowperiod=15, signalperiod=6)
        data['MACD(12,26,9)'], data['MACD_SIGNAL(12,26,9)'], data['MACD_HIST(12,26,9)'] = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
        data['MACD(30,60,30)'], data['MACD_SIGNAL(30,60,30)'], data['MACD_HIST(30,60,30)'] = ta.MACD(close, fastperiod=30, slowperiod=60, signalperiod=30)
        data['RSI(14)'] = ta.RSI(close, timeperiod=14)
        data['WILLR(14)'] = ta.WILLR(high, low, close, timeperiod=14)
        data['MOM(14)'] = ta.MOM(close, timeperiod=14)
        data['CMO(14)'] = ta.CMO(close, timeperiod=14)
        data['ULTOSC(7,14,28)'] = ta.ULTOSC(high, low, close, timeperiod1=7, timeperiod2=14, timeperiod3=28)
        data['OBV'] = ta.OBV(close, volume)
        data['ADOSC(3,10)'] = ta.ADOSC(high, low, close, volume, fastperiod=3, slowperiod=10)
        data['BBANDS_UPPER'], data['BBANDS_MIDDLE'], data['BBANDS_LOWER'] = ta.BBANDS(close, timeperiod=5, nbdevup=2, nbdevdn=2, matype=0)
        data['TRIMA(30)'] = ta.TRIMA(close, timeperiod=30)
        data['TRIX(30)'] = ta.TRIX(close, timeperiod=30)
        data['ADX(14)'] = ta.ADX(high, low, close, timeperiod=14)
        data['CCI(14)'] = ta.CCI(high, low, close, timeperiod=14)
        data['CCI(30)'] = ta.CCI(high, low, close, timeperiod=30)
        data['Chaikin(AD)'] = ta.AD(high, low, close, volume)
        data['AROON_UP(14)'], data['AROON_DOWN(14)'] = ta.AROON(high, low, timeperiod=14)
        data['AROONOSC(14)'] = ta.AROONOSC(high, low, timeperiod=14)
        data['ATR(14)'] = ta.ATR(high, low, close, timeperiod=14)
        data['NATR(14)'] = ta.NATR(high, low, close, timeperiod=14)
        data['TRANGE'] = ta.TRANGE(high, low, close)
        # The LINEARREG family is computed once (the duplicated second pass was removed).
        data['LINEARREG(14)'] = ta.LINEARREG(close, timeperiod=14)
        data['LINEARREG_ANGLE(14)'] = ta.LINEARREG_ANGLE(close, timeperiod=14)
        data['LINEARREG_SLOPE(14)'] = ta.LINEARREG_SLOPE(close, timeperiod=14)
        data['LINEARREG_INTERCEPT(14)'] = ta.LINEARREG_INTERCEPT(close, timeperiod=14)

        ## Save the data under 'Processed Data', named after the input file without its extension
        file_name = os.path.splitext(file)[0] + "_Technical_Indicators.csv"
        data.to_csv(os.path.join(output_dir, file_name), index=False)
    else:
        print(f"Data {file} does not contain necessary columns")

000001.SZ.CSV
000002.SZ.CSV
000008.SZ.CSV
000009.SZ.CSV
000024.SZ.CSV
000027.SZ.CSV
000039.SZ.CSV
000046.SZ.CSV
000060.SZ.CSV
000061.SZ.CSV
000063.SZ.CSV
000066.SZ.CSV
000069.SZ.CSV
000100.SZ.CSV
000156.SZ.CSV
000157.SZ.CSV
000166.SZ.CSV
000301.SZ.CSV
000333.SZ.CSV
000338.SZ.CSV
000400.SZ.CSV
000401.SZ.CSV
000402.SZ.CSV
000408.SZ.CSV
000413.SZ.CSV
000415.SZ.CSV
000423.SZ.CSV
000425.SZ.CSV
000503.SZ.CSV
000536.SZ.CSV
000538.SZ.CSV
000539.SZ.CSV
000540.SZ.CSV
000553.SZ.CSV
000555.SZ.CSV
000559.SZ.CSV
000568.SZ.CSV
000581.SZ.CSV
000596.SZ.CSV
000598.SZ.CSV
000617.SZ.CSV
000623.SZ.CSV
000625.SZ.CSV
000627.SZ.CSV
000629.SZ.CSV
000630.SZ.CSV
000651.SZ.CSV
000656.SZ.CSV
000661.SZ.CSV
000671.SZ.CSV
000686.SZ.CSV
000703.SZ.CSV
000708.SZ.CSV
000709.SZ.CSV
000712.SZ.CSV
000718.SZ.CSV
000723.SZ.CSV
000725.SZ.CSV
000728.SZ.CSV
000729.SZ.CSV
000733.SZ.CSV
000738.SZ.CSV
000750.SZ.CSV
000768.SZ.CSV
000776.SZ.CSV
000778.SZ.CSV
000783.SZ.CSV
000786.SZ.CSV
000792.SZ.CSV
000793.SZ.CSV
000800.SZ.CSV
000807

In [51]:
## Read all the files in the folder and drop rows with empty values
file_path = r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\Processed Data"
files = os.listdir(file_path)

for file in files:
    print(file)
    csv_path = os.path.join(file_path, file)
    data = pd.read_csv(csv_path)

    # 'Unnamed: 22' is dropped when present; errors='ignore' makes this a no-op otherwise.
    # NOTE(review): presumably an empty trailing column in the raw export - confirm.
    data = data.drop(columns=['Unnamed: 22'], errors='ignore')
    # Remove every row that still has a missing value in any column.
    data = data.dropna()

    # Overwrite the file that was just read. Writing to a cwd-relative
    # 'Processed Data/' folder (as before) only hit the same file when the
    # notebook happened to run from the project root.
    data.to_csv(csv_path, index=False)

000001.SZ_Technical_Indicators.csv
000002.SZ_Technical_Indicators.csv
000008.SZ_Technical_Indicators.csv
000009.SZ_Technical_Indicators.csv
000024.SZ_Technical_Indicators.csv
000027.SZ_Technical_Indicators.csv
000039.SZ_Technical_Indicators.csv
000046.SZ_Technical_Indicators.csv
000060.SZ_Technical_Indicators.csv
000061.SZ_Technical_Indicators.csv
000063.SZ_Technical_Indicators.csv
000066.SZ_Technical_Indicators.csv
000069.SZ_Technical_Indicators.csv
000100.SZ_Technical_Indicators.csv
000156.SZ_Technical_Indicators.csv
000157.SZ_Technical_Indicators.csv
000166.SZ_Technical_Indicators.csv
000301.SZ_Technical_Indicators.csv
000333.SZ_Technical_Indicators.csv
000338.SZ_Technical_Indicators.csv
000400.SZ_Technical_Indicators.csv
000401.SZ_Technical_Indicators.csv
000402.SZ_Technical_Indicators.csv
000408.SZ_Technical_Indicators.csv
000413.SZ_Technical_Indicators.csv
000415.SZ_Technical_Indicators.csv
000423.SZ_Technical_Indicators.csv
000425.SZ_Technical_Indicators.csv
000503.SZ_Technical_

In [52]:
## Standardize the data
from sklearn.preprocessing import StandardScaler
import os

file_path = r"C:\Users\yfxx_\Desktop\NB106-Fortune-Robbers-main\Processed Data"
files = os.listdir(file_path)

# Output folder is relative to the current working directory.
output_dir = 'Standardized Data'
os.makedirs(output_dir, exist_ok=True)

for file in files:
    print(file)
    data = pd.read_csv(os.path.join(file_path, file))

    # The first three columns are left untouched; everything after them is scaled.
    feature_columns = list(data.columns[3:])

    # A file with no rows (or no feature columns) cannot be fitted:
    # StandardScaler raises on an empty array. Skip it instead of
    # aborting the whole loop.
    if data.empty or not feature_columns:
        print(f"Skipping {file}: no rows or no feature columns to standardize")
        continue

    ## Standardize the data in the columns after the third column
    # NOTE(review): the scaler is fitted on each file's full history, so every
    # row is scaled with statistics that include later dates - confirm this
    # is acceptable for the downstream model.
    scaler = StandardScaler()
    # Assign by column name so integer-typed columns are replaced by the float
    # result rather than having floats written into an integer column.
    data[feature_columns] = scaler.fit_transform(data[feature_columns])

    ## Save the data
    data.to_csv(os.path.join(output_dir, file), index=False)

000001.SZ_Technical_Indicators.csv
000002.SZ_Technical_Indicators.csv
000008.SZ_Technical_Indicators.csv
000009.SZ_Technical_Indicators.csv
000024.SZ_Technical_Indicators.csv
000027.SZ_Technical_Indicators.csv
000039.SZ_Technical_Indicators.csv
000046.SZ_Technical_Indicators.csv
000060.SZ_Technical_Indicators.csv
000061.SZ_Technical_Indicators.csv
000063.SZ_Technical_Indicators.csv
000066.SZ_Technical_Indicators.csv
000069.SZ_Technical_Indicators.csv
000100.SZ_Technical_Indicators.csv
000156.SZ_Technical_Indicators.csv
000157.SZ_Technical_Indicators.csv
000166.SZ_Technical_Indicators.csv
000301.SZ_Technical_Indicators.csv
000333.SZ_Technical_Indicators.csv
000338.SZ_Technical_Indicators.csv
000400.SZ_Technical_Indicators.csv
000401.SZ_Technical_Indicators.csv
000402.SZ_Technical_Indicators.csv
000408.SZ_Technical_Indicators.csv
000413.SZ_Technical_Indicators.csv
000415.SZ_Technical_Indicators.csv
000423.SZ_Technical_Indicators.csv
000425.SZ_Technical_Indicators.csv
000503.SZ_Technical_