In [250]:
import pandas as pd 
import numpy as np
import os
import sys
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

DEFAULT_TODAY = (datetime.today() - timedelta(days=365)).strftime('%Y%m%d')
DEFAULT_LASTYEAR = datetime.today().strftime('%Y%m%d')

os.chdir(sys.path[0] + "\\..")
os.getcwd()

from pysrc import utils


In [251]:
RATING_WINDOW = 30

PREDICT_WINDOW = 7
prev_d = 5 * PREDICT_WINDOW // 7


In [252]:
dirPath = "data/raw/"
tagDirPath = dirPath + "sohu_stock/"
data_date_lst = [] 
for filename in os.listdir(tagDirPath):
    if os.path.isdir(tagDirPath + filename):
        data_date_lst.append(datetime.strptime(filename, '%Y-%m-%d').date())

date = max(data_date_lst).strftime('%Y-%m-%d')
tag_df = pd.read_csv(tagDirPath + date + ".csv")

tag_df["stockCode"] = tag_df['stockCode'].astype("str").apply(lambda x: x.zfill(6)).to_list()
tag_df = tag_df[~tag_df['sector'].isin(["央视50_","AH股","分拆预期","破净股","证金持股","上证50_","HS300_","上证180_","中证500","深成500","深证100R","标准普尔"])]
tag_df.drop(['date'],axis=1,inplace=True)
tag_df

Unnamed: 0,stockCode,stockName,sector,sectorCount
0,600337,美克家居,2025规划,81
1,300720,海川智能,2025规划,81
2,000810,创维数字,2025规划,81
3,603111,康尼机电,2025规划,81
4,300729,乐歌股份,2025规划,81
...,...,...,...,...
63948,300900,广联航空,黑龙江板块,41
63949,600853,龙建股份,黑龙江板块,41
63950,301371,敷尔佳,黑龙江板块,41
63951,688459,哈铁科技,黑龙江板块,41


### Pricing

In [253]:
### Pricing
price_df = pd.read_csv("price.csv")
price_df

Unnamed: 0,date,sector,close,open,high,low,volume,turnover,outstanding
0,2020-08-26,2025规划,17.099342,17.376184,17.710921,16.897105,21750333.0,3.160960e+10,7.363291e+07
1,2020-08-26,3D打印,16.533333,16.953556,17.242000,16.352889,7332533.0,8.609245e+09,3.855253e+07
2,2020-08-26,3D摄像头,32.694000,33.891000,34.307000,32.424000,4176112.0,1.155246e+10,1.723285e+07
3,2020-08-26,3D玻璃,11.558947,11.913158,12.069474,11.425263,8603472.0,1.069712e+10,4.500931e+07
4,2020-08-26,5G概念,20.043140,20.668217,20.945078,19.853488,53489373.0,8.487718e+10,3.117830e+08
...,...,...,...,...,...,...,...,...,...
351142,2023-08-25,鸡肉概念,14.696667,14.840000,15.432222,14.550000,3836519.0,4.220057e+09,1.320531e+07
351143,2023-08-25,鸿蒙概念,21.165000,21.756944,22.165278,20.919167,8617783.0,1.254978e+10,5.845514e+07
351144,2023-08-25,麒麟电池,47.365000,47.648333,48.055000,46.843333,534960.0,2.976663e+09,6.798281e+05
351145,2023-08-25,黄金概念,11.033208,11.125660,11.264340,10.897358,12888744.0,1.008243e+10,4.770991e+07


### Rating

In [254]:
# Custom function to map substrings to values
def map_values(value):
    res = np.nan
    value = str(value)

    if any(np.char.find(['买进','买入','强烈推荐'],value)!=-1):
        res = 3
    elif any(np.char.find(['增持','推荐'],value)!=-1):
        res = 2
    
    if any(np.char.find(['慎推荐','慎增持','优于大市','强于大市'],value)!=-1):
        res = 1.5
    elif any(np.char.find(['持有','观望'],value)!=-1):
        res = 1
    elif any(np.char.find(['中性'],value)!=-1):
        res = 0
    elif any(np.char.find(['弱于大市'],value)!=-1):
        res = -1
    elif any(np.char.find(['减持'],value)!=-1):
        res = -2
    elif any(np.char.find(['卖出'],value)!=-1):
        res = -3

    return res

In [255]:
rating_df = pd.read_csv("个股研报.csv")
rating_df.rename(columns={"publishDate":"date"})
rating_df['date'] = pd.to_datetime(rating_df['publishDate']).dt.date
rating_df["stockCode"] = rating_df['stockCode'].astype("str").apply(lambda x: x.zfill(6)).to_list()
# Filter based on tag_df 
stock_code_list = tag_df['stockCode'].to_list()
rating_df = rating_df[rating_df['stockCode'].isin(stock_code_list)]
rating_df = pd.merge(tag_df, rating_df, how='inner',on=['stockCode','stockName'])

rating_df['stockCount'] = rating_df.groupby('sector')['stockCode'].transform(lambda x:len(set(x)))

rating_df['sRatingValue'] = rating_df['sRatingName'].apply(map_values)
rating_df['emRatingValue'].fillna(rating_df['sRatingValue'], inplace=True)

rating_df = rating_df[['date','stockCode','stockName','sector','stockCount','sectorCount','infoCode','emRatingValue','emRatingName']]


In [256]:
utils.summary(rating_df)

data shape: (1086289, 9)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
date,object,0,0.0,766,,,2021-04-26,2021-04-27,2021-04-26
stockCode,object,0,0.0,3482,,,600337,600337,600337
stockName,object,0,0.0,3481,,,美克家居,美克家居,美克家居
sector,object,0,0.0,483,,,2025规划,2025规划,C2M概念
stockCount,int64,0,0.0,161,1.0,2554.0,59,59,23
sectorCount,int64,0,0.0,202,4.0,3346.0,81,81,40
infoCode,object,0,0.0,79057,,,AP202104261487873071,AP202104271488107503,AP202104261487873071
emRatingValue,float64,23046,2.121535,7,-3.0,3.0,3.0,3.0,3.0
emRatingName,object,24093,2.217918,6,,,买入,买入,买入


In [257]:
# rating_df['i'] = rating_df.groupby("title")['infoCode'].transform(lambda x:len(set(x)))
# # rating_df['i'].drop_duplicates()
# rating_df[rating_df['i']==8][['date','stockName','infoCode','title','author','i']].drop_duplicates().sort_values(["title","date"])

In [169]:
rating_df.sort_values(by=['date','sector'], inplace=True)
df = rating_df
window_size = 30

date_df = price_df[['date']].drop_duplicates()
date_df['date'] = pd.to_datetime(date_df['date']).dt.date
date_df = date_df[date_df['date']>=rating_df['date'].min()]

grouped = df.groupby('sector')

# Initialize lists to store results
total_stock_count = []
increase_from_last_week = []
increase_from_two_weeks_ago = []
weighted_score_avg = []

weekly_rating_df = pd.DataFrame()

# Loop through each sector
for sector, sector_data in grouped:
    # Calculate rolling statistics
    rolling_count = sector_data.groupby(['date','stockCode','infoCode']).size().groupby('date').count().reset_index().rename(columns={0:"count"})
    rolling_count = pd.merge(date_df,rolling_count,how='outer',on='date').sort_values('date').fillna(0)
    rolling_count = rolling_count.set_index('date').rolling(window=window_size,min_periods=1).sum()
    rolling_count = rolling_count[pd.to_datetime(rolling_count.index).weekday==4]

    rolling_weighted_score_avg = sector_data.groupby(['date'])['emRatingValue'].apply(sum).reset_index()
    rolling_weighted_score_avg = pd.merge(date_df,rolling_weighted_score_avg,how='outer',on='date').sort_values('date').fillna(0)
    rolling_weighted_score_avg = rolling_weighted_score_avg.set_index('date').rolling(window=window_size,min_periods=1).apply(lambda x:np.exp(np.linspace(-5, 0, window_size))[-len(x):].dot(x))
    rolling_weighted_score_avg = rolling_weighted_score_avg[pd.to_datetime(rolling_weighted_score_avg.index).weekday==4]
   
    # Create a DataFrame for the sector's rolling statistics
    sector_results = pd.DataFrame({
        'sector': sector,
        'count': rolling_count['count'],
        'emRatingValue': rolling_weighted_score_avg['emRatingValue']
    })

    # Append the sector's rolling statistics to the weekly_rating_df DataFrame
    weekly_rating_df = pd.concat([weekly_rating_df, sector_results])

weekly_rating_df.reset_index(inplace=True)
weekly_rating_df

Unnamed: 0_level_0,sector,count,emRatingValue
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-04-23,2025规划,12.0,24.841970
2021-04-30,2025规划,68.0,127.245341
2021-05-07,2025规划,87.0,119.507659
2021-05-14,2025规划,100.0,73.110498
2021-05-21,2025规划,102.0,34.787164
...,...,...,...
2023-07-28,黑龙江板块,12.0,5.874757
2023-08-04,黑龙江板块,13.0,6.656150
2023-08-11,黑龙江板块,14.0,5.982305
2023-08-18,黑龙江板块,17.0,9.718578


In [173]:
weekly_rating_df.to_csv('weekly_rating_df.csv',index=False)

### Merged

In [261]:
df = price_df
df = df.sort_values(by=['sector', 'date'])
df['log_return'] = df.groupby('sector')['close'].apply(lambda x: x / x.shift(1)).apply(lambda x: pd.np.log(x))

df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
weekly_data = df.groupby('sector').resample('W-FRI').agg({
    'open': 'mean', 'close': 'mean',
    'volume': 'sum', 'turnover': 'sum', 'outstanding': 'mean',
    'high': 'max', 'low': 'min'})

weekly_data['log_return'] = weekly_data.groupby('sector')['close'].apply(lambda x: x / x.shift(1)).apply(lambda x: pd.np.log(x))
weekly_data['log_return'] = weekly_data.groupby('sector')['log_return'].shift(-1)

data = weekly_data
data.reset_index(inplace=True)
data['date'] = data['date'].dt.date
data

Unnamed: 0,sector,date,open,close,volume,turnover,outstanding,high,low,log_return
0,2025规划,2020-08-28,17.311447,17.419254,58977060.0,8.977038e+10,6.095612e+07,17.963684,16.868289,0.022558
1,2025规划,2020-09-04,17.800858,17.816659,102299077.0,1.508144e+11,7.551097e+07,18.336711,17.206400,-0.050363
2,2025规划,2020-09-11,17.127653,16.941573,123443542.0,1.447128e+11,1.150070e+08,18.110933,16.064933,0.010807
3,2025规划,2020-09-18,16.975961,17.125647,92867239.0,1.304577e+11,7.415628e+07,17.532105,16.501733,-0.011406
4,2025规划,2020-09-25,17.074737,16.931421,84060738.0,1.203642e+11,6.127513e+07,17.816579,16.384079,-0.015826
...,...,...,...,...,...,...,...,...,...,...
75620,黑龙江板块,2023-07-28,10.627363,10.648135,37670661.0,2.202696e+10,3.193090e+07,10.884211,10.452895,0.090924
75621,黑龙江板块,2023-08-04,11.749310,11.661687,60499045.0,3.991703e+10,6.107467e+07,12.666667,10.548947,0.001297
75622,黑龙江板块,2023-08-11,11.685436,11.676821,47648535.0,3.137772e+10,4.145987e+07,11.904872,11.481282,-0.003502
75623,黑龙江板块,2023-08-18,11.585692,11.636000,31922522.0,2.169401e+10,1.570421e+07,11.919487,11.335128,-0.039601


Unnamed: 0,date,stockCode,stockName,sector,stockCount,sectorCount,infoCode,emRatingValue,emRatingName
0,2021-04-26,600337,美克家居,2025规划,59,81,AP202104261487873071,3.0,买入
1,2021-04-27,600337,美克家居,2025规划,59,81,AP202104271488107503,3.0,买入
2,2021-04-26,600337,美克家居,C2M概念,23,40,AP202104261487873071,3.0,买入
3,2021-04-27,600337,美克家居,C2M概念,23,40,AP202104271488107503,3.0,买入
4,2021-04-26,600337,美克家居,人工智能,330,488,AP202104261487873071,3.0,买入
...,...,...,...,...,...,...,...,...,...
1086284,2022-01-12,836826,盖世食品,食品饮料,100,126,AP202201121539998338,,
1086285,2023-06-16,836826,盖世食品,食品饮料,100,126,AP202306161591025808,,
1086286,2022-08-25,836826,盖世食品,食品饮料,100,126,AP202208251577620576,3.0,买入
1086287,2022-04-01,836826,盖世食品,食品饮料,100,126,AP202204011556453189,,


In [275]:
df = pd.merge(weekly_rating_df,weekly_data,on=['date','sector'],how='outer')
df = pd.merge(df,rating_df[['sector','stockCount']].drop_duplicates(),on='sector',how='left')
df['count'] = df['count']/df['stockCount']
df['emRatingValue'] = df['emRatingValue']/df['stockCount']
df.drop(columns=['stockCount'])
data = df

cols = data.columns.to_list()
y_cols = ['log_return']
x_cols = [col for col in cols if col not in ['log_return','date','sector']]
x_cols + y_cols

['count',
 'emRatingValue',
 'open',
 'close',
 'volume',
 'turnover',
 'outstanding',
 'high',
 'low',
 'stockCount',
 'log_return']

In [276]:
data

Unnamed: 0,date,sector,count,emRatingValue,open,close,volume,turnover,outstanding,high,low,log_return,stockCount
0,2021-04-23,2025规划,0.203390,0.421050,17.877079,18.027237,83128627.0,1.442410e+11,3.677434e+07,18.375658,17.384474,0.003454,59
1,2021-04-30,2025规划,1.152542,2.156701,18.119605,18.089605,89668529.0,1.615558e+11,4.239846e+07,18.749079,17.502632,-0.032710,59
2,2021-05-07,2025规划,1.474576,2.025554,17.770467,17.507467,43724373.0,7.555516e+10,6.786680e+07,18.181600,17.190000,0.001642,59
3,2021-05-14,2025规划,1.694915,1.239161,17.377040,17.536240,96951739.0,1.478119e+11,5.036038e+07,17.945333,16.890800,0.036532,59
4,2021-05-21,2025规划,1.728814,0.589613,18.056788,18.188713,91293069.0,1.745279e+11,4.657931e+07,18.735000,17.612400,0.032792,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75752,2022-02-04,黑龙江板块,,,,,0.0,0.000000e+00,,,,,15
75753,2022-06-03,黑龙江板块,,,8.916389,9.087083,26412433.0,2.184417e+10,3.365473e+07,9.409722,8.590278,0.013001,15
75754,2022-10-07,黑龙江板块,,,,,0.0,0.000000e+00,,,,,15
75755,2023-01-27,黑龙江板块,,,,,0.0,0.000000e+00,,,,,15


In [274]:
diff_periods = [1, 3]  # Number of lag periods
for diff in diff_periods:
    for col in x_cols:
        data[f'{col}_diff_{diff}'] = data.groupby('sector')[col].diff(diff)
data

Unnamed: 0,date,sector,count,emRatingValue,open,close,volume,turnover,outstanding,high,...,count_diff_3,emRatingValue_diff_3,open_diff_3,close_diff_3,volume_diff_3,turnover_diff_3,outstanding_diff_3,high_diff_3,low_diff_3,stockCount_diff_3
0,2021-04-23,2025规划,0.203390,0.421050,17.877079,18.027237,83128627.0,1.442410e+11,3.677434e+07,18.375658,...,,,,,,,,,,
1,2021-04-30,2025规划,1.152542,2.156701,18.119605,18.089605,89668529.0,1.615558e+11,4.239846e+07,18.749079,...,,,,,,,,,,
2,2021-05-07,2025规划,1.474576,2.025554,17.770467,17.507467,43724373.0,7.555516e+10,6.786680e+07,18.181600,...,,,,,,,,,,
3,2021-05-14,2025规划,1.694915,1.239161,17.377040,17.536240,96951739.0,1.478119e+11,5.036038e+07,17.945333,...,1.491525,0.818111,-0.500039,-0.490997,13823112.0,3.570844e+09,1.358604e+07,-0.430325,-0.493674,0.0
4,2021-05-21,2025规划,1.728814,0.589613,18.056788,18.188713,91293069.0,1.745279e+11,4.657931e+07,18.735000,...,0.576271,-1.567088,-0.062818,0.099107,1624540.0,1.297210e+10,4.180845e+06,-0.014079,0.109768,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75752,2022-02-04,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,-16910837.0,-1.122390e+10,,,,0.0
75753,2022-06-03,黑龙江板块,,,8.916389,9.087083,26412433.0,2.184417e+10,3.365473e+07,9.409722,...,,,-0.914525,-0.757202,6486795.0,9.429367e+09,1.302961e+07,-0.667421,-1.025437,0.0
75754,2022-10-07,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,-25895049.0,-1.964096e+10,,,,0.0
75755,2023-01-27,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,0.0,0.000000e+00,,,,0.0


In [277]:
lag_periods = [1, 3]  # Number of lag periods
for lag in lag_periods:
    for col in x_cols:
        data[f'{col}_lag_{lag}'] = data.groupby('sector')[col].shift(lag)
data

Unnamed: 0,date,sector,count,emRatingValue,open,close,volume,turnover,outstanding,high,...,count_lag_3,emRatingValue_lag_3,open_lag_3,close_lag_3,volume_lag_3,turnover_lag_3,outstanding_lag_3,high_lag_3,low_lag_3,stockCount_lag_3
0,2021-04-23,2025规划,0.203390,0.421050,17.877079,18.027237,83128627.0,1.442410e+11,3.677434e+07,18.375658,...,,,,,,,,,,
1,2021-04-30,2025规划,1.152542,2.156701,18.119605,18.089605,89668529.0,1.615558e+11,4.239846e+07,18.749079,...,,,,,,,,,,
2,2021-05-07,2025规划,1.474576,2.025554,17.770467,17.507467,43724373.0,7.555516e+10,6.786680e+07,18.181600,...,,,,,,,,,,
3,2021-05-14,2025规划,1.694915,1.239161,17.377040,17.536240,96951739.0,1.478119e+11,5.036038e+07,17.945333,...,0.203390,0.421050,17.877079,18.027237,83128627.0,1.442410e+11,3.677434e+07,18.375658,17.384474,59.0
4,2021-05-21,2025规划,1.728814,0.589613,18.056788,18.188713,91293069.0,1.745279e+11,4.657931e+07,18.735000,...,1.152542,2.156701,18.119605,18.089605,89668529.0,1.615558e+11,4.239846e+07,18.749079,17.502632,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75752,2022-02-04,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,9.976357,10.011929,16910837.0,1.122390e+10,1.655254e+07,10.189714,9.798286,15.0
75753,2022-06-03,黑龙江板块,,,8.916389,9.087083,26412433.0,2.184417e+10,3.365473e+07,9.409722,...,,,9.830914,9.844286,19925638.0,1.241480e+10,2.062512e+07,10.077143,9.615714,15.0
75754,2022-10-07,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,11.002014,10.843125,25895049.0,1.964096e+10,3.938766e+07,11.605000,10.511667,15.0
75755,2023-01-27,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,0.0,0.000000e+00,,,,15.0


In [278]:
moving_averages = [1, 3]  # Moving average periods
for ma in moving_averages:
    for col in x_cols:
        data[f'{col}_ma_{ma}'] = data.groupby('sector')[col].rolling(window=ma).mean().reset_index(level='sector',drop=True)
data

Unnamed: 0,date,sector,count,emRatingValue,open,close,volume,turnover,outstanding,high,...,count_ma_3,emRatingValue_ma_3,open_ma_3,close_ma_3,volume_ma_3,turnover_ma_3,outstanding_ma_3,high_ma_3,low_ma_3,stockCount_ma_3
0,2021-04-23,2025规划,0.203390,0.421050,17.877079,18.027237,83128627.0,1.442410e+11,3.677434e+07,18.375658,...,,,,,,,,,,
1,2021-04-30,2025规划,1.152542,2.156701,18.119605,18.089605,89668529.0,1.615558e+11,4.239846e+07,18.749079,...,,,,,,,,,,
2,2021-05-07,2025规划,1.474576,2.025554,17.770467,17.507467,43724373.0,7.555516e+10,6.786680e+07,18.181600,...,0.943503,1.534435,17.922384,17.874770,7.217384e+07,1.271173e+11,4.901320e+07,18.435446,17.359035,59.0
3,2021-05-14,2025规划,1.694915,1.239161,17.377040,17.536240,96951739.0,1.478119e+11,5.036038e+07,17.945333,...,1.440678,1.807138,17.755704,17.711104,7.678155e+07,1.283076e+11,5.354188e+07,18.292004,17.194477,59.0
4,2021-05-21,2025规划,1.728814,0.589613,18.056788,18.188713,91293069.0,1.745279e+11,4.657931e+07,18.735000,...,1.632768,1.284776,17.734765,17.744140,7.732306e+07,1.326316e+11,5.493549e+07,18.287311,17.231067,59.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75752,2022-02-04,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,1.527356e+07,1.068526e+10,,,,15.0
75753,2022-06-03,黑龙江板块,,,8.916389,9.087083,26412433.0,2.184417e+10,3.365473e+07,9.409722,...,,,,,1.743583e+07,1.382838e+10,,,,15.0
75754,2022-10-07,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,8.804144e+06,7.281390e+09,,,,15.0
75755,2023-01-27,黑龙江板块,,,,,0.0,0.000000e+00,,,...,,,,,8.804144e+06,7.281390e+09,,,,15.0


In [279]:
# data.dropna(inplace=True)
# data = data[data['log_return'].notna()]
data.to_csv("featured_price.csv",index=False)


In [221]:
utils.summary(data)

data shape: (72248, 73)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
date,object,0,0.000000,150,,,2021-04-23,2021-04-30,2021-05-07
sector,object,0,0.000000,483,,,2025规划,2025规划,2025规划
count,float64,17799,24.635976,11780,0.0,14.75,0.20339,1.152542,1.474576
emRatingValue,float64,17799,24.635976,51910,0.0,14.901076,0.42105,2.156701,2.025554
open,float64,0,0.000000,71954,2.205879,270.048765,17.877079,18.119605,17.770467
...,...,...,...,...,...,...,...,...,...
turnover_ma_3,float64,966,1.337061,71282,422636237.333333,6082874280823.333008,,,127117315337.333328
outstanding_ma_3,float64,966,1.337061,71282,55365.49,4629839301.272666,,,49013199.406
high_ma_3,float64,966,1.337061,70928,2.27779,275.637647,,,18.435446
low_ma_3,float64,966,1.337061,70827,2.183774,254.464853,,,17.359035


### Archive

In [None]:
df.sort_values()

In [None]:
summary(df)

In [None]:
df = df.sort_values(by=['sector', '日期'])
df['log_return'] = df.groupby('sector')['close'].apply(lambda x: x / x.shift(1)).apply(lambda x: pd.np.log(x))

In [None]:
df

In [None]:
df['date'] = pd.to_datetime(df['日期'])
df.set_index('date', inplace=True)
# weekly_data = df.groupby('sector').resample('W').agg({
#     '开盘': 'mean', '收盘': 'mean',
#     '成交量': 'sum', '成交额': 'sum', '流通股数': 'mean',
#     '最高': 'max', '最低': 'min'})
# weekly_data = weekly_data.rename(columns={"开盘":'open','收盘':'close','成交量':'volume','成交额':'turnover',"最高":'high',"最低":'low','流通股数':'outstanding'})

weekly_data = df.groupby('sector').resample('W-FRI').agg({
    'open': 'mean', 'close': 'mean',
    'volume': 'sum', 'turnover': 'sum', 'outstanding': 'mean',
    'high': 'max', 'low': 'min'})

weekly_data['log_return'] = weekly_data.groupby('sector')['close'].apply(lambda x: x / x.shift(1)).apply(lambda x: pd.np.log(x))
weekly_data['log_return'] = weekly_data.groupby('sector')['log_return'].shift(-1)

data = weekly_data
data.reset_index(inplace=True)

In [None]:
data[data['sector']=='2025规划'][-20:]

In [None]:
weekly_rating_df[weekly_rating_df['sector']=='2025规划'].sort_values('date')[-20:]

In [None]:
# Create lagged features
lag_periods = [1, 5, 10]  # Number of lag periods
for lag in lag_periods:
    for col in ['open', 'close', 'volume', 'turnover', 'outstanding', 'high', 'low']:
        data[f'{col}_lag_{lag}'] = data.groupby('sector')[col].shift(lag)


In [None]:
# Reset the index to separate 'sector' from the index
data.reset_index(level='sector', inplace=True)

# Create moving averages
moving_averages = [5, 10, 20]  # Moving average periods
for ma in moving_averages:
    for col in ['open', 'close', 'volume', 'turnover', 'outstanding', 'high', 'low']:
        data[f'{col}_ma_{ma}'] = data.groupby('sector')[col].rolling(window=ma).mean().reset_index(level='sector', drop=True)


In [None]:

# Calculate price change features
data['price_change'] = data.groupby('sector')['close'].diff()

# Calculate log return features
data['log_return_lag_1'] = data.groupby('sector')['log_return'].shift(1)

# Calculate relative indicators
data['return_vs_lag'] = data['log_return'] - data['log_return_lag_1']
data['return_vs_ma_5'] = data['log_return'] - data['close_ma_5']
data['return_vs_ma_10'] = data['log_return'] - data['close_ma_10']

# Drop rows with NaN due to feature engineering
data.dropna(inplace=True)

In [None]:
data

In [None]:
data = data.reset_index()
data.to_csv("featured_price.csv",index=False)