In [47]:
import pandas as pd

# Load the S&P 500 financial-ratio table. Columns with too many missing
# values are removed further down in this cell.
financial_ratio = pd.read_csv(
    filepath_or_buffer='/Users/shawn/Github/M1/金融機構與風險管理/New_/S_P500 財務比率_rename_V2.csv'
)
def remove_columns_with_high_missing_ratio(df, threshold=0.10):
    """Return a copy of ``df`` without its sparsely populated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    threshold : float, default 0.10
        Largest tolerated share of missing values per column. A column is
        removed only when its share is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``df`` minus the columns whose missing-value share exceeds
        ``threshold``.
    """
    # Mean of the boolean "is missing" mask = fraction of missing cells per column.
    na_share = df.isna().mean()
    too_sparse = na_share.index[na_share.gt(threshold)]
    return df.drop(columns=too_sparse)
# Discard every column in which more than 10% of the values are missing.
financial_ratio = remove_columns_with_high_missing_ratio(df=financial_ratio, threshold=0.10)


In [48]:
# Load the rating table, then rewrite `datadate` in both frames as
# 'YYYY-MM-DD' strings so the two tables share one key format.
rating = pd.read_csv('/Users/shawn/Github/M1/金融機構與風險管理/New_/rating.csv')

# The financial-ratio dates are parsed as 'YYYY/MM/DD'.
financial_ratio['datadate'] = (
    pd.to_datetime(financial_ratio['datadate'], format='%Y/%m/%d')
    .dt.strftime('%Y-%m-%d')
)

# The rating dates are parsed as 'YYYY-MM-DD' and written back in the same layout.
rating['datadate'] = (
    pd.to_datetime(rating['datadate'], format='%Y-%m-%d')
    .dt.strftime('%Y-%m-%d')
)

In [49]:
# Attach the rating symbol (`splticrm`) to each financial-ratio row. The left
# join keeps ratio rows that have no rating for their (datadate, ticker) pair.
merged_df = financial_ratio.merge(
    rating[['datadate', 'ticker', 'splticrm']],
    on=['datadate', 'ticker'],
    how='left',
)
# merged_df['dividend yield'] = merged_df['dividend yield'].str.rstrip('%').astype('float') / 100


# Spot-check one ticker in the merged table and in the raw rating table.
print(merged_df.loc[merged_df['ticker'] == 'AMD', ['datadate', 'ticker', 'splticrm']].head())
print(rating.loc[rating['ticker'] == 'AMD', ['datadate', 'ticker', 'splticrm']].head())

        datadate ticker splticrm
3137  2010-01-31    AMD       B-
3138  2010-02-28    AMD       B-
3139  2010-03-31    AMD       B-
3140  2010-04-30    AMD       B-
3141  2010-05-31    AMD       B-
       datadate ticker splticrm
388  2001-01-31    AMD        B
389  2001-02-28    AMD        B
390  2001-03-31    AMD        B
391  2001-04-30    AMD        B
392  2001-05-31    AMD        B


In [50]:
# Fill missing values by interpolation.
def fill_missing_values_with_interpolation(df, group_col):
    """Linearly interpolate the numeric columns of ``df`` within each group.

    The result has the same shape as the previous ``groupby.apply`` version:
    rows are ordered by group key (original order kept inside each group),
    rows with a missing group key are dropped, the index is reset to
    0..n-1, and numeric values are rounded to 3 decimals.

    Only numeric columns are interpolated. Text columns and the grouping
    column(s) are carried over untouched, so the function does not depend on
    how pandas treats object columns in ``interpolate`` or grouping columns
    in ``groupby.apply`` (both behaviours are deprecated in recent pandas).

    Interpolation uses the row position inside each group, not a date axis.
    NOTE(review): callers are presumably expected to pass rows already in
    chronological order within each group - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    group_col : label or list of labels
        Column(s) identifying the groups (e.g. ``'ticker'``).

    Returns
    -------
    pandas.DataFrame
        New frame with interpolated, rounded numeric columns.
    """
    keys = list(group_col) if pd.api.types.is_list_like(group_col) else [group_col]

    # Reproduce groupby's default ordering: NaN keys dropped, groups sorted
    # by key, original row order preserved inside each group.
    ordered = (
        df.dropna(subset=keys)
        .sort_values(keys, kind='stable')
        .reset_index(drop=True)
    )

    numeric_cols = [
        col for col in ordered.select_dtypes(include='number').columns
        if col not in keys
    ]
    if numeric_cols and not ordered.empty:
        # transform keeps the row alignment, so the result can be written
        # straight back without touching the non-numeric columns.
        filled = ordered.groupby(keys)[numeric_cols].transform(
            lambda values: values.interpolate(method='linear')
        )
        ordered[numeric_cols] = filled.round(3)
    return ordered
# Interpolate the gaps separately for every ticker.
merged_df = fill_missing_values_with_interpolation(df=merged_df, group_col='ticker')


In [51]:
# financial_ratio and rating do not cover the same period, so rows that are
# still without a rating after the merge are removed.
merged_df = merged_df.dropna(subset=['splticrm'])
def filter_rows(group):
    """Keep the rows of ``group`` where ``splticrm`` differs from the row above.

    The first row is always kept, because its shifted predecessor is missing
    and therefore compares as different. The original index labels of the
    surviving rows are preserved.
    """
    previous = group['splticrm'].shift()
    rating_changed = group['splticrm'].ne(previous)
    return group.loc[rating_changed]
# Keep only the rating-change rows of every ticker.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# iteration always yields the full group (including `ticker`), whereas
# apply's handling of the grouping column is deprecated in recent pandas and
# would leave the result without the `ticker` column needed just below.
merged_df = pd.concat(
    [filter_rows(ticker_rows) for _, ticker_rows in merged_df.groupby('ticker')],
    ignore_index=True,
)
merged_df = fill_missing_values_with_interpolation(merged_df, 'ticker')


In [52]:
# merged_df still holds every ticker; merged_df2 keeps only the tickers that
# have more than one row left.
merged_df2 = merged_df.groupby('ticker').filter(lambda ticker_rows: ticker_rows.shape[0] > 1)
print(len(merged_df2))

merged_df2 = fill_missing_values_with_interpolation(df=merged_df2, group_col='ticker')

668


In [53]:
# Numeric score for each rating symbol found in `splticrm`; a higher score
# means a better rating. The steps between neighbouring symbols are not
# uniform (they widen towards the top of the scale).
# NOTE(review): symbols missing from this table (e.g. 'CCC-') are turned into
# NaN by .map() below - confirm that the data contains no such symbols.
ratings_map = {
    'D': 0,
    'CC': 1,
    'CCC': 2,
    'CCC+': 3,
    'B-': 5,
    'B': 6,
    'B+': 7,
    'BB-': 8,
    'BB': 9,
    'BB+': 10,
    'BBB-': 12,
    'BBB': 14,
    'BBB+': 16,
    'A-': 19,
    'A': 22,
    'A+': 25,
    'AA-': 28,
    'AA': 32,
    'AA+': 36,
    'AAA': 40,
}

# Convert the rating symbols into their numeric score.
merged_df2['rating'] = merged_df2['splticrm'].map(ratings_map)

In [54]:
def process_data(group):
    """Add previous-rating and period-over-period change columns to one group.

    Added columns, in this order:

    * ``prev_rating``  - ``rating`` of the preceding row.
    * ``rating_diff``  - ``rating`` minus ``prev_rating``.
    * ``<col>_change`` - relative change versus the preceding row, one column
      for every numeric column present at that point (this includes
      ``rating``, ``prev_rating`` and ``rating_diff`` themselves).

    Rows without a ``prev_rating`` (the first row of the group) are dropped
    from the result.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single ticker; must contain a ``rating`` column. The frame
        passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra columns appended after the original ones.
    """
    # Work on a copy: functions handed to groupby must not mutate their input.
    group = group.copy()

    # Keep the rating of the previous period.
    group['prev_rating'] = group['rating'].shift(1)
    group['rating_diff'] = group['rating'] - group['prev_rating']

    # Rate of change for every numeric column, built in a single step rather
    # than by inserting columns one at a time. fill_method=None states
    # explicitly that gaps are not forward-filled before the change is
    # computed (the implicit fill is deprecated in recent pandas).
    numeric_cols = [
        col for col in group.columns
        if pd.api.types.is_numeric_dtype(group[col])
    ]
    changes = group[numeric_cols].pct_change(fill_method=None).add_suffix('_change')
    group = pd.concat([group, changes], axis=1)

    # Drop the rows that have no previous rating (the prev_rating column stays).
    return group.dropna(subset=['prev_rating'])

In [55]:
import numpy as np
# Build the change features ticker by ticker.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# iteration always hands process_data the full group (including `ticker`),
# whereas apply's handling of the grouping column is deprecated in recent
# pandas and would leave merged_df3 without the `ticker` column used below.
merged_df3 = pd.concat(
    [process_data(ticker_rows) for _, ticker_rows in merged_df2.groupby('ticker')],
    ignore_index=True,
)
# Handle missing values and infinities once more.
merged_df3 = fill_missing_values_with_interpolation(merged_df3, 'ticker')
# Relative changes are infinite when the previous value is 0; every column
# that contains at least one +/-inf is removed entirely.
cols_with_inf = merged_df3.columns.to_series()[merged_df3.isin([np.inf, -np.inf]).any()]
merged_df3 = merged_df3.drop(cols_with_inf.index, axis=1)
merged_df3 = fill_missing_values_with_interpolation(merged_df3, 'ticker')
# Whatever is still missing cannot be filled: drop those rows.
merged_df3 = merged_df3.dropna().reset_index(drop=True)




In [56]:
# Save the engineered feature table; with the default settings the row index
# is written as the first column.
merged_df3.to_csv(
    path_or_buf='/Users/shawn/Github/M1/金融機構與風險管理/New_/SP500_change_V5_rename.csv'
)

In [57]:
# Random train/test sampling.


def _trim_to_model_columns(frame, n_leading=55, n_trailing=3):
    """Drop the first ``n_leading`` and then the last ``n_trailing`` columns.

    With the defaults the first remaining column is ``rating_diff``, followed
    by the ``*_change`` features (see the preview at the end of this cell).
    NOTE(review): the three trailing columns are presumably the change
    columns derived from rating / prev_rating / rating_diff - confirm.
    """
    frame = frame.drop(frame.columns[:n_leading], axis=1)
    if n_trailing:
        frame = frame.drop(frame.columns[-n_trailing:], axis=1)
    return frame


def split_train_test(df, train_frac, seed):
    """Split ``df`` into disjoint random train and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Full feature table with unique index labels.
    train_frac : float
        Share of rows sampled into the training set.
    seed : int
        ``random_state`` for the sample, so the split is reproducible.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, both with a fresh 0..n-1 index and trimmed to the
        model columns.
    """
    train_rows = df.sample(frac=train_frac, random_state=seed)
    # The test rows must be selected with the ORIGINAL labels of the sampled
    # rows. Resetting the training index first (as the previous code did)
    # makes drop() remove rows 0..n_train-1 instead of the sampled ones, so
    # the test set would overlap the training set.
    test_rows = df.drop(train_rows.index)
    train = _trim_to_model_columns(train_rows.reset_index(drop=True))
    test = _trim_to_model_columns(test_rows.reset_index(drop=True))
    return train, test


# 70-30 training-testing data (fixed seed keeps the split reproducible)
train_df70, test_df30 = split_train_test(merged_df3, train_frac=0.7, seed=73)

# 60-40 training-testing data (fixed seed keeps the split reproducible)
train_df60, test_df40 = split_train_test(merged_df3, train_frac=0.6, seed=64)

train_df70.to_csv('/Users/shawn/Github/M1/金融機構與風險管理/New_/train_df70.csv')
test_df30.to_csv('/Users/shawn/Github/M1/金融機構與風險管理/New_/test_df30.csv')
train_df60.to_csv('/Users/shawn/Github/M1/金融機構與風險管理/New_/train_df60.csv')
test_df40.to_csv('/Users/shawn/Github/M1/金融機構與風險管理/New_/test_df40.csv')


train_df70.head()


Unnamed: 0,rating_diff,after-tax interest coverage_change,interest coverage ratio_change,cash flow/ total debt_change,operating margin before dep._change,return on equity_change,total debt/ total assets_change,book/ market_change,interest/ average LTD_change,interest/ average total debt_change,...,"P/E (Diluted, Excl. EI)_change","P/E (Diluted, Incl. EI)_change",price/ sales_change,price/ cash flow_change,gross profit margin_change,after-tax return on average common equity_change,after-tax return on average stockholders' equity_change,gross profit/ total assets_change,common equity/invested capital_change,cash flow margin_change
0,3.0,0.357,0.264,-0.315,0.252,1.023,0.377,-0.63,-0.194,-0.194,...,0.096,0.061,0.734,0.186,0.006,1.096,1.096,0.062,-0.173,0.157
1,-3.0,3.501,-1.099,0.0,-0.59,3.658,-0.104,-0.203,0.033,0.069,...,-0.722,-0.722,1.152,0.132,-0.176,3.683,3.683,-0.493,0.019,302.0
2,-3.0,1.748,-5.725,0.0,-0.889,1.761,0.1,0.938,0.043,0.043,...,-0.879,-0.889,-0.182,-18.223,-0.715,1.801,1.801,-0.722,-0.281,6.875
3,-3.0,-0.039,0.645,-1.031,-6.0,0.428,0.097,0.888,1.028,0.056,...,-0.966,-0.964,0.378,-0.852,-1.867,0.451,0.451,-1.909,1.835,0.148
4,-5.0,-0.733,-0.689,-0.077,0.055,-0.436,0.232,-0.165,0.481,0.296,...,0.935,0.935,0.586,0.974,0.022,-0.419,-0.419,-0.304,-0.069,-0.09
