In [64]:
import pandas as pd
import numpy as np
import random
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime, timedelta
import numpy as np
from scipy.optimize import minimize, basinhopping


# Build a synthetic monthly bond data set.
# Seed the stdlib RNG first so that Restart & Run All reproduces the same frame.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Generate 5,000 synthetic ISIN identifiers (ISIN001 ... ISIN5000)
isin_list = ['ISIN{:03}'.format(i) for i in range(1, 5001)]

# Rows are accumulated here and turned into a DataFrame once at the end
data = []

# Per-ISIN attributes that stay constant across all months
coupon_rates = {isin: random.uniform(0.01, 0.10) for isin in isin_list}
market_prices = {isin: random.uniform(80, 120) for isin in isin_list}
credit_ratings = {isin: random.choice([5, 4, 3, 2, 1, 0]) for isin in isin_list}

# Candidate industries (drawn at random for every row)
industries = ['Technology', 'Finance', 'Healthcare', 'Energy', 'Consumer Goods']

# One observation per ISIN on the first day of every month in the range
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)
current_date = start_date

while current_date <= end_date:
    for i, isin in enumerate(isin_list):
        oas = random.uniform(0.1, 2.0)  # random OAS
        t_spread = random.uniform(0.05, 1.0) * i / 100   # random T-Spread, scaled by the ISIN's position in the list
        z_spread = random.uniform(0.2, 3.0)  # random Z-Spread
        remaining_years = random.uniform(0.1, 50)  # random remaining maturity in years
        maturity_date = current_date.replace(year=current_date.year + 1)  # maturity date set to one year after the observation date
        payment_frequency = random.choice([1, 2, 4])  # random number of coupon payments
        coupon_rate = coupon_rates[isin]  # constant per ISIN
        market_price = market_prices[isin]  # constant per ISIN
        credit_rating = credit_ratings[isin]  # constant per ISIN
        industry = random.choice(industries)  # random industry (re-drawn every month)
        flag_SDGs = random.choice([True, False])

        # Append the row
        data.append([current_date, isin, oas, t_spread, z_spread, remaining_years, maturity_date, payment_frequency, coupon_rate, market_price, credit_rating, industry, flag_SDGs])

    # Advance to the first day of the next month
    if current_date.month == 12:
        current_date = current_date.replace(year=current_date.year + 1, month=1, day=1)
    else:
        current_date = current_date.replace(month=current_date.month + 1, day=1)

# Build the DataFrame
columns = ['日付', 'ISIN', 'OAS', 'T-Spread', 'Z-Spread', '残存年数', '償還日', '利払い回数', 'クーポン率', '時価', '信用格付け', '業種', 'merged_green_flag']
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,日付,ISIN,OAS,T-Spread,Z-Spread,残存年数,償還日,利払い回数,クーポン率,時価,信用格付け,業種,merged_green_flag
0,2023-01-01,ISIN001,0.193857,0.000000,1.738140,26.056285,2024-01-01,1,0.084907,102.212496,2,Consumer Goods,True
1,2023-01-01,ISIN002,0.429830,0.003659,2.689048,46.655404,2024-01-01,2,0.081814,104.025152,4,Finance,False
2,2023-01-01,ISIN003,1.083203,0.015246,2.610036,34.036414,2024-01-01,4,0.091181,116.875081,0,Consumer Goods,False
3,2023-01-01,ISIN004,1.385097,0.002934,2.043499,22.082755,2024-01-01,1,0.039919,82.900639,2,Technology,False
4,2023-01-01,ISIN005,1.018491,0.024689,2.745670,40.610696,2024-01-01,2,0.037809,114.183278,2,Healthcare,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2023-12-01,ISIN4996,0.453430,7.300588,2.357653,34.701413,2024-12-01,4,0.081302,118.085144,1,Technology,False
59996,2023-12-01,ISIN4997,0.292044,13.279362,2.882587,4.232333,2024-12-01,1,0.087387,117.043084,5,Finance,True
59997,2023-12-01,ISIN4998,0.877654,41.187529,2.817387,2.564015,2024-12-01,4,0.050446,91.661924,4,Consumer Goods,False
59998,2023-12-01,ISIN4999,0.428110,39.279042,0.801414,39.064355,2024-12-01,4,0.075838,85.537160,4,Consumer Goods,True


In [53]:
# Nelson-Siegelモデルのパラメータ推計の計算最適化を含む関数
# Nelson-Siegel model definition
def nelson_siegel_model(t, beta0, beta1, beta2, tau):
    """Evaluate the Nelson-Siegel curve at maturity ``t``.

    The curve is ``beta0 + (beta1 + beta2) * L(x) - beta2 * exp(-x)`` with
    ``x = t / tau`` and ``L(x) = (1 - exp(-x)) / x``.

    Args:
        t: Maturity or maturities (scalar or array-like supporting arithmetic).
        beta0: Level parameter.
        beta1: Slope parameter.
        beta2: Curvature parameter.
        tau: Decay parameter.

    Returns:
        The model value(s) at ``t``. At ``t == 0`` the analytic limit
        ``beta0 + beta1`` is returned instead of NaN.
    """
    x = t / tau
    decay = np.exp(-x)
    # (1 - exp(-x)) / x is 0/0 at x == 0; its limit there is 1.
    with np.errstate(divide='ignore', invalid='ignore'):
        loading = np.where(x == 0, 1.0, (1 - decay) / x)
    return beta0 + (beta1 + beta2) * loading - beta2 * decay

def plot_data_and_fit_ns_model_optimized(df, metric, date_column, sector_column, rating_column, maturity_column, rating_split, maturity_step, min_num_of_data, flag_column):
    """Fit Nelson-Siegel curves with ``curve_fit`` and write the plots to PDF.

    For every date in ``date_column`` a file ``<YYYY-MM-DD>_<metric>_split.pdf``
    is written with one page per sector. Each page stacks ``rating_split``
    panels, one per rating quantile bucket. A panel shows the raw observations
    coloured by ``flag_column`` and, when enough maturity bins are available,
    the curve fitted to the per-bin averages of ``metric``.

    NOTE: a later cell of this notebook defines a function with the same name;
    whichever definition is executed last is the one that gets called.

    Args:
        df : pd.DataFrame
            Input data.
        metric : str
            Target variable (y axis).
        date_column : str
            Date column; its values must provide ``strftime``.
        sector_column : str
            Sector column.
        rating_column : str
            Credit-rating column (passed to ``pd.qcut``).
        maturity_column : str
            Remaining-maturity column.
        rating_split : int
            Number of quantile buckets for the credit rating.
        maturity_step : int
            Bin width for the remaining maturity.
        min_num_of_data : int
            Minimum number of maturity bins required to attempt a fit.
        flag_column : str
            Flag column used to colour the scatter (truthy -> red, else blue).
    """
    # Group by date: one PDF per date
    for date, date_group in df.groupby(date_column):
        formatted_date = date.strftime('%Y-%m-%d')
        with PdfPages(f'{formatted_date}_{metric}_split.pdf') as pdf:
            # One page per sector
            for sector in date_group[sector_column].unique():
                # Copy so the helper column is not written onto a slice of `df`
                sector_df = date_group[date_group[sector_column] == sector].copy()

                # Split the credit rating into the requested number of quantiles
                sector_df['RatingGroup'] = pd.qcut(sector_df[rating_column], rating_split, labels=False, duplicates='drop')

                # squeeze=False keeps `axes` indexable even when rating_split == 1
                fig, axes = plt.subplots(rating_split, 1, figsize=(10, 6 * rating_split), squeeze=False)
                axes = axes.ravel()
                fig.suptitle(f'{sector} - {date}')

                for i in range(rating_split):
                    ax = axes[i]
                    rating_group_df = sector_df[sector_df['RatingGroup'] == i]
                    # Average of the metric per maturity bin (bin = floor(maturity / step))
                    maturity_bin = (rating_group_df[maturity_column] // maturity_step).astype(int)
                    avg_metric = rating_group_df.groupby(maturity_bin)[metric].mean()

                    # Estimate the Nelson-Siegel parameters when enough bins exist
                    if len(avg_metric) >= min_num_of_data:
                        maturity_values = np.array(avg_metric.index) * maturity_step
                        try:
                            popt, _ = curve_fit(nelson_siegel_model, maturity_values, avg_metric, p0=[0, 0, 0, 1], maxfev=10000)
                        except RuntimeError:
                            # The optimiser did not converge: say so on the panel,
                            # but still draw the observations below.
                            ax.text(0.5, 0.5, 'Fit failed', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
                        else:
                            # The result must not be named `curve_fit`: that would make
                            # the name local and break the call above.
                            t_fit = np.linspace(0, maturity_values.max(), 100)
                            fitted_values = nelson_siegel_model(t_fit, *popt)
                            ax.plot(t_fit, fitted_values, label='Fitted Curve')

                    # Scatter of the raw observations, coloured by flag value
                    for flag_value in rating_group_df[flag_column].unique():
                        flag_df = rating_group_df[rating_group_df[flag_column] == flag_value]
                        color = 'red' if flag_value else 'blue'
                        ax.scatter(flag_df[maturity_column], flag_df[metric], color=color, label=f'Flag {flag_value}')

                    ax.set_title(f'Rating Group {i}')
                    # Only build a legend when something labelled was drawn
                    handles, _labels = ax.get_legend_handles_labels()
                    if handles:
                        ax.legend()

                fig.tight_layout()
                pdf.savefig(fig)
                plt.close(fig)


In [71]:
# Nelson-Siegel model function
def nelson_siegel(t, beta0, beta1, beta2, tau):
    """Evaluate the Nelson-Siegel curve at maturity ``t``.

    The curve is ``beta0 + beta1 * L(x) + beta2 * (L(x) - exp(-x))`` with
    ``x = t / tau`` and ``L(x) = (1 - exp(-x)) / x``.

    Args:
        t: Maturity or maturities (scalar or array-like supporting arithmetic).
        beta0: Level parameter.
        beta1: Slope parameter.
        beta2: Curvature parameter.
        tau: Decay parameter.

    Returns:
        The model value(s) at ``t``. At ``t == 0`` the analytic limit
        ``beta0 + beta1`` is returned instead of NaN.
    """
    x = t / tau
    decay = np.exp(-x)
    # (1 - exp(-x)) / x is 0/0 at x == 0; its limit there is 1.
    with np.errstate(divide='ignore', invalid='ignore'):
        slope_loading = np.where(x == 0, 1.0, (1 - decay) / x)
    return beta0 + beta1 * slope_loading + beta2 * (slope_loading - decay)

# Error function
def ns_error(params, t, observed_yields):
    """Return the sum of squared residuals of the Nelson-Siegel curve.

    Args:
        params: Sequence of exactly four values ``(beta0, beta1, beta2, tau)``.
        t: Maturities at which the model is evaluated.
        observed_yields: Observed values, aligned with ``t``.

    Returns:
        Sum over all points of ``(observed - predicted) ** 2``.
    """
    level, slope, curvature, decay = params
    residuals = observed_yields - nelson_siegel(t, level, slope, curvature, decay)
    return np.sum(residuals ** 2)

# Fit Nelson-Siegel parameters with a global optimiser and plot the result
def plot_data_and_fit_ns_model_optimized(df, metric, date_column, sector_column, rating_column, maturity_column, rating_split, maturity_step, min_num_of_data, flag_column):
    """Fit Nelson-Siegel curves with ``basinhopping`` and write the plots to PDF.

    For every date in ``date_column`` a file ``<YYYY-MM-DD>_<metric>_split.pdf``
    is written with one page per sector. Each page stacks ``rating_split``
    panels, one per rating quantile bucket. A panel shows the raw observations
    coloured by ``flag_column`` and, when enough maturity bins are available,
    the curve obtained by minimising ``ns_error`` over the per-bin averages.

    Sectors whose rating column is not numeric (after dropping missing
    ratings) are skipped.

    Args:
        df : pd.DataFrame
            Input data.
        metric : str
            Target variable (y axis).
        date_column : str
            Date column; its values must provide ``strftime``.
        sector_column : str
            Sector column.
        rating_column : str
            Credit-rating column (passed to ``pd.qcut``).
        maturity_column : str
            Remaining-maturity column.
        rating_split : int
            Number of quantile buckets for the credit rating.
        maturity_step : int
            Bin width for the remaining maturity.
        min_num_of_data : int
            Minimum number of maturity bins required to attempt a fit.
        flag_column : str
            Flag column used to colour the scatter (truthy -> red, else blue).
    """

    for date, date_group in df.groupby(date_column):
        formatted_date = date.strftime('%Y-%m-%d')
        with PdfPages(f'{formatted_date}_{metric}_split.pdf') as pdf:
            for sector in date_group[sector_column].unique():
                sector_df = date_group[date_group[sector_column] == sector]

                # Rows without a rating cannot be bucketed
                sector_df = sector_df.dropna(subset=[rating_column])
                if not np.issubdtype(sector_df[rating_column].dtype, np.number):
                    continue

                # Copy so the helper column is not written onto a slice of `df`
                sector_df = sector_df.copy()
                sector_df['RatingGroup'] = pd.qcut(sector_df[rating_column], rating_split, labels=False, duplicates='drop')

                # squeeze=False keeps `axes` indexable even when rating_split == 1
                fig, axes = plt.subplots(rating_split, 1, figsize=(10, 6 * rating_split), squeeze=False)
                axes = axes.ravel()
                fig.suptitle(f'{sector} - {formatted_date}')

                for i in range(rating_split):
                    ax = axes[i]
                    rating_group_df = sector_df[sector_df['RatingGroup'] == i]
                    # Average of the metric per maturity bin (bin = floor(maturity / step))
                    maturity_bin = (rating_group_df[maturity_column] // maturity_step).astype(int)
                    avg_metric = rating_group_df.groupby(maturity_bin)[metric].mean()

                    if len(avg_metric) >= min_num_of_data:
                        maturity_values = np.array(avg_metric.index) * maturity_step
                        initial_params = [0.1, 0.1, 0.1, 1]
                        try:
                            result = basinhopping(
                                ns_error,
                                initial_params,
                                minimizer_kwargs={"args": (maturity_values, avg_metric),
                                                  "method": 'Nelder-Mead'},
                            )
                            # Treat a non-finite optimum as a failed fit rather than
                            # drawing a meaningless curve.
                            if not np.all(np.isfinite(result.x)):
                                raise RuntimeError('non-finite Nelson-Siegel parameters')
                        except RuntimeError:
                            # Say so on the panel, but still draw the observations below
                            ax.text(0.5, 0.5, 'Fit failed', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
                        else:
                            # Draw with the same model function that ns_error minimised
                            t_fit = np.linspace(0, maturity_values.max(), 100)
                            ax.plot(t_fit, nelson_siegel(t_fit, *result.x), label='Fitted Curve')

                    # Scatter of the raw observations, coloured by flag value
                    for flag_value in rating_group_df[flag_column].unique():
                        flag_df = rating_group_df[rating_group_df[flag_column] == flag_value]
                        color = 'red' if flag_value else 'blue'
                        ax.scatter(flag_df[maturity_column], flag_df[metric], color=color, label=f'Flag {flag_value}')

                    ax.set_title(f'Rating Group {i}')
                    # Only build a legend when something labelled was drawn
                    handles, _labels = ax.get_legend_handles_labels()
                    if handles:
                        ax.legend()

                fig.tight_layout()
                pdf.savefig(fig)
                plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from matplotlib.backends.backend_pdf import PdfPages

# Fit T-Spread against remaining maturity per date / industry / rating bucket
plot_data_and_fit_ns_model_optimized(
    df,
    metric='T-Spread',
    date_column='日付',
    sector_column='業種',
    rating_column='信用格付け',
    maturity_column='残存年数',
    rating_split=4,
    maturity_step=2,
    min_num_of_data=20,
    flag_column="merged_green_flag",
)

In [None]:
from scipy.optimize import minimize

# Objective function (least squares)
def objective_function(coeffs, x, y):
    """Return the sum of squared errors of a polynomial against ``y``.

    Args:
        coeffs: Polynomial coefficients, highest power first.
        x: Points at which the polynomial is evaluated.
        y: Target values, aligned with ``x``.

    Returns:
        Sum over all points of ``(p(x) - y) ** 2``.
    """
    residuals = np.polyval(coeffs, x) - y
    return np.sum(residuals ** 2)

# Polynomial fit (parameters estimated with scipy.optimize.minimize) and plotting
def plot_data_and_fit_polynomial_with_minimize(df, metric, date_column, sector_column, rating_column, maturity_column, flag_column, rating_split, maturity_step, polynomial_degree, max_sectors=2, max_rating_groups=2):
    """Fit a polynomial per date / sector / rating bucket and save the plots.

    All pages go into ``output_polynomial_fit_minimize.pdf``, one page per
    (date, sector). Each page has one panel per rating bucket showing the raw
    observations coloured by ``flag_column``, the per-bin averages of
    ``metric`` and, when there are more bins than ``polynomial_degree``, the
    polynomial obtained by minimising ``objective_function`` with SLSQP.

    Args:
        df: Input DataFrame.
        metric: Target variable (y axis).
        date_column: Column to group pages by.
        sector_column: Sector column.
        rating_column: Credit-rating column (passed to ``pd.qcut``).
        maturity_column: Remaining-maturity column.
        flag_column: Column used as scatter hue.
        rating_split: Number of quantile buckets for the credit rating.
        maturity_step: Bin width for the remaining maturity.
        polynomial_degree: Degree of the fitted polynomial.
        max_sectors: Only the first ``max_sectors`` sectors of each date are
            plotted (default 2, the original memory-saving limit); ``None``
            plots all of them.
        max_rating_groups: At most this many rating buckets are plotted
            (default 2); ``None`` plots all ``rating_split`` buckets.
    """
    n_panels = rating_split if max_rating_groups is None else min(rating_split, max_rating_groups)

    with PdfPages('output_polynomial_fit_minimize.pdf') as pdf:
        for date, date_group in df.groupby(date_column):
            sectors = date_group[sector_column].unique()
            if max_sectors is not None:
                sectors = sectors[:max_sectors]  # limit the number of sectors to bound memory use

            for sector in sectors:
                # Copy so the helper column is not written onto a slice of `df`
                sector_df = date_group[date_group[sector_column] == sector].copy()
                sector_df['RatingGroup'] = pd.qcut(sector_df[rating_column], rating_split, labels=False, duplicates='drop')

                # squeeze=False keeps `axes` indexable even with a single panel;
                # the height matches the original (10, 12) for up to two panels.
                fig, axes = plt.subplots(n_panels, 1, figsize=(10, max(12, 6 * n_panels)), squeeze=False)
                axes = axes.ravel()
                fig.suptitle(f'{sector} - {date}')

                for i in range(n_panels):
                    ax = axes[i]
                    rating_group_df = sector_df[sector_df['RatingGroup'] == i]
                    # Average of the metric per maturity bin (bin = floor(maturity / step))
                    maturity_bin = (rating_group_df[maturity_column] // maturity_step).astype(int)
                    avg_metric = rating_group_df.groupby(maturity_bin)[metric].mean()

                    # A degree-d polynomial needs more than d points
                    if len(avg_metric) > polynomial_degree:
                        maturity_values = np.array(avg_metric.index) * maturity_step
                        initial_guess = np.zeros(polynomial_degree + 1)
                        result = minimize(objective_function, initial_guess, args=(maturity_values, avg_metric), method='SLSQP')
                        coefficients = result.x

                        t_fit = np.linspace(0, maturity_values.max(), 100)
                        fitted_values = np.polyval(coefficients, t_fit)
                        ax.plot(t_fit, fitted_values, label='Fitted Polynomial Curve')

                    sns.scatterplot(x=maturity_column, y=metric,
                                    hue=flag_column, palette=['blue', 'red'],
                                    data=rating_group_df, ax=ax, legend=False)
                    ax.scatter(avg_metric.index * maturity_step, avg_metric, color='green', label='Average')
                    ax.set_title(f'Rating Group {i}')
                    ax.legend()

                fig.tight_layout()
                pdf.savefig(fig)
                plt.close(fig)

# Test run. The date column of `df` is named '日付'; there is no 'date' column.
plot_data_and_fit_polynomial_with_minimize(df, 'T-Spread', '日付', '業種', '信用格付け', '残存年数', 'merged_green_flag', 4, 1, 3)


In [None]:
def plot_return_distribution_with_fixed_yaxis(df, pdf_filename, y_min, y_max):
    """
    Create box plots of the return distribution by industry, split by flag
    status, with a fixed y-axis range, and write them to a PDF file.

    Rows are grouped by calendar month of '日付'; each month becomes one page
    with two side-by-side panels (flag True on the left, flag False on the
    right). The input DataFrame is not modified.

    :param df: DataFrame with columns '業種', 'リターン', 'フラグ', and '日付'
    :param pdf_filename: The name of the PDF file to save the plots to
    :param y_min: The minimum value for the y-axis
    :param y_max: The maximum value for the y-axis
    """
    # Derive the month key without writing helper columns into the caller's frame
    months = pd.to_datetime(df['日付']).dt.to_period('M')

    # Create a PDF file
    with PdfPages(pdf_filename) as pdf:
        # One page per month
        for date in months.unique():
            # Filter data for the current month
            df_date = df[months == date]

            # Create a figure with two subplots (for flag True and False)
            fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
            fig.suptitle(f'Return Distribution by Industry for {date}', fontsize=16)

            # Plot boxplots for each flag status
            for i, flag_status in enumerate([True, False]):
                ax = axes[i]
                subset = df_date[df_date['フラグ'] == flag_status]
                # Leave the panel empty when no row has this flag status
                if not subset.empty:
                    sns.boxplot(x='業種', y='リターン', data=subset, ax=ax)
                ax.set_title(f'Flag Status: {"あり" if flag_status else "なし"}')
                ax.set_xlabel('業種')
                ax.set_ylabel('リターン')

                # Rotate x-axis labels and set font size for flag=True plot
                if flag_status:
                    ax.tick_params(axis='x', rotation=45, labelsize=8)

                # Set y-axis limits
                ax.set_ylim(y_min, y_max)

            # Leave room for the suptitle, then save the page
            fig.tight_layout(rect=[0, 0.03, 1, 0.95])
            pdf.savefig(fig)
            plt.close(fig)

# This function also assumes the dataframe `df` has the necessary structure and data.
# Replace 'pdf_filename', 'y_min', and 'y_max' with the desired values when calling this function.


In [None]:
def create_sector_boxplots_by_date_seaborn_corrected(df):
    """
    Create one PDF file per calendar date in the dataframe using Seaborn.

    The file is named ``<date>.pdf``. Each page holds a single box plot of
    'リターン' for one sector on that date, with the flag status ('フラグ') on
    the x-axis. The input DataFrame is not modified.

    :param df: DataFrame with columns '業種', 'リターン', 'フラグ', and '日付'
    """
    # Derive the calendar date without overwriting the caller's column
    dates = pd.to_datetime(df['日付']).dt.date

    # Iterate over each unique date
    for date in dates.unique():
        # Filter data for the current date
        df_date = df[dates == date]

        # Create a PDF file for the current date
        pdf_filename = f"{date}.pdf"
        with PdfPages(pdf_filename) as pdf:
            # Iterate over each unique sector
            for sector in df_date['業種'].unique():
                # Filter data for the current sector
                df_sector = df_date[df_date['業種'] == sector]

                # One figure (= one page) per sector
                fig, ax = plt.subplots()
                sns.boxplot(x='フラグ', y='リターン', data=df_sector, ax=ax)
                ax.set_title(f'Sector: {sector} - Date: {date}')
                ax.set_xlabel('Flag Status')
                ax.set_ylabel('Return')

                # Save the current figure to the PDF and close it
                pdf.savefig(fig)
                plt.close(fig)

# この関数は、適切なデータフレーム構造を持つデータフレーム `df` を引数として呼び出されることを想定しています。
# 例: create_sector_boxplots_by_date_seaborn_corrected(df)


In [None]:
import pandas as pd

# Divide each row's return by the per-ISIN volatility (standard deviation) of the T-Spread
def calculate_volatility_ratio(df, isin_column, return_column, t_spread_column):
    """Add a 'Volatility_Ratio' column: return divided by per-ISIN spread volatility.

    The volatility of an ISIN is the sample standard deviation of
    ``t_spread_column`` over all of its rows.

    Args:
        df: Input DataFrame. It is modified in place (the new column is added)
            and also returned.
        isin_column: Name of the column identifying the instrument.
        return_column: Name of the column holding the return (numerator).
        t_spread_column: Name of the column whose per-ISIN standard deviation
            is the denominator.

    Returns:
        The same DataFrame object with the extra column 'Volatility_Ratio'.
        The ratio is NaN for ISINs with a single observation (undefined
        standard deviation) and for rows whose ISIN is missing.
    """
    # Standard deviation (volatility) of the T-Spread per ISIN
    volatility = df.groupby(isin_column)[t_spread_column].std()

    # Vectorised lookup: map each row's ISIN to its volatility, then divide.
    # Unlike a per-row `volatility[isin]`, unknown or missing keys give NaN.
    df['Volatility_Ratio'] = df[return_column] / df[isin_column].map(volatility)

    return df

# Build a small throwaway DataFrame to exercise the function
test_data = dict(zip(
    ['ISIN', 'return', 'T-SP'],
    [
        ['ISIN1', 'ISIN1', 'ISIN2', 'ISIN2', 'ISIN3'],
        [0.05, 0.06, 0.07, 0.08, 0.09],
        [100, 110, 120, 130, 140],
    ],
))
test_df = pd.DataFrame(test_data)

# Run the function and display the result
result_df = calculate_volatility_ratio(
    test_df,
    isin_column='ISIN',
    return_column='return',
    t_spread_column='T-SP',
)
result_df
