In [2]:
import os
import catboost as cb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning
from scipy.stats import randint, uniform
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import logging

# LightGBMの特定の警告を抑制
logging.getLogger("lightgbm").setLevel(logging.ERROR)

# 特定の警告を無視
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='lightgbm')

# インタラクティブモードをオフにする
import matplotlib
matplotlib.use('Agg')

def preprocess_data(file_path, target_column_name='Market Cap'):
    # データの読み込み
    df = pd.read_csv(file_path)
    
    print(f"Columns in the dataset: {df.columns.tolist()}")
    
    # 数値列と文字列列の分離
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    categorical_columns = df.select_dtypes(include=[object]).columns
    
    # 数値列の対数変換
    for col in numeric_columns:
        if df[col].min() > 0:  # 値が全て正の場合のみ対数変換を適用
            df[col] = np.log1p(df[col])
    
    # 外れ値の処理
    for col in numeric_columns:
        if col != target_column_name:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df[col] = df[col].clip(lower_bound, upper_bound)
    
    # 数値列の欠損値補完
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())
    
    if target_column_name not in df.columns:
        raise ValueError(f"Target column '{target_column_name}' not found in the dataset.")
    
    X = df.drop(target_column_name, axis=1)
    y = df[target_column_name]
    
    # 相互情報量に基づく特徴量重要度の計算
    mi_scores = mutual_info_regression(X.select_dtypes(include=[np.number]), y)
    mi_scores = pd.Series(mi_scores, index=X.select_dtypes(include=[np.number]).columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    
    # 上位の特徴量を選択
    top_features = mi_scores.head(5).index.tolist()
    X = X[top_features + list(categorical_columns)]
    
    return X, y, top_features, categorical_columns

def create_preprocessor(numeric_features, categorical_features):
    # 数値特徴量の前処理パイプライン
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False))
    ])
    
    # 文字列特徴量の前処理
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    
    # 前処理パイプラインの定義
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    return preprocessor

def visualize_data(df, numeric_features, categorical_features):
    # 数値特徴量のヒストグラム
    for feature in numeric_features:
        plt.figure(figsize=(8, 6))
        sns.histplot(df[feature].dropna(), kde=True)
        plt.title(f'Histogram of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Frequency')
        plt.savefig(f'histogram_{feature}.png')
        plt.close()
    
    # カテゴリ特徴量の棒グラフ
    for feature in categorical_features:
        plt.figure(figsize=(8, 6))
        sns.countplot(x=feature, data=df)
        plt.title(f'Count plot of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.savefig(f'countplot_{feature}.png')
        plt.close()

def random_forest_model():
    return RandomForestRegressor(random_state=42)

def gradient_boosting_model():
    return GradientBoostingRegressor(random_state=42)

def neural_network_model():
    return MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=2000, random_state=42)

def svr_model():
    return SVR()

def xgboost_model():
    return xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

def lightgbm_model():
    return lgb.LGBMRegressor(random_state=42)

def catboost_model():
    return cb.CatBoostRegressor(random_state=42, verbose=0)

def deep_neural_network_model(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

def ensemble_prediction(models, X_test):
    predictions = []
    for model in models:
        if isinstance(model, Sequential):
            pred = model.predict(X_test).flatten()
        else:
            pred = model.predict(X_test)
        predictions.append(pred)
    return np.mean(predictions, axis=0)

def evaluate_model_extended(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mse, mae, r2, mape

def plot_actual_vs_predicted(y_true, y_pred):
    plt.figure(figsize=(10, 6))
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted')
    plt.tight_layout()
    plt.savefig('actual_vs_predicted.png')
    plt.close()

def perform_cross_validation(model, X, y, cv=5):
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    mse_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    return -mse_scores, r2_scores

def plot_residuals(y_true, y_pred):
    residuals = y_true - y_pred
    plt.figure(figsize=(10, 6))
    plt.scatter(y_pred, residuals, alpha=0.5)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.axhline(y=0, color='r', linestyle='--')
    plt.tight_layout()
    plt.savefig('residuals.png')
    plt.close()

def plot_feature_importance(model, feature_names):
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        
        plt.figure(figsize=(10, 6))
        plt.title("Feature Importance")
        plt.bar(range(len(importances)), importances[indices])
        plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()

def main():
    # データの読み込み
    file_path = 'asx_company_financials_extended.csv'
    X, y, numeric_features, categorical_features = preprocess_data(file_path)
    
    # データの可視化
    visualize_data(pd.read_csv(file_path), numeric_features, categorical_features)
    
    # データの分割
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 前処理パイプラインの作成
    preprocessor = create_preprocessor(numeric_features, categorical_features)
    
    # モデルの定義とパイプラインの作成
    rf_model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', random_forest_model())
    ])
    
    gb_model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', gradient_boosting_model())
    ])
    
    nn_model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', neural_network_model())
    ])
    
    svr_model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', svr_model())
    ])
    
    xgb_model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', xgboost_model())
    ])
    
    lgb_model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', lightgbm_model())
    ])
    
    cb_model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', catboost_model())
    ])
    
    # LightGBMのハイパーパラメータチューニング
    param_grid = {
        'regressor__num_leaves': randint(20, 50),
        'regressor__min_data_in_leaf': randint(20, 100),
        'regressor__learning_rate': uniform(0.01, 0.1),
        'regressor__feature_fraction': uniform(0.7, 0.3),
        'regressor__bagging_fraction': uniform(0.7, 0.3),
        'regressor__bagging_freq': randint(1, 10)
    }
    random_search = RandomizedSearchCV(lgb_model_pipeline, param_grid, n_iter=100, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)
    best_lgb_model = random_search.best_estimator_
        
    # モデルの学習と評価
    models = [rf_model, gb_model, nn_model, svr_model_pipeline, xgb_model_pipeline, best_lgb_model, cb_model_pipeline]
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse, mae, r2, mape = evaluate_model_extended(y_test, y_pred)
        print(f"{model.named_steps['regressor'].__class__.__name__} Evaluation: MSE={mse}, MAE={mae}, R2={r2}, MAPE={mape}%")
    
    # アンサンブル予測
    y_pred_ensemble = ensemble_prediction(models, X_test)
    
    # アンサンブルモデルの評価
    mse, mae, r2, mape = evaluate_model_extended(y_test, y_pred_ensemble)
    print(f"Ensemble Model Evaluation: MSE={mse}, MAE={mae}, R2={r2}, MAPE={mape}%")
    
    # 実際値vs予測値のプロット
    plot_actual_vs_predicted(y_test, y_pred_ensemble)
    
    # 交差検証 (Random Forestモデルを使用)
    mse_scores, r2_scores = perform_cross_validation(rf_model, X, y)
    print(f"Cross-Validation MSE: {mse_scores.mean()} (+/- {mse_scores.std() * 2})")
    print(f"Cross-Validation R-squared: {r2_scores.mean()} (+/- {r2_scores.std() * 2})")
    
    # 残差プロット
    plot_residuals(y_test, y_pred_ensemble)
    
    # 特徴量名の取得
    preprocessor.fit(X_train)
    feature_names = preprocessor.get_feature_names_out()
    plot_feature_importance(rf_model.named_steps['regressor'], feature_names)
    
    print("Plots have been saved. Please check the saved files.")

if __name__ == "__main__":
    main()

Columns in the dataset: ['Ticker', 'Name', 'Sector', 'Industry', 'Market Cap', 'Revenue', 'Net Income', 'PE Ratio', 'Dividend Yield', '52 Week High', '52 Week Low']
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 0
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 0
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 0
[LightGBM] [Info] Start training from score 17.679006
[LightGBM] [Info] Start training from score 17.867110
[LightGBM] [Info] Start training from score 17.516050


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 64, number of used features: 0
[LightGBM] [Info] Start training from score 17.757781






















[LightGBM] [Info] Total Bins 0


[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Nu