## ライブラリのインポート・前処理

In [1]:
# ライブラリのインポート
# # 基本ライブラリ
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# モデル構築のためのライブラリ
import lightgbm as lgb
import xgboost as xgb
import optuna
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

# # pandasのカラムが100列まで見れるようにする
pd.set_option("display.max_columns", 100)

In [2]:
# データの読み込み
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [None]:
# 目的変数`SalePrice`のカラムを取り出しておく
y_train = df_train["SalePrice"]

In [5]:
# trainデータとtestデータを結合する
df_all = pd.concat([df_train.drop(columns="SalePrice"), df_test])
display(df_all.head())
display(df_all.isnull().sum().sum())

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal


15707

In [None]:
### 欠損値除去
df_all = df_all.drop(columns=["Alley"])
df_all = df_all.drop(columns=["PoolQC"])
df_all = df_all.drop(columns=["Fence"])
df_all = df_all.drop(columns=["MiscFeature"])

In [None]:
# データの前処理をまとめた関数
def data_pre(df):
    # 欠損値処理

    ### 試し
    ###  欠損値が90％以上のカラムを除去
    # df = df.drop(columns=["Alley", "PoolQC", "Fence", "MiscFeature"])

    # 文字列の変数の欠損は「'None'」、数字の変数の欠損は「0」で埋める
    for colum in df.columns:
        # object型の場合
        if df[colum].dtype == "object":
            df[colum].fillna("None", inplace=True)
        # 数値型の場合
        else:
            df[colum].fillna(0, inplace=True)

    ## 加えたら精度が落ちたため、コメントアウト
    # 特徴量エンジニアリング
    # TotalSF(総面積)を新たな特徴量として加える
    # df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

    # 数値型のカテゴリ変数をObject型のカテゴリ変数に変換
    df["MSSubClass"] = df["MSSubClass"].astype("category")
    df["YrSold"] = df["YrSold"].astype("category")
    df["MoSold"] = df["MoSold"].astype("category")
    df["OverallCond"] = df["OverallCond"].astype("category")

    # Object型をダミー変数化
    df = pd.get_dummies(df)

    return df

In [None]:
# データの前処理を実行
df_all = data_pre(df_all)

In [None]:
# Object型のカラムを取り出す
object_columns = df_all.select_dtypes(include=object).columns
object_columns.values

array([], dtype=object)

In [None]:
### 精度が落ちた
# ラベルエンコーディング
# from sklearn.preprocessing import LabelEncoder

# # object_columnsのカラムをラベルエンコーディング
# for column in object_columns:
#     le = LabelEncoder()
#     le.fit(df_all[column])
#     df_all[column] = le.transform(df_all[column])

In [None]:
# # object型のカラムを取り出す
# object_columns = df_all.select_dtypes(include=object).columns
# object_columns.values

In [None]:
# trainデータとtestデータに分割
# trainデータ
df_train = df_all[: len(df_train)]
df_train["SalePrice"] = y_train  # 目的変数を戻す
df_train["SalePriceLog"] = np.log(df_train["SalePrice"])  # SalePriceを対数変換

# testデータ
df_test = df_all[len(df_train) :]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["SalePrice"] = y_train  # 目的変数を戻す
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["SalePriceLog"] = np.log(df_train["SalePrice"])  # SalePriceを対数変換


In [None]:
display(df_train.head())

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),...,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_None,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,MoSold_1,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_None,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice,SalePriceLog
0,1,65.0,8450,7,2003,2003,196.0,706.0,0.0,150.0,856.0,856,854,0,1710,1.0,0.0,2,1,3,1,8,0,2003.0,2.0,548.0,0,61,0,0,0,0,0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,208500,12.247694
1,2,80.0,9600,6,1976,1976,0.0,978.0,0.0,284.0,1262.0,1262,0,0,1262,0.0,1.0,2,0,3,1,6,1,1976.0,2.0,460.0,298,0,0,0,0,0,0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,181500,12.109011
2,3,68.0,11250,7,2001,2002,162.0,486.0,0.0,434.0,920.0,920,866,0,1786,1.0,0.0,2,1,3,1,6,1,2001.0,2.0,608.0,0,42,0,0,0,0,0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,223500,12.317167
3,4,60.0,9550,7,1915,1970,0.0,216.0,0.0,540.0,756.0,961,756,0,1717,1.0,0.0,1,0,3,1,7,1,1998.0,3.0,642.0,0,35,272,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,140000,11.849398
4,5,84.0,14260,8,2000,2000,350.0,655.0,0.0,490.0,1145.0,1145,1053,0,2198,1.0,0.0,2,1,4,1,9,1,2000.0,3.0,836.0,192,84,0,0,0,0,0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,250000,12.429216


In [None]:
### 精度向上
# GrLivAreaの2件の外れ値を除去
df_train = df_train.drop(
    df_train[(df_train["GrLivArea"] > 4000) & (df_train["SalePrice"] < 300000)].index
)

In [None]:
# 標準化用にデータをコピーしておく
df_train_std = df_train.copy()
df_test_std = df_test.copy()

## 基本的なモデル構築

In [None]:
# 説明変数と目的変数を定義
x_train = df_train.drop(["SalePrice", "Id", "SalePriceLog"], axis=1)
y_train = df_train["SalePriceLog"]
id_train = df_train["Id"]


x_test = df_test.drop(["Id"], axis=1)
id_test = df_test["Id"]
print(x_test.shape, id_test.shape)

(1459, 331) (1459,)


In [None]:
# LightGBMのパラメータ
params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 16,
    "n_estimators": 100000,
    "random_state": 123,
    "importance_type": "gain",
    "verbose": -1,  # 追加 # ログを非表示
}


# モデル学習と評価の処理を関数化
def lgb_train_cv(input_x, input_y, input_id, params, n_splits=5):
    scores = []  # rmseのスコアを格納するリスト
    val_preds = np.zeros(len(input_x))  # 予測値を格納するリスト
    imp = pd.DataFrame()  # 特徴量の重要度を格納するdf

    # データを学習用と評価用に分割
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv = list(kf.split(input_x, input_y))

    # 交差検証法でモデル構築
    for nfold in range(n_splits):
        tr_idx, val_idx = cv[nfold][0], cv[nfold][1]
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # モデル学習
        model = lgb.LGBMRegressor(**params)
        model.fit(
            x_tr,
            y_tr,
            eval_set=[(x_tr, y_tr), (x_val, y_val)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=True),
                lgb.log_evaluation(100),
            ],
        )

        # モデルで予測
        y_tr_preds = model.predict(x_tr)
        y_val_preds = model.predict(x_val)
        # 精度(正解率)の確認
        metric_tr = round(rmse(y_tr, y_tr_preds), 5)
        metric_val = round(rmse(y_val, y_val_preds), 5)
        print("[RMSE] tr: {:.5f}, val: {:.5f}".format(metric_tr, metric_val))
        scores.append([nfold, metric_tr, metric_val])  # 結果を格納
        # 検証データの予測値を該当のIDの場所に格納
        val_preds[val_idx] = y_val_preds

        # 特徴量の重要度を確認
        _imp = pd.DataFrame(
            {"col": x_train.columns, "imp": model.feature_importances_, "nfold": nfold}
        )
        # print(_imp)
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

    scores = np.array(scores)

    imp = imp.groupby("col")["imp"].agg(["mean", "std"])
    imp.columns = ["imp_mean", "imp_std"]
    imp = imp.reset_index(drop=False)

    # 予測
    y_test_preds = model.predict(x_test)

    return imp, scores, y_test_preds, val_preds
    # return scores, imp

In [None]:
lgb_imp, lgb_scores, lgb_test_preds, lgb_val_preds = lgb_train_cv(
    x_train, y_train, id_train, params, n_splits=5
)

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.0806415	valid_1's rmse: 0.137178
[200]	training's rmse: 0.0584338	valid_1's rmse: 0.135258
[300]	training's rmse: 0.0454131	valid_1's rmse: 0.135247
[400]	training's rmse: 0.0363593	valid_1's rmse: 0.133946
[500]	training's rmse: 0.0295184	valid_1's rmse: 0.133722
Early stopping, best iteration is:
[453]	training's rmse: 0.0325871	valid_1's rmse: 0.133546
[RMSE] tr: 0.03259, val: 0.13355
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.08137	valid_1's rmse: 0.125682
[200]	training's rmse: 0.0589477	valid_1's rmse: 0.120174
[300]	training's rmse: 0.046028	valid_1's rmse: 0.119298
[400]	training's rmse: 0.0369408	valid_1's rmse: 0.119205
Early stopping, best iteration is:
[370]	training's rmse: 0.0393681	valid_1's rmse: 0.11893
[RMSE] tr: 0.03937, val: 0.11893
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.0799597	valid_1's r

In [None]:
print("===== LightGBM =====")
print("\nRMSE")
print(lgb_scores)
print("\nRMSEの平均値")
print(round(np.mean(lgb_scores[:, 2]), 5))

# 過学習傾向がある

===== LightGBM =====

RMSE
[[0.      0.03259 0.13355]
 [1.      0.03937 0.11893]
 [2.      0.04229 0.13635]
 [3.      0.05546 0.12898]
 [4.      0.04922 0.11658]]

RMSEの平均値
0.12688


In [None]:
# print("\n特徴量重要度")
df_importance = pd.DataFrame(lgb_imp.sort_values("imp_mean", ascending=False))
df_importance

Unnamed: 0,col,imp_mean,imp_std
280,OverallQual,962.591416,71.454962
159,GrLivArea,255.851956,15.127610
319,TotalBsmtSF,83.385279,14.168684
134,GarageCars,69.691070,22.466324
19,BsmtFinSF1,51.101577,6.142993
...,...,...,...
213,MSSubClass_45,0.000000,0.000000
61,Condition2_RRNn,0.000000,0.000000
217,MSSubClass_75,0.000000,0.000000
56,Condition2_Norm,0.000000,0.000000


In [None]:
# XGBoostのパラメータ
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.05,
    "n_estimators": 100000,
    "max_depth": 4,
    "random_state": 123,
}


# モデル学習と評価の処理を関数化
def xgboost_train_cv(input_x, input_y, input_id, params, n_splits=5):
    scores = []  # rmseのスコアを格納するリスト
    val_preds = np.zeros(len(input_x))  # 予測値を格納するリスト
    imp = pd.DataFrame()  # 特徴量の重要度を格納するdf

    # データを学習用と評価用に分割
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv = list(kf.split(input_x, input_y))

    # 交差検証法でモデル構築
    for nfold in range(n_splits):
        tr_idx, val_idx = cv[nfold][0], cv[nfold][1]
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # モデル学習
        model = xgb.XGBRegressor(**params)
        model.fit(
            x_tr,
            y_tr,
            eval_set=[(x_tr, y_tr), (x_val, y_val)],
            verbose=False,
        )

        # モデルで予測
        y_tr_preds = model.predict(x_tr)
        y_val_preds = model.predict(x_val)
        # 精度(正解率)の確認
        metric_tr = round(rmse(y_tr, y_tr_preds), 5)
        metric_val = round(rmse(y_val, y_val_preds), 5)
        print("[RMSE] tr: {:.5f}, val: {:.5f}".format(metric_tr, metric_val))
        scores.append([nfold, metric_tr, metric_val])  # 結果を格納
        # 検証データの予測値を該当のIDの場所に格納
        val_preds[val_idx] = y_val_preds

        # 特徴量の重要度を確認
        _imp = pd.DataFrame(
            {"col": x_train.columns, "imp": model.feature_importances_, "nfold": nfold}
        )
        # print(_imp)
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

    scores = np.array(scores)

    imp = imp.groupby("col")["imp"].agg(["mean", "std"])
    imp.columns = ["imp_mean", "imp_std"]
    imp = imp.reset_index(drop=False)

    # 予測
    y_test_preds = model.predict(x_test)

    return imp, scores, y_test_preds, val_preds

In [None]:
# print("===== XGBoost =====")
# xgb_imp, xgb_scores, xgb_test_preds, xgb_val_preds = xgboost_train_cv(
#     x_train, y_train, id_train, params, n_splits=5
# )
# print("\nRMSE")
# print(xgb_scores)

In [None]:
# ラッソ回帰で同様に交差検証法でモデル構築
def lasso_train_cv(input_x, input_y, input_id, n_splits=5):
    scores = []  # rmseのスコアを格納するリスト
    val_preds = np.zeros(len(input_x))  # 予測値を格納するリスト
    imp = pd.DataFrame()  # 特徴量の重要度を格納するdf

    # データを学習用と評価用に分割
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv = list(kf.split(input_x, input_y))

    # 交差検証法でモデル構築
    for nfold in range(n_splits):
        tr_idx, val_idx = cv[nfold][0], cv[nfold][1]
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # モデル学習)
        model = Lasso(alpha=0.0005, random_state=1)
        model.fit(x_tr, y_tr)

        # モデルで予測
        y_tr_preds = model.predict(x_tr)
        y_val_preds = model.predict(x_val)
        # 精度(正解率)の確認
        metric_tr = round(rmse(y_tr, y_tr_preds), 5)
        metric_val = round(rmse(y_val, y_val_preds), 5)
        print("[RMSE] tr: {:.5f}, val: {:.5f}".format(metric_tr, metric_val))
        scores.append([nfold, metric_tr, metric_val])
        # 検証データの予測値を該当のIDの場所に格納
        val_preds[val_idx] = y_val_preds

    scores = np.array(scores)

    # 予測
    y_test_preds = model.predict(x_test)

    return scores, y_test_preds, val_preds

In [None]:
print("===== Lasso =====")
lasso_scores, lasso_test_preds, lasso_val_preds = lasso_train_cv(
    x_train, y_train, id_train
)
print("\nRMSEの平均値")
print(round(np.mean(lasso_scores[:, 2]), 5))

===== Lasso =====
[RMSE] tr: 0.09607, val: 0.11783
[RMSE] tr: 0.09698, val: 0.10854
[RMSE] tr: 0.09512, val: 0.11727
[RMSE] tr: 0.09502, val: 0.11998
[RMSE] tr: 0.09908, val: 0.10178

RMSEの平均値
0.11308


In [None]:
## リッジ回帰で同様に交差検証法でモデル構築
def Ridge_train_cv(input_x, input_y, input_id, n_splits=5):
    scores = []  # rmseのスコアを格納するリスト
    val_preds = np.zeros(len(input_x))  # 予測値を格納するリスト
    imp = pd.DataFrame()  # 特徴量の重要度を格納するdf

    # データを学習用と評価用に分割
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv = list(kf.split(input_x, input_y))

    # 交差検証法でモデル構築
    for nfold in range(n_splits):
        tr_idx, val_idx = cv[nfold][0], cv[nfold][1]
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # モデル学習
        model = Ridge(alpha=10, random_state=1)
        model.fit(x_tr, y_tr)

        # モデルで予測
        y_tr_preds = model.predict(x_tr)
        y_val_preds = model.predict(x_val)
        # 精度(正解率)の確認
        metric_tr = round(rmse(y_tr, y_tr_preds), 5)
        metric_val = round(rmse(y_val, y_val_preds), 5)
        print("[RMSE] tr: {:.5f}, val: {:.5f}".format(metric_tr, metric_val))
        scores.append([nfold, metric_tr, metric_val])
        # 検証データの予測値を該当のIDの場所に格納
        val_preds[val_idx] = y_val_preds

    scores = np.array(scores)

    # 予測
    y_test_preds = model.predict(x_test)

    return scores, y_test_preds, val_preds

In [None]:
print("===== Ridge =====")
ridge_scores, ridge_test_preds, ridge_val_preds = Ridge_train_cv(
    x_train, y_train, id_train
)
print("\nRMSEの平均値")
print(round(np.mean(ridge_scores[:, 2]), 5))

===== Ridge =====
[RMSE] tr: 0.09240, val: 0.12392
[RMSE] tr: 0.09444, val: 0.10979
[RMSE] tr: 0.09218, val: 0.11721
[RMSE] tr: 0.09132, val: 0.12351
[RMSE] tr: 0.09653, val: 0.10056

RMSEの平均値
0.115


In [None]:
# ElasticNetで同様に交差検証法でモデル構築
from sklearn.linear_model import ElasticNet


def ElasticNet_train_cv(input_x, input_y, input_id, n_splits=5):
    scores = []  # rmseのスコアを格納するリスト
    val_preds = np.zeros(len(input_x))  # 予測値を格納するリスト
    imp = pd.DataFrame()  # 特徴量の重要度を格納するdf

    # データを学習用と評価用に分割
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv = list(kf.split(input_x, input_y))

    # 交差検証法でモデル構築
    for nfold in range(n_splits):
        tr_idx, val_idx = cv[nfold][0], cv[nfold][1]
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        # モデル学習
        model = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=1)
        model.fit(x_tr, y_tr)

        # モデルで予測
        y_tr_preds = model.predict(x_tr)
        y_val_preds = model.predict(x_val)
        # 精度(正解率)の確認
        metric_tr = round(rmse(y_tr, y_tr_preds), 5)
        metric_val = round(rmse(y_val, y_val_preds), 5)
        print("[RMSE] tr: {:.5f}, val: {:.5f}".format(metric_tr, metric_val))
        scores.append([nfold, metric_tr, metric_val])
        # 検証データの予測値を該当のIDの場所に格納
        val_preds[val_idx] = y_val_preds

    scores = np.array(scores)

    # 予測
    y_test_preds = model.predict(x_test)

    return scores, y_test_preds, val_preds

In [None]:
print("===== ElasticNet =====")
en_scores, en_test_preds, en_val_preds = ElasticNet_train_cv(x_train, y_train, id_train)
print("\nRMSEの平均値")
print(round(np.mean(en_scores[:, 2]), 5))

===== ElasticNet =====
[RMSE] tr: 0.09513, val: 0.11749
[RMSE] tr: 0.09615, val: 0.10868
[RMSE] tr: 0.09445, val: 0.11717
[RMSE] tr: 0.09417, val: 0.12006
[RMSE] tr: 0.09848, val: 0.10164

RMSEの平均値
0.11301


In [None]:
# 標準化用のデータを作成
from sklearn.preprocessing import StandardScaler

# 説明変数と目的変数を定義
x_train_std = df_train_std.drop(["SalePrice", "Id", "SalePriceLog"], axis=1)
x_test_std = df_test_std.drop(["Id"], axis=1)

# 標準化
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train_std)
x_test_std = scaler.transform(x_test_std)

In [None]:
# カーネルリッジで同様に交差検証法でモデル構築
from sklearn.kernel_ridge import KernelRidge


def kernel_ridge_train_cv(input_x, input_y, input_id, n_splits=5):
    scores = []  # rmseのスコアを格納するリスト
    val_preds = np.zeros(len(input_x))  # 予測値を格納するリスト
    imp = pd.DataFrame()  # 特徴量の重要度を格納するdf

    # データを学習用と評価用に分割
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv = list(kf.split(input_x, input_y))

    # 交差検証法でモデル構築
    for nfold in range(n_splits):
        tr_idx, val_idx = cv[nfold][0], cv[nfold][1]
        x_tr, x_val = input_x[tr_idx], input_x[val_idx]
        y_tr, y_val = input_y[tr_idx], input_y[val_idx]

        # モデル学習
        model = KernelRidge(alpha=0.6, kernel="polynomial", degree=2, coef0=2.5)
        model.fit(x_tr, y_tr)

        # モデルで予測
        y_tr_preds = model.predict(x_tr)
        y_val_preds = model.predict(x_val)
        # 精度(正解率)の確認
        metric_tr = round(rmse(y_tr, y_tr_preds), 5)
        metric_val = round(rmse(y_val, y_val_preds), 5)
        print("[RMSE] tr: {:.5f}, val: {:.5f}".format(metric_tr, metric_val))
        scores.append([nfold, metric_tr, metric_val])
        # 検証データの予測値を該当のIDの場所に格納
        val_preds[val_idx] = y_val_preds

    scores = np.array(scores)

    # 予測
    y_test_preds = model.predict(x_test_std)

    return scores, y_test_preds, val_preds

In [None]:
# print("===== Kernel Ridge =====")
# kr_scores, kr_test_preds, kr_val_preds = kernel_ridge_train_cv(
#     x_train_std, y_train, id_train
# )
# print("\nRMSEの平均値")
# print(round(np.mean(kr_scores[:, 2]), 5))

In [None]:
# モデル名とRMSEをデータフレームに格納
df_score = pd.DataFrame(
    {
        "model": ["LGBM", "Lasso", "Ridge", "ElasticNet"],
        "RMSE": [
            np.mean(lgb_scores[:, 2]),
            np.mean(lasso_scores[:, 2]),
            np.mean(ridge_scores[:, 2]),
            np.mean(en_scores[:, 2]),
        ],
    }
)
df_score

Unnamed: 0,model,RMSE
0,LGBM,0.126878
1,Lasso,0.11308
2,Ridge,0.114998
3,ElasticNet,0.113008
4,SVR,0.396604


In [None]:
# アンサンブル
# 予測値を平均
y_test_preds = (lgb_test_preds + lasso_test_preds + svr_test_preds) / 3
y_test_preds = np.exp(y_test_preds)  # 対数変換を戻す

# 提出用データの作成
df_submission = pd.DataFrame({"Id": id_test, "SalePrice": y_test_preds})
df_submission.to_csv("submission.csv", index=False)

In [None]:
# # スタッキング
# from sklearn.ensemble import StackingRegressor

# # モデルの定義
# estimators = [
#     ("lgb", lgb.LGBMRegressor(**params)),
#     ("lasso", Lasso(alpha=0.0005, random_state=1)),
#     ("ridge", Ridge(alpha=10, random_state=123)),
#     # ("svr", SVR(kernel="rbf", C=100, gamma=0.0001)),
# ]

# # スタッキングの定義
# model = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=10))

# # モデル学習
# model.fit(x_train, y_train)

# # 精度の確認
# y_train_pred = model.predict(x_train)
# print("RMSE: {:.5f}".format(rmse(y_train, y_train_pred)))

# # テストデータで予測
# y_test_pred = model.predict(x_test)
# y_test_pred = np.exp(y_test_pred)

In [None]:
# # 提出用ファイルの作成
# submission = pd.DataFrame({"Id": id_test, "SalePrice": y_test_pred})
# submission.to_csv("submit_stacking.csv", index=False)

In [None]:
# # ラッソ回帰で同様に学習
# lasso = Lasso(alpha=0.0005, random_state=1)
# lasso.fit(x_train, y_train)
# lasso_preds = lasso.predict(x_test)

# # リッジ回帰で同様に学習
# ridge = Ridge(alpha=10, random_state=123)
# ridge.fit(x_train, y_train)
# ridge_preds = ridge.predict(x_test)

# ### 最高スコア
# # 予測結果の平均を取る
# preds = (lgb_preds + lasso_preds + ridge_preds) / 3
# preds = np.exp(preds)

In [None]:
# スタッキング用のデータフレームを作成

NameError: name 'y_test' is not defined