In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the training data from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,0,23.059782,6,140,110,2815,17.977429,80,1,dodge aspen
1,3,17.674521,8,350,150,4456,13.514535,72,1,dodge rampage
2,4,17.136353,8,302,140,2774,13.209912,79,1,mercury cougar brougham
3,7,22.664666,6,400,85,2190,15.196381,71,1,pontiac j2000 se hatchback
4,9,17.872018,8,429,220,2245,9.6214,70,1,ford galaxie 500


In [3]:
# Print each column's dtype and non-null count to spot columns that need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            500 non-null    int64  
 1   mpg           500 non-null    float64
 2   cylinders     500 non-null    int64  
 3   displacement  500 non-null    int64  
 4   horsepower    500 non-null    object 
 5   weight        500 non-null    int64  
 6   acceleration  500 non-null    float64
 7   model year    500 non-null    int64  
 8   origin        500 non-null    int64  
 9   car name      500 non-null    object 
dtypes: float64(2), int64(6), object(2)
memory usage: 39.2+ KB


In [4]:
# horsepower was loaded as strings (object dtype), so list its distinct raw
# values to find the entries that are not numbers.
df.loc[:, "horsepower"].unique()

array(['110', '150', '140', '85', '220', '165', '60', '?', '90', '67',
       '97', '72', '122', '139', '88', '100', '105', '54', '71', '78',
       '75', '148', '130', '95', '70', '112', '69', '132', '120', '108',
       '74', '96', '61', '193', '58', '82', '80', '145', '64', '135'],
      dtype=object)

In [5]:
# Drop the rows whose horsepower holds the non-numeric "?" placeholder so the
# column can later be cast to float.
df0 = df.loc[df["horsepower"].ne("?")]

In [6]:
# Features: every column except the row identifier, the target, and the
# free-text car name, all cast to 64-bit floats.
X = df0.drop(columns=["id", "mpg", "car name"]).astype("float64")
# Target: the mpg column, as 64-bit floats.
y = df0.loc[:, "mpg"].astype("float64")

In [7]:
# Split the data into a training set and a validation set (20% held out);
# the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Convert the splits to LightGBM Dataset objects. The validation Dataset is
# built with the training Dataset as its reference, as LightGBM expects for
# validation data.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Hyperparameters
params = {
    'objective': 'regression',  # regression task
    'metric': 'rmse',  # evaluate with root mean squared error (RMSE)
    'boosting_type': 'gbdt',  # gradient boosting type
    'num_leaves': 31,  # number of leaves per tree
    'learning_rate': 0.05,  # learning rate
    'feature_fraction': 0.9,  # fraction of features used for each tree
}

# Train the model
num_round = 100  # maximum number of boosting iterations
# Early stopping and per-iteration logging are configured through callbacks:
# the `early_stopping_rounds` keyword of lgb.train() was deprecated in
# LightGBM 3.3 and removed in 4.0, where passing it raises a TypeError.
# NOTE: lgb.log_evaluation requires LightGBM >= 3.3.
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[val_data],
    callbacks=[
        # Stop once the validation RMSE has not improved for 10 rounds.
        lgb.early_stopping(stopping_rounds=10),
        # Print the validation metric every round, as the original run did.
        lgb.log_evaluation(period=1),
    ],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 264
[LightGBM] [Info] Number of data points in the train set: 392, number of used features: 7
[LightGBM] [Info] Start training from score 26.889137
[1]	valid_0's rmse: 6.99272
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 6.79782
[3]	valid_0's rmse: 6.61157
[4]	valid_0's rmse: 6.44546
[5]	valid_0's rmse: 6.23113
[6]	valid_0's rmse: 6.02853
[7]	valid_0's rmse: 5.89079
[8]	valid_0's rmse: 5.71492
[9]	valid_0's rmse: 5.54971
[10]	valid_0's rmse: 5.38923
[11]	valid_0's rmse: 5.25374
[12]	valid_0's rmse: 5.12252
[13]	valid_0's rmse: 5.00484
[14]	valid_0's rmse: 4.88156
[15]	valid_0's rmse: 4.766
[16]	valid_0's rmse: 4.65919
[17]	valid_0's rmse: 4.56068
[18]	valid_0's rmse: 4.47154
[19]	valid_0's rmse: 4.38966
[20]	valid_0's rmse: 4.31262
[21]	valid_0's rmse: 4.23804
[22]	valid_0's rmse: 4.16822
[23]	valid_0'



In [9]:
# Predict on the held-out validation set (X_val), using the best iteration
# recorded on the trained booster.
y_pred = bst.predict(
    X_val,
    num_iteration=bst.best_iteration,
)

In [10]:
# Evaluate the model: mean squared error of the validation predictions.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 10.00911956334721


In [12]:
# Coefficient of determination (R^2) of the validation predictions.
r2 = r2_score(y_true=y_val, y_pred=y_pred)
print(f"r2 score: {r2}")

r2 score: 0.8106493595894261
