In [1]:
# downloding lightgbm
!pip install -qU lightgbm

In [2]:
# imports
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load the penguins example dataset through seaborn and preview the first rows
df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
# getting info of dataset: per-column dtype and non-null count, which shows
# which columns contain missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# cast every object-dtype column to pandas' category dtype in a single astype
# call, then re-print the summary to confirm the new dtypes
object_cols = df.select_dtypes(include=['object']).columns
df = df.astype({name: 'category' for name in object_cols})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            344 non-null    category
 1   island             344 non-null    category
 2   bill_length_mm     342 non-null    float64 
 3   bill_depth_mm      342 non-null    float64 
 4   flipper_length_mm  342 non-null    float64 
 5   body_mass_g        342 non-null    float64 
 6   sex                333 non-null    category
dtypes: category(3), float64(4)
memory usage: 12.3 KB


In [6]:
# creating X & y
X = df.drop(['bill_length_mm'], axis=1)
y = df['bill_length_mm']

In [7]:
# names of the object/category-dtype columns, as a plain Python list
cat = list(df.select_dtypes(include=['object', 'category']).columns)

In [8]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)

In [9]:
# model training: gradient-boosted trees with an L2 regression objective
model = lgbm.LGBMRegressor(
    boosting_type='gbdt',
    objective='regression_l2',
    learning_rate=0.03,
    n_estimators=100,
    random_state=18,
)
# the test split is passed as eval_set, so the l2 values logged during fitting
# are measured on it
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    categorical_feature=cat,
)

[1]	valid_0's l2: 33.6985
[2]	valid_0's l2: 31.8979
[3]	valid_0's l2: 30.2047
[4]	valid_0's l2: 28.6169
[5]	valid_0's l2: 27.1236
[6]	valid_0's l2: 25.715
[7]	valid_0's l2: 24.3904
[8]	valid_0's l2: 23.1483
[9]	valid_0's l2: 21.9699
[10]	valid_0's l2: 20.8677
[11]	valid_0's l2: 19.7891
[12]	valid_0's l2: 18.8193
[13]	valid_0's l2: 17.868
[14]	valid_0's l2: 17.0046
[15]	valid_0's l2: 16.1574
[16]	valid_0's l2: 15.4023
[17]	valid_0's l2: 14.6508
[18]	valid_0's l2: 13.9729
[19]	valid_0's l2: 13.3162
[20]	valid_0's l2: 12.713
[21]	valid_0's l2: 12.1504
[22]	valid_0's l2: 11.6144
[23]	valid_0's l2: 11.1227
[24]	valid_0's l2: 10.6627
[25]	valid_0's l2: 10.2274
[26]	valid_0's l2: 9.82435
[27]	valid_0's l2: 9.43926
[28]	valid_0's l2: 9.11952
[29]	valid_0's l2: 8.7906
[30]	valid_0's l2: 8.5006
[31]	valid_0's l2: 8.20132
[32]	valid_0's l2: 7.94271
[33]	valid_0's l2: 7.67351
[34]	valid_0's l2: 7.4447
[35]	valid_0's l2: 7.26089
[36]	valid_0's l2: 7.02277
[37]	valid_0's l2: 6.85785
[38]	valid_0's l

In [10]:
# checking results: R2 on the test split, expressed as a percentage
# built-in round() is used instead of the .round() method so this works
# whether r2_score returns a NumPy scalar or a plain Python float
print('R2:', round(r2_score(y_test, model.predict(X_test)) * 100, 2))

R2: 89.71
