In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
data = pd.read_csv('/Users/tarlanjabiyev/Desktop/Sprint 19/insurance.csv')

In [72]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Datanın hazırlanması - Data Preprocessing

In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [77]:
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
target = 'charges'

In [7]:
X = data.drop(columns=[target])
y = data[target]

In [9]:
# Datanın bölünməsi - Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [11]:
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

In [17]:
#For Tree/Forest/XGB: numeric passthrough (no scaling), one-hot for categoricals
preprocess_basic = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)
preprocess_basic

In [19]:
# For GLM/KNN/SVM: scaling for numeric, one-hot for categoricals
preprocess_scaled = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)
preprocess_scaled

## Alqoritmalar

In [21]:
def eval_reg(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)
    print(f"\n[{name}]")
    print(f"MAE : {mae:,.4f}")
    print(f"RMSE: {rmse:,.4f}")
    print(f"R^2 : {r2:,.4f}")
    return {"model": name, "mae": mae, "rmse": rmse, "rsq": r2}

In [23]:
results = []

### GLM — Linear Regression

In [25]:
glm_pipe = Pipeline([
    ("prep", preprocess_scaled),
    ("model", LinearRegression())
])
results.append(eval_reg("GLM (Linear Regression)", glm_pipe, X_train, y_train, X_test, y_test))


[GLM (Linear Regression)]
MAE : 4,013.1006
RMSE: 5,527.4259
R^2 : 0.8002




### KNN

In [29]:
knn_pipe = Pipeline([
    ("prep", preprocess_scaled),
    ("model", KNeighborsRegressor(n_neighbors=5))
])
results.append(eval_reg("KNN", knn_pipe, X_train, y_train, X_test, y_test))


[KNN]
MAE : 3,497.5679
RMSE: 5,772.8552
R^2 : 0.7820




### SVM

In [31]:
svm_pipe = Pipeline([
    ("prep", preprocess_scaled),
    ("model", SVR(kernel="rbf"))
])
results.append(eval_reg("SVM", svm_pipe, X_train, y_train, X_test, y_test))


[SVM]
MAE : 8,379.2165
RMSE: 13,071.3557
R^2 : -0.1175




### Decision Tree

In [33]:
tree_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", DecisionTreeRegressor(
        max_depth=15, #the maximum number of splits from root to leaf
                      #large value: high variance, can overfit
                      #small value: high bias, low variance
        min_samples_split=2, #minimum row count in a node
                             #large value: less overfitting, high bias, low variance
                             #smal value: can overfit, low bias, high variance
        random_state=123))
])
results.append(eval_reg("Decision Tree", tree_pipe, X_train, y_train, X_test, y_test))


[Decision Tree]
MAE : 3,027.1570
RMSE: 6,469.6793
R^2 : 0.7262




### Random Forest (Bagging)

In [110]:
rf_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", RandomForestRegressor(
        n_estimators=500,
        min_samples_leaf=5,
        random_state=123
    ))
])
results.append(eval_reg("Random Forest", rf_pipe, X_train, y_train, X_test, y_test))


[Random Forest]
MAE : 2,323.8005
RMSE: 3,938.4769
R^2 : 0.8985




### XGBoost (Boosting)

In [112]:
xgb_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1000, #more trees: higher capacity and training time. Usually paired with a smaller learn_rate
        learning_rate=0.05, #step size reduction per boosting step
                            #small (0.01–0.1): slower, steadier learning, needs more trees, often better generalization
                            #large (0.2–0.3): faster, risk of overfit if trees is also large
        max_depth=6, #the maximum number of splits from root to leaf
                     #large (6–12): high variance, can overfit
                     #small (3–6): high bias, low variance
        min_child_weight=5, #large (5–20): less overfitting, high bias, low variance
                            #smal (1–2): can overfit, low bias, high variance
        random_state=42
    ))
])
results.append(eval_reg("XGBoost", xgb_pipe, X_train, y_train, X_test, y_test))


[XGBoost]
MAE : 2,839.6089
RMSE: 4,566.2129
R^2 : 0.8636




### LightGBM

In [125]:
lgb_pipe = Pipeline([
    ("prep", preprocess_basic),
    ("model", LGBMRegressor(
        objective="regression",
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        min_child_samples=5,
        random_state=123
    ))
])

results.append(eval_reg("LightGBM", lgb_pipe, X_train, y_train, X_test, y_test))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 11
[LightGBM] [Info] Start training from score 13189.257679

[LightGBM]
MAE : 2,918.2971
RMSE: 4,567.5875
R^2 : 0.8635




## Ən yaxşı model seçimi

In [127]:
pd.DataFrame(results).sort_values("rmse").reset_index(drop=True)

Unnamed: 0,model,mae,rmse,rsq
0,Random Forest,2323.800481,3938.476852,0.898548
1,XGBoost,2839.60891,4566.212917,0.86363
2,LightGBM,2918.29706,4567.587458,0.863548
3,LightGBM,2918.29706,4567.587458,0.863548
4,LightGBM,2940.996616,4604.879796,0.861311
5,GLM (Linear Regression),4013.100618,5527.42593,0.800174
6,KNN,3497.567895,5772.855163,0.782035
7,Decision Tree,3027.157013,6469.679301,0.72624
8,SVM,8379.216547,13071.355726,-0.117497
