In [55]:
# Data manipulation
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns

# Algorithms
from sklearn.linear_model import Ridge,Lasso,ElasticNet

# Splitting the data into train and test sets
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Used to keep unneeded warnings out of the notebook output.
# NOTE(review): the filter is installed without any category or module
# restriction, so it is not limited to a specific kind of warning.
import warnings
warnings.filterwarnings('ignore')

Data source: https://www.kaggle.com/datasets/mirichoi0218/insurance

In [56]:
# Load the insurance CSV from the local Data folder and preview the first rows.
df = pd.read_csv("./Data/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [57]:
# Summarize the raw frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [58]:
# Descriptive statistics of the raw frame, transposed so that each described
# column becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [59]:
# List the column names of the raw frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [60]:
# One-hot encode the categorical columns so that every feature is numeric;
# the models used below do not work with text values. The first level of each
# category is dropped and the dummy columns are stored as 64-bit integers.
df_encoded = pd.get_dummies(
    df,
    columns=["sex", "smoker", "region"],
    drop_first=True,
    dtype=np.int64,
)

In [61]:
# Display the one-hot encoded frame.
df_encoded

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


In [62]:
# Inspect the dtypes and non-null counts of the encoded frame.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   bmi               1338 non-null   float64
 2   children          1338 non-null   int64  
 3   charges           1338 non-null   float64
 4   sex_male          1338 non-null   int64  
 5   smoker_yes        1338 non-null   int64  
 6   region_northwest  1338 non-null   int64  
 7   region_southeast  1338 non-null   int64  
 8   region_southwest  1338 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 94.2 KB


In [63]:
# Separate the target from the remaining features; in other words, split the
# dependent variable ("charges") from the independent variables.
y = df_encoded["charges"]
X = df_encoded.drop("charges", axis="columns")

In [64]:
# Split the data into a training set and a test set (train_size=0.8, fixed
# random_state so the split is repeatable).
# In a serious project this split should be made before one-hot encoding.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,random_state=42)

# Ridge Regression

In [65]:
# alpha=1 is passed explicitly; per the original note this is already the
# default alpha value.
Ridge_reg = Ridge(alpha=1)
# Train the Ridge regression model on the training split.
Ridge_reg.fit(X_train,y_train)

In [66]:
# Predict on the test features with the Ridge model trained above.
# Note: `y_pred` is reused (overwritten) by every later prediction cell.
y_pred = Ridge_reg.predict(X_test)

In [67]:
# Score the current Ridge predictions against the test targets. The MSE is
# computed once and reused for the RMSE.
ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_mse = mean_squared_error(y_test, y_pred)
print("Ridge MAE:", ridge_mae)
print("Ridge MSE:", ridge_mse)
print("Ridge RMSE:", np.sqrt(ridge_mse))
print("Ridge R2 Score:", r2_score(y_test, y_pred))

Ridge MAE: 4193.585298299824
Ridge MSE: 33645037.09177903
Ridge RMSE: 5800.434215796179
Ridge R2 Score: 0.7832830144988214


In [68]:
# The alpha hyperparameter is changed to 0.1.
Ridge_reg = Ridge(alpha=0.1)
# Train the Ridge regression model on the training split.
Ridge_reg.fit(X_train,y_train)

In [69]:
# Predict on the test features with the Ridge model trained above (alpha=0.1
# when the cells are run in order).
y_pred = Ridge_reg.predict(X_test)

In [70]:
# Score the current Ridge predictions against the test targets. The MSE is
# computed once and reused for the RMSE.
ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_mse = mean_squared_error(y_test, y_pred)
print("Ridge MAE:", ridge_mae)
print("Ridge MSE:", ridge_mse)
print("Ridge RMSE:", np.sqrt(ridge_mse))
print("Ridge R2 Score:", r2_score(y_test, y_pred))

Ridge MAE: 4182.428134240744
Ridge MSE: 33601489.86482401
Ridge RMSE: 5796.679210101592
Ridge R2 Score: 0.7835635142268166


In [71]:
# The alpha hyperparameter is changed to 10.
Ridge_reg = Ridge(alpha=10)
# Train the Ridge regression model on the training split.
Ridge_reg.fit(X_train,y_train)

In [72]:
# Predict on the test features with the Ridge model trained above (alpha=10
# when the cells are run in order).
y_pred = Ridge_reg.predict(X_test)

In [73]:
# Score the current Ridge predictions against the test targets. The MSE is
# computed once and reused for the RMSE.
ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_mse = mean_squared_error(y_test, y_pred)
print("Ridge MAE:", ridge_mae)
print("Ridge MSE:", ridge_mse)
print("Ridge RMSE:", np.sqrt(ridge_mse))
print("Ridge R2 Score:", r2_score(y_test, y_pred))

Ridge MAE: 4303.735883230566
Ridge MSE: 34289876.70984791
Ridge RMSE: 5855.755861530423
Ridge R2 Score: 0.7791294242448292


In [74]:
# The alpha hyperparameter is changed to 100. This is the last Ridge model
# fitted in the notebook, so later cells that read `Ridge_reg` see this one.
Ridge_reg = Ridge(alpha=100)
# Train the Ridge regression model on the training split.
Ridge_reg.fit(X_train,y_train)

In [75]:
# Predict on the test features with the Ridge model trained above (alpha=100
# when the cells are run in order).
y_pred = Ridge_reg.predict(X_test)

In [76]:
# Score the current Ridge predictions against the test targets. The MSE is
# computed once and reused for the RMSE.
ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_mse = mean_squared_error(y_test, y_pred)
print("Ridge MAE:", ridge_mae)
print("Ridge MSE:", ridge_mse)
print("Ridge RMSE:", np.sqrt(ridge_mse))
print("Ridge R2 Score:", r2_score(y_test, y_pred))

Ridge MAE: 5191.3638964346665
Ridge MSE: 48311750.05326998
Ridge RMSE: 6950.665439601447
Ridge R2 Score: 0.6888106615168705


In [77]:
# Per-row comparison of the actual charges with the Ridge predictions.
# The predictions are taken from `Ridge_reg` inside this cell instead of the
# shared `y_pred` variable: `y_pred` is overwritten by every prediction cell
# (Ridge, Lasso, ElasticNet), so re-running this cell out of order would
# otherwise tabulate a different model's output.
# Columns: "charges" (actual), "Tahmin" (prediction), "Hata" (absolute error).
df_sonuc = y_test.to_frame().assign(Tahmin=Ridge_reg.predict(X_test))
# The error is reported as an absolute value.
df_sonuc["Hata"] = (df_sonuc["charges"] - df_sonuc["Tahmin"]).abs()
df_sonuc

Unnamed: 0,charges,Tahmin,Hata
764,9095.06825,10300.862209,1205.793959
887,5272.17580,8565.465765,3293.289965
890,29330.98315,29379.204711,48.221561
1293,9301.89355,11302.916218,2001.022668
259,33750.29180,20376.852743,13373.439057
...,...,...,...
109,47055.53210,32343.071173,14712.460927
575,12222.89830,13010.662501,787.764201
535,6067.12675,9425.316565,3358.189815
543,63770.42801,33825.172436,29945.255574


In [78]:
# Intercept of the Ridge model that was fitted most recently.
Ridge_reg.intercept_

-9834.037644251415

In [79]:
# Coefficients of the Ridge model that was fitted most recently.
# NOTE(review): presumably one value per feature, in the column order of
# X_train — confirm before interpreting individual coefficients.
Ridge_reg.coef_

array([  244.45471831,   327.30072507,   447.32088791,   350.15120563,
       14995.09460604,  -226.43421668,   -53.76669066,  -538.05896076])

# Lasso Regression

In [80]:
# Lasso is constructed without arguments, so the library defaults are used.
Lasso_reg = Lasso()
# Train the Lasso regression model on the training split.
Lasso_reg.fit(X_train,y_train)

In [81]:
# Predict on the test features with the Lasso model trained above.
# This overwrites the `y_pred` produced by the earlier Ridge cells.
y_pred = Lasso_reg.predict(X_test)

In [82]:
# Score the Lasso predictions against the test targets. The MSE is computed
# once and reused for the RMSE.
lasso_mae = mean_absolute_error(y_test, y_pred)
lasso_mse = mean_squared_error(y_test, y_pred)
print("Lasso MAE:", lasso_mae)
print("Lasso MSE:", lasso_mse)
print("Lasso RMSE:", np.sqrt(lasso_mse))
print("Lasso R2 Score:", r2_score(y_test, y_pred))

Lasso MAE: 4182.426033636681
Lasso MSE: 33605507.55392852
Lasso RMSE: 5797.025750669779
Lasso R2 Score: 0.7835376351805539


# ElasticNet

In [83]:
# ElasticNet is constructed without arguments, so the library defaults are used.
# NOTE(review): the features are passed to the model unscaled; penalized linear
# models are usually sensitive to feature scale, so standardizing the features
# and tuning the penalty may be worth trying — confirm against the scores.
ElasticNet_reg = ElasticNet()
# Train the ElasticNet regression model on the training split.
ElasticNet_reg.fit(X_train,y_train)

In [84]:
# Predict on the test features with the ElasticNet model trained above.
# This overwrites the `y_pred` produced by the earlier cells.
y_pred = ElasticNet_reg.predict(X_test)

In [85]:
# Score the ElasticNet predictions against the test targets. The MSE is
# computed once and reused for the RMSE.
elasticnet_mae = mean_absolute_error(y_test, y_pred)
elasticnet_mse = mean_squared_error(y_test, y_pred)
print("ElasticNet MAE:", elasticnet_mae)
print("ElasticNet MSE:", elasticnet_mse)
print("ElasticNet RMSE:", np.sqrt(elasticnet_mse))
print("ElasticNet R2 Score:", r2_score(y_test, y_pred))

ElasticNet MAE: 7423.855422936446
ElasticNet MSE: 90268049.60276304
ElasticNet RMSE: 9500.94993159963
ElasticNet R2 Score: 0.41855853677267385
