## Model Tuning / Model Doğrulama

In [234]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [235]:
ad = pd.read_csv("Advertising.csv", usecols = [1, 2, 3, 4])
df = ad.copy()

In [236]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [237]:
X1 = df.drop("sales", axis = 1)
y1 = df["sales"]
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
lm1 = LinearRegression()
model1 = lm.fit(X1_train, y1_train)

rmse1_train = np.sqrt(mean_squared_error(y1_train, model1.predict(X1_train)))
r1_square = model1.score(X1_train, y1_train)

In [238]:
rmse1_train

1.644727765644337

In [239]:
r1_square

0.8957008271017818

In [240]:
X2 = df.drop("sales", axis = 1)
y2 = df["sales"]
X2_train, X2_test, y2_train, y2_test = train_test_split(X, y, test_size = 0.2, random_state = 99)
lm2 = LinearRegression()
model2 = lm.fit(X2_train, y2_train)

rmse2_train = np.sqrt(mean_squared_error(y2_train, model2.predict(X2_train)))
r2_square = model2.score(X2_train, y2_train)

In [241]:
rmse2_train

1.7236824822650751

In [242]:
r2_square

0.8906288862925659

In [243]:
X3 = df.drop("sales", axis = 1)
y3 = df["sales"]
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y, test_size = 0.2, random_state = 144)
lm3 = LinearRegression()
model3 = lm.fit(X3_train, y3_train)

rmse3_train = np.sqrt(mean_squared_error(y3_train, model3.predict(X3_train)))
r3_square = model3.score(X3_train, y3_train)

In [244]:
rmse3_train

1.6748559274650712

In [245]:
r3_square

0.8971614078663419

her bir r_train ve s_square değeri birbirinden farklı.

bu modellerin seçilen kısmının farklı olduğu anlamına gelir.

modelin bir kısmını seçeceğiz fakat hangi kısmını seçeceğiz ?

bu problemi ortadan kaldırabilmek adına

'cross validation' yöntemi kullanılabilir.

-------

### 1. cross validation (eğitim / train için)

In [246]:
X = df.drop("sales", axis = 1)
y = df["sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 144)
lm = LinearRegression()
model = lm.fit(X_train, y_train)

rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
r_square_train = model.score(X_train, y_train)

In [247]:
rmse_train

1.6748559274650712

In [248]:
r_square_train

0.8971614078663419

In [249]:
cross_val_score(model, X, y, cv = 10, scoring = "r2")  # 10 tane birbirinden farklı r^2 değeri geldi.

array([0.87302696, 0.8581613 , 0.92968723, 0.89013272, 0.93146498,
       0.93138735, 0.7597901 , 0.91217097, 0.83891753, 0.92882311])

In [250]:
# modelimizin daha güvenilir(valide), doğrulanmış r^2 değerini tespit etmek için;

cross_val_score(model, X, y, cv = 10, scoring = "r2").mean()  # bu değerlerin ortalaması

0.8853562237979616

In [251]:
# negative train square errors (-1 ile çarpmayı unutma!)

-1 * cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")

array([3.77011117, 1.38904597, 1.31506551, 3.32109589, 8.82506973,
       2.37926645, 2.2872061 , 1.05714426, 2.99532621, 3.15248307])

In [252]:
# negative train mean square error (-1 ile çarpmayı unutma!)

-1 * cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error").mean()

3.0491814361587064

In [253]:
# negative train square root errors (-1 ile çarpmayı unutma!)

np.sqrt(-1 * cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error"))

array([1.94167741, 1.17857794, 1.14676306, 1.82238742, 2.97070189,
       1.5424871 , 1.51235118, 1.02817521, 1.73070108, 1.77552332])

In [254]:
# negative train mean square root error (-1 ile çarpmayı unutma!)

# GERÇEK EĞİTİM HATAM BU : Daha doğru bir hatayı bu şekilde tespit edebiliriz.

np.sqrt(-1 * cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")).mean()

1.6649345607872927

-----------

### 2. cross validation (test için)

In [255]:
X = df.drop("sales", axis = 1)
y = df["sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 144)
lm = LinearRegression()
model = lm.fit(X_train, y_train)

rmse_test = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
r_square_test = model.score(X_train, y_train)

In [256]:
rmse_test

1.6640263686701033

In [257]:
r_square_test

0.8971614078663419

In [258]:
cross_val_score(model, X, y, cv = 10, scoring = "r2")  # 10 tane birbirinden farklı r^2 değeri geldi.

array([0.87302696, 0.8581613 , 0.92968723, 0.89013272, 0.93146498,
       0.93138735, 0.7597901 , 0.91217097, 0.83891753, 0.92882311])

In [259]:
# modelimizin daha güvenilir(valide), doğrulanmış r^2 değerini tespit etmek için;

cross_val_score(model, X, y, cv = 10, scoring = "r2").mean()  # bu değerlerin ortalaması

0.8853562237979616

In [260]:
cross_val_score(model, X_test, y_test, cv = 10, scoring = "r2").mean()

0.7323744767914121

In [261]:
# negative test square errors (-1 ile çarpmayı unutma!)

-1 * cross_val_score(model, X_test, y_test, cv = 10, scoring = "neg_mean_squared_error")

array([1.01930472, 1.00492969, 6.61930405, 3.00595358, 2.49950664,
       6.17364417, 5.35729881, 2.47949459, 3.29827621, 1.71683906])

In [262]:
# negative test mean square error (-1 ile çarpmayı unutma!)

-1 * cross_val_score(model, X_test, y_test, cv = 10, scoring = "neg_mean_squared_error").mean()

3.3174551530490852

In [263]:
# negative test square root errors (-1 ile çarpmayı unutma!)

np.sqrt(-1 * cross_val_score(model, X_test, y_test, cv = 10, scoring = "neg_mean_squared_error"))

array([1.00960622, 1.00246182, 2.57280082, 1.73376861, 1.58098281,
       2.4846819 , 2.31458394, 1.5746411 , 1.81611569, 1.31028205])

In [264]:
# negative test mean square root error (-1 ile çarpmayı unutma!)

# GERÇEK TEST HATAM BU : Daha doğru bir hatayı bu şekilde tespit edebiliriz.

np.sqrt(-1 * cross_val_score(model, X_test, y_test, cv = 10, scoring = "neg_mean_squared_error")).mean()

1.7399924960346644

-----------

### 3. cross validation (Eğitim - Test Karşılaştırma)

valide edilmiş test ve eğitim hatasına bu şekilde erişmiş oluruz.

In [265]:
# negative train mean square root error (-1 ile çarpmayı unutma!)

# GERÇEK EĞİTİM HATAM BU : Daha doğru bir hatayı bu şekilde tespit edebiliriz.

np.sqrt(-1 * cross_val_score(model, X_train, y_train, cv = 10, scoring = "neg_mean_squared_error")).mean()

1.6649345607872927

In [266]:
# negative test mean square root error (-1 ile çarpmayı unutma!)

# GERÇEK TEST HATAM BU : Daha doğru bir hatayı bu şekilde tespit edebiliriz.

np.sqrt(-1 * cross_val_score(model, X_test, y_test, cv = 10, scoring = "neg_mean_squared_error")).mean()

1.7399924960346644