In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [10]:
from warnings import filterwarnings

# Suppress every warning for the rest of the session. This also hides any
# deprecation or convergence warnings the libraries used below may emit.
filterwarnings('ignore')

In [11]:
# Load the Hitters data and discard rows that contain missing values.
df = pd.read_csv("data/Hitters.csv").dropna()

# One-hot encode the three categorical columns.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])

# Dependent variable.
y = df["Salary"]

# Numeric predictors cast to float64, plus one indicator per categorical column.
x_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')
x = pd.concat([x_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [12]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
183,328.0,91.0,12.0,51.0,43.0,33.0,2.0,342.0,94.0,12.0,51.0,44.0,33.0,145.0,59.0,8.0,True,False,True
229,514.0,144.0,0.0,67.0,54.0,79.0,9.0,4739.0,1169.0,13.0,583.0,374.0,528.0,229.0,453.0,15.0,True,False,True
286,593.0,152.0,23.0,69.0,75.0,53.0,6.0,2765.0,686.0,133.0,369.0,384.0,321.0,315.0,10.0,6.0,False,True,False
102,233.0,49.0,2.0,41.0,23.0,18.0,8.0,1350.0,336.0,7.0,166.0,122.0,106.0,102.0,132.0,10.0,False,False,False
153,341.0,95.0,6.0,48.0,42.0,20.0,10.0,2964.0,808.0,81.0,379.0,428.0,221.0,158.0,4.0,5.0,True,True,True


K-NN
Tahmin, gözlemlerin birbirine olan benzerlikleri üzerinden yapılır.

In [None]:
# Baseline KNN regressor with default hyperparameters.
knn_model = KNeighborsRegressor()
knn_model.fit(x_train, y_train)

# Number of neighbours the model uses.
knn_model.n_neighbors

In [None]:
# Distance metric configured on the KNN model.
knn_model.metric

In [None]:
# Exploratory: list every attribute and method name of the fitted model.
dir(knn_model)

In [None]:
# First five test-set predictions.
knn_model.predict(x_test)[0:5]

In [None]:
# Predictions of the baseline KNN model for the whole test split.
y_pred = knn_model.predict(x_test)

In [None]:
# Test RMSE: square root of the mean squared error.
np.sqrt(mean_squared_error(y_test, y_pred))

KNN MODEL TUNING

In [None]:
# Test RMSE for each neighbour count from 1 to 10.
# Note that these scores are measured on the test split itself.
RMSE = []

for k in range(1, 11):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(x_train, y_train)
    y_pred = knn_model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)
    print("k=", k, "için RMSE değeri:", rmse)

In [None]:
# Grid for GridSearchCV: every neighbour count from 1 to 29.
knn_params = [
    {"n_neighbors": np.arange(1, 30)},
]

In [None]:
# Unfitted KNN regressor used as the base estimator for the grid search.
knn = KNeighborsRegressor()

In [None]:
# 10-fold cross-validated search over knn_params on the training split.
knn_cv_model = GridSearchCV(knn, knn_params, cv=10).fit(x_train, y_train)

In [None]:
# Neighbour count selected by the search.
knn_cv_model.best_params_

In [None]:
# Final model: KNN fitted on the training split with the neighbour count
# selected by the grid search.
knn_tuned = KNeighborsRegressor(n_neighbors=knn_cv_model.best_params_["n_neighbors"]).fit(x_train, y_train)

In [None]:
# Test RMSE of the tuned KNN model.
y_pred = knn_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

SUPPORT VECTOR REGRESSION(SVR)

In [None]:
# Support vector regression with a linear kernel; other settings are defaults.
svr_model = SVR(kernel="linear").fit(x_train, y_train)
# Display the fitted estimator.
svr_model

In [None]:
# First five predictions on the training split.
svr_model.predict(x_train)[0:5]

In [None]:
# First five predictions on the test split.
svr_model.predict(x_test)[0:5]

In [None]:
# Intercept term of the fitted linear SVR.
svr_model.intercept_

In [None]:
# Feature weights of the fitted SVR (coef_ is only defined for the linear kernel).
svr_model.coef_

In [None]:
# Test RMSE of the baseline SVR model.
y_pred = svr_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

SVR MODEL TUNING

In [None]:
# Candidate values for the SVR regularisation parameter C.
svr_params = dict(C=[0.1, 0.5, 1, 3])

In [None]:
# 5-fold cross-validated search over svr_params. verbose=2 prints progress
# and n_jobs=-1 runs the fits in parallel on all available cores.
svr_cv_model = GridSearchCV(svr_model, svr_params, cv=5, verbose=2, n_jobs=-1).fit(x_train, y_train)

In [None]:
# Value of C selected by the search.
svr_cv_model.best_params_

In [None]:
# Final SVR model. C is read from the grid-search result rather than copied by
# hand, so this cell cannot drift from what the search actually selected.
svr_tuned = SVR(kernel="linear", C=svr_cv_model.best_params_["C"]).fit(x_train, y_train)

In [None]:
# Test-set predictions of the tuned SVR model.
y_pred = svr_tuned.predict(x_test)

In [None]:
# Test RMSE of the tuned SVR model.
np.sqrt(mean_squared_error(y_test, y_pred))

ARTIFICIAL NEURAL NETWORK

MODEL - PREDICT

In [None]:
# Standardiser for the features; the network is trained on scaled inputs.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only and apply them to it.
x_train_scaled = scaler.fit_transform(x_train)

In [None]:
# Scale the test split with the already-fitted scaler (no refit on test data).
x_test_scaled = scaler.transform(x_test)

In [None]:
# Baseline multilayer perceptron on the standardised features. The seed fixes
# the random weight initialisation so the reported RMSE is reproducible across
# runs, in line with the seeded split and the seeded random forest.
mlp_model = MLPRegressor(random_state=42).fit(x_train_scaled, y_train)
# Display the fitted estimator.
mlp_model

In [None]:
# Test-set predictions of the baseline network (scaled inputs).
y_pred = mlp_model.predict(x_test_scaled)

In [None]:
# Test RMSE of the baseline network.
np.sqrt(mean_squared_error(y_test, y_pred))

MODEL TUNING

In [None]:
# Search grid for the network:
#   alpha              - penalty (regularisation) parameter
#   hidden_layer_sizes - two hidden layers with the given neuron counts
mlp_params = {
    "alpha": [0.1, 0.01, 0.02, 0.001, 0.0001],
    "hidden_layer_sizes": [(10, 20), (5, 5), (100, 100)],
}

In [None]:
# 10-fold cross-validated search over mlp_params on the scaled training data,
# run in parallel on all cores with progress output.
mlp_cv_model = GridSearchCV(mlp_model, mlp_params, cv=10, verbose=2, n_jobs=-1).fit(x_train_scaled, y_train)

In [None]:
# Hyperparameters selected by the search.
mlp_cv_model.best_params_

In [None]:
# Final model: built from the hyperparameters the grid search selected instead
# of hand-copied values, and seeded so the fit is reproducible.
mlp_tuned = MLPRegressor(random_state=42, **mlp_cv_model.best_params_).fit(x_train_scaled, y_train)

In [None]:
# Test-set predictions of the tuned network (scaled inputs).
y_pred = mlp_tuned.predict(x_test_scaled)

In [None]:
# Test RMSE of the tuned network.
np.sqrt(mean_squared_error(y_test, y_pred))

CLASSIFICATION and REGRESSION TREE(CART)

Amaç, veri setindeki karmaşık yapıları basit karar yapılarına dönüştürmektir.
Heterojen veri setleri, hedef değişkene göre homojen alt gruplara ayrılır.
Aşırı öğrenmeye eğilimlidir; karar ağacı tabanlı yöntemlerin temelini oluşturur.

MODEL - PREDICT

In [None]:
# Keep only the Hits column for the single-variable tree.
# NOTE: this overwrites the full feature matrices; the full set is rebuilt
# later by reloading the data.
x_train = x_train[["Hits"]]
x_test = x_test[["Hits"]]

In [None]:
# Regression tree limited to at most 10 leaf nodes.
cart_model = DecisionTreeRegressor(max_leaf_nodes=10)

In [None]:
# Fit the tree on the training split.
cart_model.fit(x_train, y_train)

In [None]:
# Dense grid over the observed range of the single predictor held in x_train.
# The array's own min()/max() return scalars; the builtin min()/max() iterate
# a 2-D array row by row and return 1-element arrays, which np.arange accepts
# only through a deprecated implicit array-to-scalar conversion.
hits_values = x_train.to_numpy()
x_grid = np.arange(hits_values.min(), hits_values.max(), 0.01).reshape(-1, 1)

# Training points in red, the tree's piecewise-constant prediction in blue.
plt.scatter(x_train, y_train, color='red')
plt.plot(x_grid, cart_model.predict(x_grid), color='blue')

plt.title('Cart regression tree')
plt.xlabel('Atış sayısı(Hits)')
# The label text now closes its parenthesis; the trailing ';' hides the repr.
plt.ylabel('Maaş(Salary)');

TEK DEĞİŞKENLİ

In [None]:
# Test-set predictions of the single-variable tree.
y_pred = cart_model.predict(x_test)

In [None]:
# Test RMSE of the single-variable tree.
np.sqrt(mean_squared_error(y_test, y_pred))

TÜM DEĞİŞKENLER

In [None]:
# Rebuild the full feature set from disk; x_train/x_test were reduced to a
# single column for the one-variable tree.
df = pd.read_csv("data/Hitters.csv").dropna()  # rows with missing values removed

categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])

y = df["Salary"]  # dependent variable
x_ = df.drop(columns=['Salary'] + categorical_cols).astype('float64')
x = pd.concat([x_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
# Regression tree with default settings, fitted on all features.
cart_model = DecisionTreeRegressor().fit(x_train, y_train)

In [None]:
# Test RMSE of the default tree.
y_pred = cart_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

CART MODEL TUNING

In [None]:
# Same tree with its depth capped at 5, followed by its test RMSE.
cart_model = DecisionTreeRegressor(max_depth=5).fit(x_train, y_train)
y_pred = cart_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Search grid for the tree: maximum depth and the minimum number of samples
# required to split a node. Each candidate appears exactly once; listing a
# value twice only makes the search repeat identical fits.
cart_params = {"max_depth": [2, 3, 4, 5, 10, 20],
               "min_samples_split": [2, 10, 5, 30, 50]}

In [None]:
# Unfitted tree used as the base estimator for the grid search.
cart_model = DecisionTreeRegressor()

In [None]:
# 10-fold cross-validated search over cart_params on the training split.
cart_cv_model = GridSearchCV(cart_model, cart_params, cv=10).fit(x_train, y_train)

In [None]:
# Hyperparameters selected by the search.
cart_cv_model.best_params_

In [None]:
# Final model: max_depth and min_samples_split come straight from the grid
# search result rather than from hand-copied values.
cart_tuned = DecisionTreeRegressor(**cart_cv_model.best_params_).fit(x_train, y_train)

In [None]:
# Test RMSE of the tuned tree.
y_pred = cart_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

RANDOM FORESTS

Temeli, bootstrap yöntemi ile oluşturulan birden fazla karar ağacının ürettiği tahminlerin bir araya getirilerek değerlendirilmesine dayanır.

MODEL - PREDICT

In [None]:
# Random forest with default settings; random_state fixes the seed.
rf_model = RandomForestRegressor(random_state=42).fit(x_train, y_train)

In [None]:
# Test RMSE of the baseline random forest.
y_pred = rf_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

MODEL TUNING

In [None]:
# Search grid for the random forest.
params = dict(
    max_depth=[5, 8, 10],
    max_features=[2, 5, 10],
    n_estimators=[200, 500, 1000, 2000],
    min_samples_split=[2, 10, 80, 100],
)

In [None]:
# 10-fold cross-validated search over `params`, in parallel on all cores.
# The grid has 3 * 3 * 4 * 4 = 144 combinations, so this cell is slow.
rf_cv_model = GridSearchCV(rf_model, params, cv=10, n_jobs=-1, verbose=2).fit(x_train, y_train)

In [None]:
# Final model: a forest built from the hyperparameters the grid search
# selected, with the same seed as the baseline.
# Two things matter here:
#   * rf_tuned must be a new estimator. Refitting rf_model would just
#     reproduce the untuned baseline under a "tuned" name.
#   * rf_cv_model must not be reassigned, so the search results stay available.
rf_tuned = RandomForestRegressor(
    random_state=42,
    **rf_cv_model.best_params_,
).fit(x_train, y_train)

In [None]:
# Test RMSE of rf_tuned.
y_pred = rf_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

DEĞİŞKEN ÖNEM DÜZEYİ

In [None]:
# Feature importances of rf_tuned as percentages, labelled by feature name.
Importance = pd.DataFrame(
    data={'Importance': rf_tuned.feature_importances_ * 100},
    index=x_train.columns,
)

# Horizontal bar chart of the importances, sorted ascending.
Importance.sort_values('Importance', ascending=True).plot.barh()

GRADIENT BOOSTING MACHINES(GBM)

MODEL - PREDICT

In [None]:
# Gradient boosting regressor with default settings.
gbm_model = GradientBoostingRegressor().fit(x_train, y_train)

In [None]:
# Test RMSE of the baseline GBM.
y_pred = gbm_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

MODEL TUNING

In [None]:
# Search grid for gradient boosting.
# The loss names are the current scikit-learn spellings: "squared_error" and
# "absolute_error" replace the aliases "ls" and "lad", which were deprecated
# in scikit-learn 1.0 and removed in 1.2 (where they make the search fail).
gbm_params = {"learning_rate": [0.001, 0.1, 0.01],
              "max_depth": [3, 5, 8],
              "n_estimators": [100, 200, 500],
              "subsample": [1, 0.5, 0.8],
              "loss": ["squared_error", "absolute_error", "quantile"]}

In [None]:
# Base estimator for the grid search.
# NOTE(review): the .fit() here looks redundant, since GridSearchCV fits its
# own clones of the estimator; confirm before removing.
gbm_model = GradientBoostingRegressor().fit(x_train, y_train)

In [None]:
# 10-fold cross-validated search over gbm_params, in parallel on all cores
# with progress output.
gbm_cv_model = GridSearchCV(gbm_model, gbm_params, cv=10, verbose=2, n_jobs=-1).fit(x_train, y_train)

In [None]:
# Hyperparameters selected by the search.
gbm_cv_model.best_params_

In [None]:
# Final GBM with fixed hyperparameters.
# "absolute_error" is the current name of the least-absolute-deviation loss;
# the old alias "lad" was removed in scikit-learn 1.2 and is rejected there.
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# search run; verify them against gbm_cv_model.best_params_.
gbm_tuned = GradientBoostingRegressor(learning_rate=0.1,
                                      loss="absolute_error",
                                      max_depth=3,
                                      n_estimators=200,
                                      subsample=1).fit(x_train, y_train)

In [None]:
# Test RMSE of the tuned GBM.
y_pred = gbm_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

XGBoost

GBM'in hız ve tahmin performansını artırmak için optimize edilmiş halidir.

In [None]:
import xgboost

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor with default settings.
xgb = XGBRegressor().fit(x_train, y_train)

In [None]:
# Test RMSE of the baseline XGBoost model.
y_pred = xgb.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Search grid for XGBoost. The key must be spelled exactly "learning_rate";
# a misspelled key is not the learning-rate parameter, so the search would
# never actually vary the learning rate.
xgb_params = {"learning_rate": [0.1, 0.01, 0.5],
              "max_depth": [2, 3, 4, 5, 8],
              "n_estimators": [100, 200, 500, 1000],
              "colsample_bytree": [0.4, 0.7, 1]}

In [None]:
# 10-fold cross-validated search over xgb_params, in parallel on all cores
# with progress output.
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv=10, verbose=2, n_jobs=-1).fit(x_train, y_train)

In [None]:
# Hyperparameters selected by the search.
xgb_cv_model.best_params_

In [None]:
# Final XGBoost model with fixed hyperparameters.
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# search run; verify them against xgb_cv_model.best_params_.
xgb_tuned = XGBRegressor(colsample_bytree=0.7,
                         learning_rate=0.5,
                         max_depth=2,
                         n_estimators=100).fit(x_train, y_train)

In [None]:
# Test RMSE of the tuned XGBoost model.
y_pred = xgb_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

LIGHT GBM
XGBoost'un eğitim süresi performansını artırmaya yönelik geliştirilen bir GBM türüdür.

In [1]:
from lightgbm import LGBMRegressor

In [2]:
# LightGBM regressor with default settings.
# NOTE(review): the saved output of this cell is a NameError for x_train, so it
# was apparently last run on a fresh kernel; re-run after the data cells.
lgbm_model = LGBMRegressor().fit(x_train, y_train)

NameError: name 'x_train' is not defined

In [165]:
# Test RMSE of the baseline LightGBM model.
y_pred = lgbm_model.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

363.8712087611089

In [None]:
# Search grid for LightGBM; max_depth covers every depth from 1 to 10.
lgbm_params = {
    "learning_rate": [0.01, 0.1, 0.5, 1],
    "n_estimators": [20, 40, 100, 200, 500, 1000],
    "max_depth": list(range(1, 11)),
}

In [None]:
# 10-fold cross-validated search over lgbm_params with progress output.
# n_jobs=-1 uses however many cores the machine has, matching the other
# searches in this notebook, instead of assuming exactly ten workers.
lgbm_cv_model = GridSearchCV(lgbm_model, lgbm_params, cv=10, n_jobs=-1, verbose=2).fit(x_train, y_train)

In [None]:
# Final LightGBM model with fixed hyperparameters.
# NOTE(review): the values are hard-coded, presumably copied from an earlier
# search run; verify them against lgbm_cv_model.best_params_.
lgbm_tuned = LGBMRegressor(learning_rate=0.1, max_depth=6, n_estimators=20).fit(x_train, y_train)

In [None]:
# Test RMSE of the tuned LightGBM model.
y_pred = lgbm_tuned.predict(x_test)
np.sqrt(mean_squared_error(y_test, y_pred))

CATEGORY BOOSTING(CATBOOST)

In [None]:
# Reload the data without missing rows and rebuild the dummy columns.
df = pd.read_csv("data/Hitters.csv").dropna()
dms = pd.get_dummies(df.loc[:, ['League', 'Division', 'NewLeague']])

In [7]:
def compML(df, y, alg):
    """Fit ``alg`` with default settings and print its test RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column and the categorical columns
        'League', 'Division' and 'NewLeague'. Every other column is cast to
        float64 and used as a predictor.
    y : str
        Name of the target column in ``df``.
    alg : type
        Estimator class. It is instantiated with no arguments and must
        provide ``fit`` and ``predict``.

    Returns
    -------
    None
        Prints the class name, the text "Modeli test hatası:" and the RMSE.
    """
    categorical_cols = ['League', 'Division', 'NewLeague']

    # Build the dummies from the frame that was passed in, so the function
    # does not rely on a notebook-level `dms` that may belong to another frame.
    dummies = pd.get_dummies(df[categorical_cols])

    # Train-test split. The target is dropped by the name given in `y`, so it
    # cannot stay among the predictors when the target is not 'Salary'.
    target = df[y]
    numeric = df.drop([y] + categorical_cols, axis=1).astype('float64')
    features = pd.concat([numeric, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)

    # Modelling
    model = alg().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    model_name = alg.__name__
    print(model_name, "Modeli test hatası:", RMSE)

In [14]:
# Fit LightGBM with default settings and print its test RMSE.
# NOTE(review): the saved output of this cell reports "SVR", so it comes from
# a different call; re-run the cell to refresh it.
compML(df, "Salary", LGBMRegressor)

SVR Modeli test hatası: 460.0032657244849
