<a href="https://colab.research.google.com/github/UmutSonmezz/Data_Science/blob/main/CatBoost_with_optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost
!pip install optuna
!pip install dask[dataframe]

In [None]:
from google.colab import drive

# Mount Google Drive; the CSV files read later live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
import optuna
from optuna.exceptions import TrialPruned

# Load the training and test tables
train_data = pd.read_csv("/content/drive/MyDrive/Datathon/Duzenlenmis_Train .csv", low_memory=False)
test_data = pd.read_csv("/content/drive/MyDrive/Datathon/duzenlenmis_test - test_x (1).csv", low_memory=False)

# Separate the target from the features; 'id' is not used as a feature
X_test = test_data.drop(columns='id')
X_train = train_data.drop(columns=['Degerlendirme Puani', 'id'])
y_train = train_data['Degerlendirme Puani']

# Report how many training targets are missing, then drop those rows
nan_mask = y_train.isna()
nan_count = nan_mask.sum()
print(f"Hedef değişkende {nan_count} adet NaN değer bulundu.")
if nan_count > 0:
    keep_rows = ~nan_mask
    X_train = X_train[keep_rows]
    y_train = y_train[keep_rows]
    print("NaN değerlere sahip örnekler kaldırıldı.")

# Group the feature columns by dtype
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns: median imputation followed by standardisation
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Build the CatBoost modelling pipeline
def create_model(params=None):
    """Return an unfitted pipeline: preprocessing followed by a CatBoostRegressor.

    Args:
        params: Optional dict of CatBoostRegressor keyword arguments. Its
            entries override the defaults ``verbose=0`` and
            ``task_type='GPU'``.

    Returns:
        A ``Pipeline`` with the steps ``'preprocessor'`` and ``'model'``.
    """
    # Local import so this function does not depend on an edit to the
    # notebook's import cell.
    from sklearn.base import clone

    # Merge into a single dict so a caller may override a default such as
    # task_type; passing the defaults and **params as separate keywords raised
    # "got multiple values for keyword argument" whenever a name was repeated.
    model_kwargs = {'verbose': 0, 'task_type': 'GPU'}
    model_kwargs.update(params or {})

    return Pipeline(steps=[
        # Clone so every pipeline owns its preprocessor: fitting one pipeline
        # must not refit the transformer held by another.
        ('preprocessor', clone(preprocessor)),
        ('model', CatBoostRegressor(**model_kwargs)),
    ])

# Objective function
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of a CatBoost pipeline.

    Hyper-parameters are sampled from ``trial``; the model is evaluated on the
    module-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the per-fold mean squared errors (lower is better). Note that
        this is the MSE, not the RMSE.
    """
    params = {
        'depth': trial.suggest_int('depth', 1, 10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'iterations': trial.suggest_int('iterations', 50, 1000),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'task_type': 'CPU'  # disable GPU usage
    }

    # Evaluate with the same preprocessing that create_model() uses for the
    # final model. A bare StandardScaler cannot handle the object-dtype columns
    # and would tune against a different pipeline than the one that is trained.
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('model', CatBoostRegressor(**params, random_state=42, verbose=0))
    ])

    score = cross_val_score(
        model, X_train, y_train,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='neg_mean_squared_error',
        error_score='raise'  # surface fit errors instead of recording NaN scores
    )

    # The scorer yields negated MSE; flip the sign so the study can minimise it
    return -score.mean()

# Run the Optuna study once. (The study used to be executed twice, and the
# first run's fitted model and predictions were never used.)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)  # number of trials

# Train the final model with the best hyper-parameters.
# They are handed to create_model() rather than Pipeline.set_params(), which
# only accepts step-prefixed names ('model__depth', ...) and raises ValueError
# for bare ones such as 'depth'.
best_params = study.best_params
best_model = create_model(best_params)
best_model.fit(X_train, y_train)

# Predict on the test set
y_test_pred = best_model.predict(X_test)
y_pred = y_test_pred  # alias kept because this cell previously defined y_pred

# Save the predictions
submission = pd.DataFrame({'id': test_data['id'], 'Degerlendirme Puani': y_test_pred})
submission.to_csv('/content/drive/MyDrive/submission.csv', index=False)

# Print the results. The study minimises the mean cross-validated MSE, so the
# RMSE is its square root (negating best_value printed a negative MSE).
print("En iyi parametreler:", study.best_params)
print("En iyi RMSE değeri:", np.sqrt(study.best_value))

# Visualise the feature importances.
# The names come from the fitted preprocessor so they always line up with the
# columns the model actually received.
feature_importance = best_model.named_steps['model'].feature_importances_
feature_names = list(best_model.named_steps['preprocessor'].get_feature_names_out())

plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance, y=feature_names)
plt.title('Özellik Önemlilikleri')
plt.tight_layout()
plt.show()

In [None]:
# Show, in order: the best trial's details, its parameters, and its
# objective value.
for best_item in (study.best_trial, study.best_params, study.best_value):
    print(best_item)

6.02


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
import optuna

# Load the data
train_df = pd.read_csv("/content/drive/MyDrive/Datathon/Duzenlenmis_Train .csv", low_memory=False)
test_df = pd.read_csv("/content/drive/MyDrive/Datathon/duzenlenmis_test - test_x (1).csv", low_memory=False)

# Target and features.
# This cell used to read from an undefined `df` and a placeholder column
# 'hedef', which fails with NameError on a fresh kernel.
# NOTE(review): 'Degerlendirme Puani' is the target used for this CSV in the
# first modelling cell — confirm it is the intended target here as well.
TARGET_COL = 'Degerlendirme Puani'
# Rows without a target cannot be used for supervised training
df = train_df.dropna(subset=[TARGET_COL])
# NOTE(review): 'id' is still part of X here; the first modelling cell drops
# it from the features — consider doing the same.
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Detect the categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Label-encode the categorical columns. astype(str) turns missing values into
# the string 'nan', so they receive a label of their own.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Split into training and hold-out sets; copy so the fills below write to
# independent frames instead of slices of X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Detect numeric and (remaining) categorical columns
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Fill missing numeric values with the training-set median (also for the
# hold-out set, so no hold-out statistics are used)
train_medians = X_train[numeric_cols].median()
X_train[numeric_cols] = X_train[numeric_cols].fillna(train_medians)
X_test[numeric_cols] = X_test[numeric_cols].fillna(train_medians)

# Fill missing categorical values with the training-set mode. After the label
# encoding above no such columns remain, and `.mode().iloc[0]` on an empty
# selection raises IndexError, hence the guard.
if len(categorical_cols) > 0:
    train_modes = X_train[categorical_cols].mode().iloc[0]
    X_train[categorical_cols] = X_train[categorical_cols].fillna(train_modes)
    X_test[categorical_cols] = X_test[categorical_cols].fillna(train_modes)

# CatBoost objective
def objective(trial):
    """Optuna objective: negated mean 5-fold CV accuracy of a CatBoostClassifier.

    Hyper-parameters are sampled from ``trial`` and the classifier is scored on
    the module-level ``X_train`` / ``y_train``. The accuracy is negated because
    the study minimises its objective.
    """
    tuned = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    # Settings that are not tuned; use 'CPU' for task_type when no GPU is available
    fixed = {'random_seed': 42, 'task_type': 'GPU'}

    classifier = CatBoostClassifier(verbose=0, **tuned, **fixed)

    # K-fold cross-validation
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(classifier, X_train, y_train, cv=folds, scoring='accuracy')

    mean_accuracy = fold_accuracy.mean()
    return -mean_accuracy

# Run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

# Train the final classifier with the best hyper-parameters
# (task_type: 'GPU' when a GPU is available, otherwise 'CPU')
best_params = study.best_params
best_model = CatBoostClassifier(random_seed=42, task_type='GPU', **best_params)
best_model.fit(X_train, y_train)

# Model performance on the hold-out set
accuracy = best_model.score(X_test, y_test)
print(f"Test seti doğruluk oranı: {accuracy}")


**6.17**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import re

# ... (the preceding code is unchanged)

# Settings common to the per-fold models and the final model; `depth` is the
# only value that differs and is supplied where each model is constructed.
shared_params = {
    'iterations': 3000,
    'learning_rate': 0.05,
    'l2_leaf_reg': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'early_stopping_rounds': 100,
    'use_best_model': True,
}

# K-fold cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
rmse_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    print(f"Fold {fold}")
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = CatBoostRegressor(depth=4, **shared_params)
    # The held-out fold is also the eval set that drives early stopping
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"RMSE for fold {fold}: {rmse}")

mean_rmse = np.mean(rmse_scores)
rmse_spread = np.std(rmse_scores) * 2
print(f"\nMean RMSE: {mean_rmse:.4f} (+/- {rmse_spread:.4f})")

# Final model training: 20% of the data is held out as the early-stopping eval set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

final_model = CatBoostRegressor(depth=8, **shared_params)
final_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

# Predict on the test set
y_pred = final_model.predict(X_test)

# Visualise the feature importances, most important first
feature_importance = final_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values('importance', ascending=False)

top_features = feature_importance_df.head(20)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=top_features, ax=ax)
ax.set_title('Top 20 Most Important Features')
fig.tight_layout()
plt.show()

# Assemble the predictions for saving
submission = pd.DataFrame({'id': test_data['id'], 'Degerlendirme_Puani': y_pred})

# Render each prediction as a string with two decimal places
submission['Degerlendirme_Puani'] = submission['Degerlendirme_Puani'].map('{:.2f}'.format)

output_path = 'improved_catboost_predictions.csv'
submission.to_csv(output_path, index=False)

print("Tahminler 'improved_catboost_predictions.csv' dosyasına kaydedildi.")

**OPTUNA İLE DENEME**

In [None]:
pip install optuna
!pip install catboost
!pip install optuna
!pip install dask[dataframe]

In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# CatBoost hyper-parameter optimisation with Optuna
def objective(trial):
    """Optuna objective: mean RMSE of a CatBoostRegressor over 5 shuffled folds.

    Hyper-parameters are sampled from ``trial``; the data comes from the
    module-level ``X`` and ``y``. In every fold the validation part serves both
    as the early-stopping eval set and as the data the RMSE is computed on.
    """
    # Hyper-parameters that Optuna tunes
    tuned = {
        'iterations': trial.suggest_int('iterations', 1000, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10),
    }
    # Settings held constant across trials
    fixed = {
        'random_seed': 42,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'early_stopping_rounds': 100,
        'use_best_model': True,
    }
    params = {**tuned, **fixed}

    # K-fold cross-validation
    n_splits = 5
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmse_scores = []

    for fit_idx, holdout_idx in splitter.split(X):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

        regressor = CatBoostRegressor(**params)
        regressor.fit(X_fit, y_fit, eval_set=(X_holdout, y_holdout), verbose=False)

        holdout_pred = regressor.predict(X_holdout)
        rmse_scores.append(np.sqrt(mean_squared_error(y_holdout, holdout_pred)))

    return np.mean(rmse_scores)

# Run the Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best hyper-parameters
best_params = study.best_params
print("En iyi parametreler:", best_params)

# Final model training: 20% of the data is held out as the early-stopping eval set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Settings that are not tuned by the study
fixed_params = {
    'random_seed': 42,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 100,
    'use_best_model': True,
}
final_model = CatBoostRegressor(**best_params, **fixed_params)
final_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

# Predict on the test set
y_pred = final_model.predict(X_test)

# Visualise the feature importances, most important first
feature_importance = final_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values('importance', ascending=False)

top_features = feature_importance_df.head(20)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=top_features, ax=ax)
ax.set_title('Top 20 Most Important Features')
fig.tight_layout()
plt.show()

# Assemble the predictions for saving
submission = pd.DataFrame({'id': test_data['id'], 'Degerlendirme_Puani': y_pred})

# Render each prediction as a string with two decimal places
submission['Degerlendirme_Puani'] = submission['Degerlendirme_Puani'].map('{:.2f}'.format)

output_path = 'improved_catboost_predictions.csv'
submission.to_csv(output_path, index=False)

print("Tahminler 'improved_catboost_predictions.csv' dosyasına kaydedildi.")