<a href="https://colab.research.google.com/github/UmutSonmezz/Data_Science/blob/main/DATATHON_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages used below that may be missing from the runtime.
!pip install catboost
!pip install optuna
# NOTE(review): dask is not referenced anywhere in the visible code — presumably
# an optional dependency of another package; confirm it is still needed.
!pip install dask[dataframe]

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import re
from catboost import CatBoostRegressor
import optuna

# Helper that sanitises feature (column) names
def clean_feature_names(df):
    """Return a copy of *df* whose column labels contain only ``[A-Za-z0-9_]``.

    Every run of other characters (spaces, punctuation, non-ASCII letters)
    is collapsed into a single underscore.

    Args:
        df: DataFrame whose column labels should be sanitised.

    Returns:
        A new DataFrame with renamed columns; *df* itself is not modified.
    """
    invalid_run = re.compile(r'[^A-Za-z0-9_]+')
    # str() lets non-string labels (e.g. integer column names) pass through
    # instead of raising TypeError inside re.sub.
    return df.rename(columns=lambda label: invalid_run.sub('_', str(label)))

# Load the data sets
train_data = pd.read_csv("/content/drive/MyDrive/train.csv", low_memory=False)
test_data = pd.read_csv("/content/drive/MyDrive/test_x.csv", low_memory=False)

# Sanitise the feature names
train_data = clean_feature_names(train_data)
test_data = clean_feature_names(test_data)

# Separate the target from the features
y_train = train_data['Degerlendirme_Puani']
X_train = train_data.drop(['Degerlendirme_Puani', 'id'], axis=1)
X_test = test_data.drop('id', axis=1)

# Drop training rows whose target is missing
nan_mask = y_train.isna()
print(f"Hedef değişkende {nan_mask.sum()} adet NaN değer bulundu.")
if nan_mask.any():
    # Take explicit copies: the feature frame gets new columns assigned
    # later on, and a mask-selected frame would otherwise trigger
    # SettingWithCopyWarning on those assignments.
    X_train = X_train[~nan_mask].copy()
    y_train = y_train[~nan_mask].copy()
    print("NaN değerlere sahip örnekler kaldırıldı.")

# Check and repair column dtypes
def fix_data_types(df):
    """Convert object columns that hold only numeric values to a numeric dtype.

    Columns whose values cannot all be parsed as numbers are left unchanged.

    Args:
        df: DataFrame to fix. It is modified in place.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        try:
            converted = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # ValueError: unparseable strings; TypeError: non-scalar cells
            # such as lists. Either way the column is not numeric.
            continue
        df[col] = converted
    return df

X_train = fix_data_types(X_train)
X_test = fix_data_types(X_test)

# Parse the date columns and derive calendar features from them
date_columns = ['Dogum_Tarihi']
for col in date_columns:
    # Unparseable dates become NaT (errors='coerce'); day comes first.
    X_train[col] = pd.to_datetime(X_train[col], errors='coerce', dayfirst=True)
    X_test[col] = pd.to_datetime(X_test[col], errors='coerce', dayfirst=True)

    for df in [X_train, X_test]:
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_day'] = df[col].dt.day
        df[f'{col}_dayofweek'] = df[col].dt.dayofweek
        df[f'{col}_quarter'] = df[col].dt.quarter
        # Stored as 0/1 rather than bool: a bool column matches neither the
        # numeric nor the object selection below and would be dropped.
        df[f'{col}_is_weekend'] = (df[col].dt.dayofweek >= 5).astype(int)

    # The raw datetime column cannot be fed to the models.
    X_train = X_train.drop(col, axis=1)
    X_test = X_test.drop(col, axis=1)

# Determine the categorical and numeric feature columns
categorical_features = X_train.select_dtypes(include=['object']).columns
# 'number' covers every integer/float width; the `.dt` accessors return int32
# on pandas >= 2.0, which an explicit ['int64', 'float64'] list would miss.
numeric_features = X_train.select_dtypes(include=['number']).columns

# Preprocessing: numeric columns are median-imputed and standardised,
# categorical columns are mode-imputed and one-hot encoded.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all-zero rows.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns listed in neither group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Model construction
def create_model(model_type='lightgbm'):
    """Build a preprocessing + regressor pipeline for the given model family.

    Args:
        model_type: One of ``'lightgbm'``, ``'xgboost'``, ``'random_forest'``
            or ``'catboost'``.

    Returns:
        A ``Pipeline`` with the steps ``'preprocessor'`` and ``'model'``.

    Raises:
        ValueError: If *model_type* is not one of the supported names.
    """
    # Imported locally so the block does not depend on the file header.
    from sklearn.base import clone

    # Regressor class plus any constructor arguments beyond random_state.
    estimators = {
        'lightgbm': (LGBMRegressor, {}),
        'xgboost': (XGBRegressor, {}),
        'random_forest': (RandomForestRegressor, {}),
        'catboost': (CatBoostRegressor, {'verbose': 0}),
    }
    try:
        estimator_cls, extra_kwargs = estimators[model_type]
    except (KeyError, TypeError):
        raise ValueError("Model type not recognized.") from None

    model = estimator_cls(random_state=42, **extra_kwargs)

    # Each pipeline gets its own copy of the preprocessor so that fitting one
    # pipeline never refits a transformer that another pipeline relies on.
    return Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('model', model)])

# One untuned pipeline per supported model family, keyed by family name.
models = {
    name: create_model(name)
    for name in ('lightgbm', 'xgboost', 'random_forest', 'catboost')
}

# Hyperparameter grids, flattened to '<family>__model__<parameter>' keys.
# NOTE(review): param_dist is not referenced anywhere else in the visible code.
param_dist = {
    f'{family}__model__{parameter}': values
    for family, grid in {
        'lightgbm': {
            'num_leaves': [31, 127, 255],
            'max_depth': [-1, 5, 10, 20],
            'learning_rate': [0.01, 0.05, 0.1],
        },
        'xgboost': {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.05, 0.1],
        },
        'random_forest': {
            'n_estimators': [100, 200, 300],
        },
        'catboost': {
            'depth': [4, 6, 8],
            'learning_rate': [0.01, 0.05, 0.1],
        },
    }.items()
    for parameter, values in grid.items()
}

# Hyperparameter optimisation with Optuna
def objective(trial):
    """Optuna objective: pick a model family, tune it and return its CV RMSE.

    Args:
        trial: The Optuna trial used to sample the model family and its
            hyperparameters.

    Returns:
        Root-mean-squared error averaged over a shuffled 5-fold split of
        ``X_train`` / ``y_train`` (lower is better).
    """
    # Imported locally so the block does not depend on the file header.
    from sklearn.base import clone

    model_name = trial.suggest_categorical('model_name', list(models.keys()))

    def tuned(param):
        # Namespace each Optuna parameter by model family. Reusing one name
        # (e.g. 'max_depth') with different ranges across families makes the
        # sampler mix observations from unrelated search spaces.
        return f'{model_name}_{param}'

    params = {}
    if model_name == 'lightgbm':
        params = {
            'model__num_leaves': trial.suggest_int(tuned('num_leaves'), 31, 255),
            'model__max_depth': trial.suggest_int(tuned('max_depth'), 5, 20),
            'model__learning_rate': trial.suggest_float(tuned('learning_rate'), 0.01, 0.1, log=True)
        }
    elif model_name == 'xgboost':
        params = {
            'model__max_depth': trial.suggest_int(tuned('max_depth'), 3, 7),
            'model__learning_rate': trial.suggest_float(tuned('learning_rate'), 0.01, 0.1, log=True)
        }
    elif model_name == 'random_forest':
        params = {
            'model__n_estimators': trial.suggest_int(tuned('n_estimators'), 100, 300)
        }
    elif model_name == 'catboost':
        params = {
            'model__depth': trial.suggest_int(tuned('depth'), 4, 8),
            'model__learning_rate': trial.suggest_float(tuned('learning_rate'), 0.01, 0.1, log=True)
        }

    # Tune a private copy so the shared pipelines in `models` are not left
    # carrying the settings of whichever trial happened to run last.
    model = clone(models[model_name]).set_params(**params)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv,
                             scoring='neg_root_mean_squared_error')

    # The scorer returns negated RMSE; flip the sign so the study minimises RMSE.
    return -scores.mean()

# Search for the best model with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Best model: translate the trial parameters into pipeline parameters.
# The trial dict holds the 'model_name' choice plus estimator hyperparameters,
# while the pipeline expects only 'model__<parameter>' keys; passing the trial
# dict straight to set_params raises "Invalid parameter".
best_params = dict(study.best_trial.params)
best_model_name = best_params.pop('model_name')
name_prefix = f'{best_model_name}_'
pipeline_params = {}
for param_name, value in best_params.items():
    # Accept both bare names ('max_depth') and names namespaced by the model
    # family ('<model_name>_max_depth').
    if param_name.startswith(name_prefix):
        param_name = param_name[len(name_prefix):]
    pipeline_params[f'model__{param_name}'] = value
best_model = models[best_model_name].set_params(**pipeline_params)

# Train the model on the full training set
best_model.fit(X_train, y_train)

# Predict on the test set
y_test_pred = best_model.predict(X_test)

# Save the predictions
submission = pd.DataFrame({'id': test_data['id'], 'Degerlendirme_Puani': y_test_pred})
submission.to_csv('/content/drive/MyDrive/submission.csv', index=False)

# Extract and plot the feature importances
fitted_regressor = best_model.named_steps['model']
if hasattr(fitted_regressor, 'feature_importances_'):
    importances = np.asarray(fitted_regressor.feature_importances_)
    # The regressor sees the preprocessor's output (scaled numeric columns
    # followed by one-hot columns), not the raw frame, so the labels must come
    # from the fitted preprocessor rather than from X_train.columns.
    features = best_model.named_steps['preprocessor'].get_feature_names_out()
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(10, 6))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    # Most important feature at the top.
    plt.gca().invert_yaxis()
    plt.show()

print("En iyi model:", best_model_name)
print("Optuna ile bulunan en iyi hiperparametreler:", study.best_trial.params)
