EDA

In [None]:
# Gerekli kütüphaneler
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ML için
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Görsel ayarlar
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12,6)

from imblearn.over_sampling import SMOTE

In [None]:
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

df.head()

In [None]:
df.isnull().sum()

In [None]:
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols

In [None]:
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le  # hangi sınıf hangi label'a denk geliyor, bunu kaydeder


In [None]:
print(encoders["Gender"].classes_)


In [None]:
print(encoders["Attrition"].classes_)

In [None]:
attrition_counts = df['Attrition'].value_counts()
print(attrition_counts)
print("\nOranlar:")
print(attrition_counts / len(df))

# Görselleştirme
sns.countplot(data=df, x='Attrition')
plt.title('Attrition Sınıf Dağılımı')
plt.show()

In [None]:
sns.countplot(data=df, x='Attrition')
plt.title("Çalışanların İşten Ayrılma Durumu")

In [None]:
sns.countplot(x='Gender', hue='Attrition', data=df)
plt.title("Cinsiyete Göre İşten Ayrılma")
plt.show()

In [None]:
df['YearsAtCompany'].hist(bins=20)
plt.title("Şirkette Geçirilen Yılların Dağılımı")
plt.xlabel("Yıl")
plt.ylabel("Çalışan Sayısı")
plt.show()


In [None]:
sns.violinplot(x='Attrition', y='Age', data=df)
plt.title("Yaşa Göre İşten Ayrılma Yoğunluğu")
plt.show()


In [None]:
plt.figure(figsize=(16, 12))
sns.heatmap(df.corr(), cmap="coolwarm", annot=False)
plt.title("Değişkenler Arası Korelasyon")


TEST - TRAİN DATA


In [None]:
X = df.drop("Attrition" , axis = 1)
y= df["Attrition"]

In [None]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
y.head()

In [None]:
y_train.head()

In [None]:
X_train, X_val , y_train , y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("Final shapes:")
print(f"Train: {X_resampled.shape}")
print(f"Val: {X_val.shape}")
print(f"Test: {X_test.shape}")

In [None]:
# 1. Sınıf dağılımlarını karşılaştır
print("SMOTE ETKİSİ:")
print("=" * 30)

print("ÖNCE (Orijinal):")
original_counts = pd.Series(y_train).value_counts()
print(f"Kalmış (0): {original_counts[0]} kişi")
print(f"Ayrılmış (1): {original_counts[1]} kişi")
print(f"Oran: {original_counts[1]/original_counts[0]:.2f}")

print("\nSONRA (SMOTE):")
smote_counts = pd.Series(y_resampled).value_counts()
print(f"Kalmış (0): {smote_counts[0]} kişi")
print(f"Ayrılmış (1): {smote_counts[1]} kişi") 
print(f"Oran: {smote_counts[1]/smote_counts[0]:.2f}")

MODEL TRAİN (XGBOOST)

In [None]:
xgb_model = XGBClassifier(random_state = 42, eval_metric='logloss')

In [None]:
param_grid = {
    'n_estimators':[100, 200, 300], #ağaç sayısı
    'max_depth':[3, 5, 7], #ağaç derinliği
    'learning_rate': [0.01, 0.1, 0.2],#öğrenme oranı
    'subsample':[0.8, 0.9, 1.0], #alt örnekleme oranı
    'colsample_bytree':[0.8, 0.9, 1.0] #özellik alt örnekleme oranı 
}

print(f"Toplam test edilecek kombinasyon sayısı: {len(param_grid['n_estimators']) * len(param_grid['max_depth']) * len(param_grid['learning_rate']) * len(param_grid['subsample']) * len(param_grid['colsample_bytree'])}")

In [None]:
from sklearn.model_selection import GridSearchCV

scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring=scoring,
    refit='f1',  # En iyi modeli F1 skoruna göre seç
    n_jobs=-1,
    return_train_score=True
)

print("\nModel eğitimi başlıyor...")
grid_search.fit(X_resampled, y_resampled)


In [None]:
print("\nEN İYİ PARAMETRELER:")
print("-" * 30)
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

print(f"\nEn iyi CV skoru (ROC-AUC): {grid_search.best_score_:.4f}")

# 6. En iyi modeli al
best_model = grid_search.best_estimator_

# 7. Validation seti ile değerlendir
print("\nVALIDATION SETİ DEĞERLENDİRMESİ:")
print("-" * 35)

y_val_pred = best_model.predict(X_val)
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]

val_roc_auc = roc_auc_score(y_val, y_val_pred_proba)
val_accuracy = best_model.score(X_val, y_val)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation ROC-AUC: {val_roc_auc:.4f}")

# 8. Detaylı classification report
print("\nDetaylı Performans Raporu:")
print(classification_report(y_val, y_val_pred, 
                          target_names=['Kalmış', 'Ayrılmış']))

# 9. Confusion Matrix görselleştirme
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_val, y_val_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Kalmış', 'Ayrılmış'],
            yticklabels=['Kalmış', 'Ayrılmış'])
plt.title('Validation Set - Confusion Matrix')
plt.ylabel('Gerçek')
plt.xlabel('Tahmin')
plt.show()

# 10. Feature Importance
feature_importance = best_model.feature_importances_
feature_names = X_resampled.columns

# En önemli 15 özellik
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False).head(15)

plt.figure(figsize=(10, 8))
sns.barplot(data=importance_df, y='feature', x='importance')
plt.title('En Önemli 15 Özellik')
plt.xlabel('Önem Skoru')
plt.tight_layout()
plt.show()

print("\nEn Önemli 10 Özellik:")
print(importance_df.head(10).to_string(index=False))

# 11. Test seti ile final değerlendirme
print("\nTEST SETİ (FINAL) DEĞERLENDİRMESİ:")
print("-" * 40)

y_test_pred = best_model.predict(X_test)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

test_roc_auc = roc_auc_score(y_test, y_test_pred_proba)
test_accuracy = best_model.score(X_test, y_test)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test ROC-AUC: {test_roc_auc:.4f}")

print("\nFinal Test Raporu:")
print(classification_report(y_test, y_test_pred, 
                          target_names=['Kalmış', 'Ayrılmış']))

In [76]:
def make_results(model_name: str, model_object, metric: str):
    '''
    Arguments:
    model_name (string): what you want the model to be called in the output table
    model_object: a fit GridSearchCV object
    metric (string): precision, recall, f1, or accuracy 
     
    Returns a pandas df with the F1, recall, precision, and accuracy scores
    for the model with the best mean 'metric' score across all validation folds.
    '''
    
    # Create dictionary that maps input metric to actual metric name in GridSearchCV
    metric_dict = {'precision': 'mean_test_precision',
                  'recall': 'mean_test_recall',
                  'f1': 'mean_test_f1',
                  'accuracy': 'mean_test_accuracy',
                  'roc_auc': 'mean_test_roc_auc'
                  }
    
    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)
    
    # Isolate the row of the df with the max(metric) score
    best_estimator_results = cv_results.iloc[cv_results[metric_dict[metric]].idxmax(), :]
    
    # Extract all scores from that row
    f1 = best_estimator_results.mean_test_f1
    recall = best_estimator_results.mean_test_recall
    precision = best_estimator_results.mean_test_precision
    accuracy = best_estimator_results.mean_test_accuracy
    roc_auc = best_estimator_results.mean_test_roc_auc
    
    # Create table of results
    table = pd.DataFrame({'model': [model_name],
                         'precision': [precision],
                         'recall': [recall],
                         'F1': [f1],
                         'accuracy': [accuracy],
                         'roc_auc': [roc_auc]
                         })
    
    return table

In [77]:
def get_test_scores(model_name: str, preds, y_test_data):
    '''
    Generate a table of test scores.

    In:
    model_name (string): Your choice: how the model will be named in the output table
    preds: numpy array of test predictions
    y_test_data: numpy array of y_test data

    Out:
    table: a pandas df of precision, recall, f1, and accuracy scores for your model
    '''
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
    accuracy = accuracy_score(y_test_data, preds)
    precision = precision_score(y_test_data, preds)
    recall = recall_score(y_test_data, preds)
    f1 = f1_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                        'precision': [precision],
                        'recall': [recall],
                        'F1': [f1],
                        'accuracy': [accuracy]
                        })

    return table

In [78]:
cv_results = make_results('XGBoost + SMOTE (CV)', grid_search, 'roc_auc')

# Test seti sonuçları
test_results = get_test_scores('XGBoost', y_test_pred, y_test)

print("\nKARŞILAŞTIRMALI SONUÇLAR:")
print("=" * 50)
print("Cross-Validation Sonuçları:")
print(cv_results.round(4))
print("\nTest Seti Sonuçları:")
print(test_results.round(4))

# İki tabloyu birleştir
comparison_table = pd.concat([cv_results, test_results], ignore_index=True)
print("\nTÜM SONUÇLAR:")
print(comparison_table.round(4))
print("\n" + "="*50)
print("MODEL PERFORMANS ÖZETİ")
print("="*50)
print(f"En iyi CV ROC-AUC: {grid_search.best_score_:.4f}")
print(f"Validation ROC-AUC: {val_roc_auc:.4f}")
print(f"Test ROC-AUC: {test_roc_auc:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Overfitting kontrolü
if abs(grid_search.best_score_ - test_roc_auc) > 0.05:
    print("\n⚠️  Uyarı: Model overfitting yapıyor olabilir!")
else:
    print("\n✅ Model performansı tutarlı görünüyor.")


KARŞILAŞTIRMALI SONUÇLAR:
Cross-Validation Sonuçları:
                  model  precision  recall      F1  accuracy  roc_auc
0  XGBoost + SMOTE (CV)     0.9105  0.8875  0.8837     0.899   0.9721

Test Seti Sonuçları:
     model  precision  recall    F1  accuracy
0  XGBoost     0.4359  0.2787  0.34    0.8503

TÜM SONUÇLAR:
                  model  precision  recall      F1  accuracy  roc_auc
0  XGBoost + SMOTE (CV)     0.9105  0.8875  0.8837    0.8990   0.9721
1               XGBoost     0.4359  0.2787  0.3400    0.8503      NaN

MODEL PERFORMANS ÖZETİ
En iyi CV ROC-AUC: 0.8902
Validation ROC-AUC: 0.8115
Test ROC-AUC: 0.7943
Validation Accuracy: 0.8605
Test Accuracy: 0.8503

⚠️  Uyarı: Model overfitting yapıyor olabilir!


In [None]:
# Her CV fold'unda SMOTE'u ayrı ayrı uygula
# Validation kısmına SMOTE uygulama
# Böylece CV skoru daha gerçekçi olur