
Bu notebook'ta 3 farklı öznitelik seçimi yöntemi uygulanarak en önemli özniteliklerin belirlenmesi hedeflenmektedir:

1. **Filtre Yöntemi (Filter Method)** - SelectKBest
2. **Sarmalama Yöntemi (Wrapper Method)** - RFE (Recursive Feature Elimination)
3. **Gömülü Yöntem (Embedded Method)** - Random Forest Feature Importances

### 1. Kutuphanelerin Import Edilmesi

In [1]:
import pandas as pd
import numpy as np  
import plotly.express as px
from sklearn.feature_selection import SelectKBest , f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import time
import pickle

print("kutuphaneler yuklendi")

kutuphaneler yuklendi


### 2. İşlenmiş Veriyi Yükleme

In [2]:
X_train = pd.read_csv('../data/processed/X_train.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').values.ravel()
y_test = pd.read_csv('../data/processed/y_test.csv').values.ravel()

print(f"Train Set: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test Set: X_test={X_test.shape}, y_test={y_test.shape}")
print(f"\nÖznitelik Sayısı: {X_train.shape[1]}")
print(f"\nÖznitelikler:")
print(list(X_train.columns))
print(f"hedef Öznitelik sınıfları: {np.unique(y_train)}")

Train Set: X_train=(800, 18), y_train=(800,)
Test Set: X_test=(200, 18), y_test=(200,)

Öznitelik Sayısı: 18

Öznitelikler:
['Age', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration', 'Sex_male', 'Job_1', 'Job_2', 'Job_3', 'Housing_own', 'Housing_rent', 'Purpose_car', 'Purpose_domestic appliances', 'Purpose_education', 'Purpose_furniture/equipment', 'Purpose_radio/TV', 'Purpose_repairs', 'Purpose_vacation/others']
hedef Öznitelik sınıfları: [0 1]


### 3. Filtre Yöntemi - SelectKBest


In [3]:
# SelectKBest ile en iyi k özniteliği seç

# Farklı k değerleri için skorları hesapla
k_values = [5, 10, 15]  
scores_list = []

for k in k_values:  
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    
    # RandomForestClassifier ile performans değerlendirmesi
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5, scoring='roc_auc')
    
    scores_list.append({
        'k': k,
        'mean_auc': cv_scores.mean(),
        'std_auc': cv_scores.std()
    })
    
    print(f"k={k:2d} öznitelik -> ROC-AUC: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# Skorları DataFrame'e çevir
scores_df = pd.DataFrame(scores_list)
print("\n" + "="*50)
print(f"En iyi k değeri: {scores_df.loc[scores_df['mean_auc'].idxmax(), 'k']}")

k= 5 öznitelik -> ROC-AUC: 0.6794 (±0.0395)
k=10 öznitelik -> ROC-AUC: 0.7213 (±0.0392)
k=15 öznitelik -> ROC-AUC: 0.7143 (±0.0187)

En iyi k değeri: 10


In [4]:
# En iyi k değerini kullanarak final seçim
best_k = int(scores_df.loc[scores_df['mean_auc'].idxmax(), 'k'])

selector_filter = SelectKBest(score_func=f_classif, k=best_k)
X_train_filter = selector_filter.fit_transform(X_train, y_train)
X_test_filter = selector_filter.transform(X_test)

# Seçilen öznitelikleri al
selected_features_filter = X_train.columns[selector_filter.get_support()].tolist()

print(f"Filtre Yöntemi ile Seçilen {len(selected_features_filter)} Öznitelik:")
print("="*60)
for i, feature in enumerate(selected_features_filter, 1):
    score = selector_filter.scores_[X_train.columns.get_loc(feature)]
    print(f"{i:2d}. {feature:30s} (F-score: {score:.2f})")

Filtre Yöntemi ile Seçilen 10 Öznitelik:
 1. Age                            (F-score: 2.67)
 2. Checking account               (F-score: 35.02)
 3. Credit amount                  (F-score: 15.67)
 4. Duration                       (F-score: 35.47)
 5. Sex_male                       (F-score: 5.70)
 6. Job_3                          (F-score: 1.46)
 7. Housing_own                    (F-score: 15.63)
 8. Housing_rent                   (F-score: 7.74)
 9. Purpose_education              (F-score: 6.34)
10. Purpose_radio/TV               (F-score: 10.48)


In [5]:
# F-skorlarını görselleştir 
feature_scores = pd.DataFrame({
    'Feature': X_train.columns,
    'F_Score': selector_filter.scores_
}).sort_values('F_Score', ascending=False).head(best_k)

fig = px.bar(feature_scores, 
             x='F_Score', 
             y='Feature',
             orientation='h',
             title=f'Filtre Yöntemi - En Yüksek {best_k} F-Skorlu Öznitelik',
             labels={'F_Score': 'F-Score', 'Feature': 'Öznitelik'},
             color='F_Score',
             color_continuous_scale='Blues',
             height=500)

fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

### 4. Sarmalama Yontemi (Wrapper Method) - RFE

In [6]:
# RFE ile öznitelik seçimi
# Estimator olarak LogisticRegression kullanıyorum (hızlı ve etkili)

print("RFE ile farklı öznitelik sayıları deneniyor...")
print("="*60)

rfe_scores = []
feature_counts = [5, 10, 15]

for n_features in feature_counts:
    start_time = time.time()
    
    estimator = LogisticRegression(max_iter=1000, random_state=42)
    rfe = RFE(estimator=estimator, n_features_to_select=n_features)
    
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    
    # Model performansı
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    cv_scores = cross_val_score(model, X_train_rfe, y_train, cv=5, scoring='roc_auc')
    
    elapsed_time = time.time() - start_time
    
    rfe_scores.append({
        'n_features': n_features,
        'mean_auc': cv_scores.mean(),
        'std_auc': cv_scores.std(),
        'time': elapsed_time
    })
    
    print(f"n={n_features:2d} öznitelik -> ROC-AUC: {cv_scores.mean():.4f} (±{cv_scores.std():.4f}) [{elapsed_time:.2f}s]")

rfe_scores_df = pd.DataFrame(rfe_scores)
print("\n" + "="*60)
print(f"En iyi öznitelik sayısı: {rfe_scores_df.loc[rfe_scores_df['mean_auc'].idxmax(), 'n_features']}")

RFE ile farklı öznitelik sayıları deneniyor...
n= 5 öznitelik -> ROC-AUC: 0.6761 (±0.0407) [0.60s]
n=10 öznitelik -> ROC-AUC: 0.6535 (±0.0549) [0.57s]
n=15 öznitelik -> ROC-AUC: 0.7021 (±0.0382) [0.61s]

En iyi öznitelik sayısı: 15


In [7]:
# En iyi öznitelik sayısı ile final RFE
best_n_features_rfe = int(rfe_scores_df.loc[rfe_scores_df['mean_auc'].idxmax(), 'n_features'])

estimator = LogisticRegression(max_iter=1000, random_state=42)
selector_rfe = RFE(estimator=estimator, n_features_to_select=best_n_features_rfe)
X_train_rfe = selector_rfe.fit_transform(X_train, y_train)
X_test_rfe = selector_rfe.transform(X_test)

# Seçilen öznitelikleri al
selected_features_rfe = X_train.columns[selector_rfe.get_support()].tolist()

print(f"RFE ile Seçilen {len(selected_features_rfe)} Öznitelik:")
print("="*60)
for i, feature in enumerate(selected_features_rfe, 1):
    ranking = selector_rfe.ranking_[X_train.columns.get_loc(feature)]
    print(f"{i:2d}. {feature:30s} (Rank: {ranking})")

RFE ile Seçilen 15 Öznitelik:
 1. Age                            (Rank: 1)
 2. Saving accounts                (Rank: 1)
 3. Checking account               (Rank: 1)
 4. Duration                       (Rank: 1)
 5. Sex_male                       (Rank: 1)
 6. Job_1                          (Rank: 1)
 7. Housing_own                    (Rank: 1)
 8. Housing_rent                   (Rank: 1)
 9. Purpose_car                    (Rank: 1)
10. Purpose_domestic appliances    (Rank: 1)
11. Purpose_education              (Rank: 1)
12. Purpose_furniture/equipment    (Rank: 1)
13. Purpose_radio/TV               (Rank: 1)
14. Purpose_repairs                (Rank: 1)
15. Purpose_vacation/others        (Rank: 1)


In [8]:
# RFE ile seçilen ve elenen özniteliklerin görselleştirilmesi
rfe_all_features = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': selector_rfe.ranking_,
    'Selected': selector_rfe.get_support()
})

top_15 = rfe_all_features[rfe_all_features['Selected'] == True].nsmallest(15, 'Ranking')
bottom_3 = rfe_all_features[rfe_all_features['Selected'] == False].nlargest(3, 'Ranking')

combined_features = pd.concat([top_15, bottom_3])
combined_features['Status'] = combined_features['Selected'].map({True: 'Seçilen', False: 'Elenen'})
combined_features = combined_features.sort_values('Ranking')

fig = px.bar(combined_features, 
             x='Ranking', 
             y='Feature',
             orientation='h',
             title='RFE - Seçilen En İyi 15 Öznitelik ve Elenen 3 Öznitelik',
             labels={'Ranking': 'RFE Ranking (Düşük = İyi)', 'Feature': 'Öznitelik'},
             color='Status',
             color_discrete_map={'Seçilen': '#2ecc71', 'Elenen': '#e74c3c'},
             height=600)

fig.update_layout(
    yaxis={'categoryorder': 'total descending'},
    showlegend=True,
    legend=dict(title='Durum', orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
)

fig.show()

### 5. Gömülü Yöntem - Random Forest Feature Importances

In [None]:
# Random Forest ile feature importances hesapliyorum
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Feature importances'ları al
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("Random Forest - Feature Importances:")
print("="*60)
print(feature_importances.head(15))

Random Forest - Feature Importances:
                        Feature  Importance
3                 Credit amount    0.235396
0                           Age    0.175574
4                      Duration    0.151245
2              Checking account    0.132653
1               Saving accounts    0.071202
5                      Sex_male    0.031287
11                  Purpose_car    0.026261
9                   Housing_own    0.023998
7                         Job_2    0.023890
15             Purpose_radio/TV    0.022590
14  Purpose_furniture/equipment    0.021034
6                         Job_1    0.019749
13            Purpose_education    0.017960
8                         Job_3    0.017718
10                 Housing_rent    0.017109


In [10]:
#feature importanceslarını görselleştir
fig = px.bar(feature_importances.head(15), 
             x='Importance', 
             y='Feature',
             orientation='h',
             title='Random Forest - En Önemli 15 Öznitelik',
             labels={'Importance': 'Önem Derecesi', 'Feature': 'Öznitelik'},
             color='Importance',
             color_continuous_scale='Viridis',
             height=500)

fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

In [11]:

# Ilk 5 öznitelikten sonra onem derecesi cok dusuyor bu yuzden ilk 5 ozniteligi seciliyorum
selected_features_embedded = feature_importances['Feature'].head(5).tolist()

print("Gömülü Yöntem ile Seçilen 5 Öznitelik:")
print(feature_importances.head(5))

Gömülü Yöntem ile Seçilen 5 Öznitelik:
            Feature  Importance
3     Credit amount    0.235396
0               Age    0.175574
4          Duration    0.151245
2  Checking account    0.132653
1   Saving accounts    0.071202


### 6. Yöntemlerin Karşılaştırılması

In [12]:
# seçilen öznitelikleri karşılaştır
print("Seçilen Öznitelik Sayıları:")
print("="*60)
print(f"Filtre Yöntemi (SelectKBest):     {len(selected_features_filter)} öznitelik")
print(f"Sarmalama Yöntemi (RFE):          {len(selected_features_rfe)} öznitelik")
print(f"Gömülü Yöntem (Random Forest):   {len(selected_features_embedded)} öznitelik")

# Ortak ve farklı öznitelikleri bul
common_all = set(selected_features_filter) & set(selected_features_rfe) & set(selected_features_embedded)
common_filter_rfe = set(selected_features_filter) & set(selected_features_rfe)
common_filter_embedded = set(selected_features_filter) & set(selected_features_embedded)
common_rfe_embedded = set(selected_features_rfe) & set(selected_features_embedded)

print("\n" + "="*60)
print(f"3 Yöntemde de Ortak: {len(common_all)} öznitelik")
print(f"Filtre & RFE: {len(common_filter_rfe)} öznitelik")
print(f"Filtre & Embedded: {len(common_filter_embedded)} öznitelik")
print(f"RFE & Embedded: {len(common_rfe_embedded)} öznitelik")

if len(common_all) > 0:
    print(f"\n3 Yöntemde de Seçilen Öznitelikler:")
    for feature in sorted(common_all):
        print(f"  - {feature}")

Seçilen Öznitelik Sayıları:
Filtre Yöntemi (SelectKBest):     10 öznitelik
Sarmalama Yöntemi (RFE):          15 öznitelik
Gömülü Yöntem (Random Forest):   5 öznitelik

3 Yöntemde de Ortak: 3 öznitelik
Filtre & RFE: 8 öznitelik
Filtre & Embedded: 4 öznitelik
RFE & Embedded: 4 öznitelik

3 Yöntemde de Seçilen Öznitelikler:
  - Age
  - Checking account
  - Duration


### 7. Seçilmiş Veri Setlerinin Kaydedilmesi

In [13]:
# Seçilen öznitelikleri dictionary olarak sakla
selected_features = {
    'filter': selected_features_filter,
    'rfe': selected_features_rfe,
    'embedded': selected_features_embedded,
    'all_features': list(X_train.columns)
}

# Pickle ile kaydet
with open('../data/processed/selected_features.pkl', 'wb') as f:
    pickle.dump(selected_features, f)

print("Seçilen öznitelikler kaydedildi!")
print("\nKaydedilen dosya:")
print("  - ../data/processed/selected_features.pkl")

print("\n" + "="*60)
print("ÖZET:")
print(f"  Filter Method: {len(selected_features_filter)} öznitelik")
print(f"  RFE Method: {len(selected_features_rfe)} öznitelik")
print(f"  Embedded Method: {len(selected_features_embedded)} öznitelik")
print(f"  Tüm Öznitelikler: {len(selected_features['all_features'])} öznitelik")

Seçilen öznitelikler kaydedildi!

Kaydedilen dosya:
  - ../data/processed/selected_features.pkl

ÖZET:
  Filter Method: 10 öznitelik
  RFE Method: 15 öznitelik
  Embedded Method: 5 öznitelik
  Tüm Öznitelikler: 18 öznitelik
