# Feature Engineering - Proje: Kapsamlı Feature Engineering Pipeline

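Bu proje, `make_classification` ile üretilen sentetik bir örnek veri seti üzerinde kapsamlı feature engineering tekniklerini (ölçekleme, feature selection ve model karşılaştırması) uygular. Gerçek bir projede aynı adımları kendi veri setinize uygulayabilirsiniz.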


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, RFE, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import make_classification

# Notebook-wide plotting defaults; set once here so every later figure shares them.
# Select matplotlib's 'seaborn-v0_8' style sheet for all subsequent figures.
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline


## 1. Veri Hazırlama ve Keşif


In [None]:
# Synthetic example data set (in a real project, load your own data here instead).
X, y = make_classification(
    n_samples=2000,
    n_features=30,
    n_informative=20,
    n_redundant=10,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame with 1-based column names
# (Feature_1 ... Feature_30) and append the labels as the 'Target' column.
feature_names = list(map('Feature_{}'.format, range(1, 31)))
df = pd.DataFrame(X, columns=feature_names).assign(Target=y)

# Quick sanity summary: shape, total number of missing cells, class balance.
print(f"Veri seti boyutu: {df.shape}")
print(f"\nEksik değerler: {df.isna().sum().sum()}")
print(f"\nSınıf dağılımı:\n{df['Target'].value_counts()}")

# Last expression of the cell: rich display of the first rows.
df.head()


## 2. Feature Engineering Adımları


In [None]:
# Separate the label column from the predictors.
y = df['Target']
X = df.drop(columns='Target')

# Stratified 80/20 hold-out split (stratify=y keeps the class ratio in both parts).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training split only and the
# learned statistics are then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Eğitim seti boyutu: {X_train_scaled.shape}")
print(f"Test seti boyutu: {X_test_scaled.shape}")


## 3. Feature Selection ve Model Karşılaştırması


In [None]:
def _seeded_mutual_info(X, y):
    """Score features by mutual information with a fixed random seed.

    ``mutual_info_classif`` accepts a ``random_state``; leaving it unset lets
    the scores (and therefore the selected features) vary between runs.
    Pinning the seed keeps the "Mutual Info" row reproducible.
    """
    return mutual_info_classif(X, y, random_state=42)


# Feature selection strategies to compare. ``None`` is the baseline that keeps
# every feature; the other entries each reduce the input to 15 features.
selection_methods = {
    'All Features': None,
    'SelectKBest (k=15)': SelectKBest(score_func=f_classif, k=15),
    'RFE (15 features)': RFE(estimator=RandomForestClassifier(n_estimators=50, random_state=42), n_features_to_select=15),
    'Mutual Info (k=15)': SelectKBest(score_func=_seeded_mutual_info, k=15)
}

results = {}
for method_name, selector in selection_methods.items():
    if selector is None:
        # Baseline: use the scaled features unchanged.
        X_train_final = X_train_scaled
        X_test_final = X_test_scaled
    else:
        # Fit the selector on the training split only, then apply the same
        # column subset to the test split.
        X_train_final = selector.fit_transform(X_train_scaled, y_train)
        X_test_final = selector.transform(X_test_scaled)

    # Train the same classifier for every strategy so that only the feature
    # subset differs between rows of the comparison.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_final, y_train)

    # Evaluate on the hold-out split; ROC-AUC uses the positive-class probability.
    y_pred = model.predict(X_test_final)
    y_pred_proba = model.predict_proba(X_test_final)[:, 1]

    results[method_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred_proba),
        'Num Features': X_train_final.shape[1]
    }

# One row per method. Building the frame row-wise (orient='index') keeps a
# separate dtype per column, so 'Num Features' stays an integer instead of
# being upcast to float alongside the metric values.
results_df = pd.DataFrame.from_dict(results, orient='index')
print("Feature Selection Yöntemlerinin Karşılaştırması:")
print(results_df.round(4))


## 4. Sonuçlar ve Öneriler


In [None]:
# Pick the selection strategy with the highest ROC-AUC on the hold-out split.
best_method = results_df['ROC-AUC'].idxmax()
best_scores = results_df.loc[best_method]
separator = "=" * 50

print(separator)
print("EN İYİ YÖNTEM")
print(separator)
print(f"Yöntem: {best_method}")
print(f"ROC-AUC: {best_scores['ROC-AUC']:.4f}")
print(f"Accuracy: {best_scores['Accuracy']:.4f}")
# The feature count may come back as a float because it shares a row with the
# metric values; cast to int so it prints as "15" rather than "15.0".
print(f"Feature Sayısı: {int(best_scores['Num Features'])}")

# General recommendations (static text, not derived from the numbers above).
print("\n" + separator)
print("ÖNERİLER:")
print(separator)
print("1. Feature selection ile model performansı artırılabilir")
print("2. Daha az feature ile daha hızlı eğitim ve daha az overfitting riski")
print("3. Farklı feature selection yöntemleri farklı algoritmalar için optimize edilebilir")
