# 🚀 Sahte Haber Analizi - Gelişmiş Analizler ve Modelleme

Bu notebook'ta gelişmiş makine öğrenmesi modelleri ve deep learning yaklaşımları kullanacağız.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import xgboost as xgb
from lightgbm import LGBMClassifier
import warnings
# Silence every warning for the rest of the session so that library
# chatter does not clutter the cell output.
warnings.simplefilter('ignore')

# Default canvas size for figures that do not request their own figsize.
plt.rc('figure', figsize=(12, 8))
print("🤖 Gelişmiş ML kütüphaneleri hazır!")

In [None]:
# Data preparation: load both corpora and tag each row with its class
# (0 = fake, 1 = real).
fake_news = pd.read_csv('../data/Fake.csv').assign(label=0)
real_news = pd.read_csv('../data/True.csv').assign(label=1)

# Stack the two sources, discard rows that lack a title or a body, then
# shuffle with a fixed seed so the row order is reproducible.
df = (
    pd.concat([fake_news, real_news], ignore_index=True)
    .dropna(subset=['title', 'text'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Text merge: title and body joined by a single space form the model input.
df['combined_text'] = df['title'].add(' ').add(df['text'])

print(f"📊 Gelişmiş analiz için veri: {len(df):,} haber")

In [None]:
# Split the dataset: the merged text is the feature, the 0/1 label the target.
X, y = df['combined_text'], df['label']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"🔄 Eğitim seti: {len(X_train):,}")
print(f"🔄 Test seti: {len(X_test):,}")

# TF-IDF vectorisation: lower-cased uni- to tri-grams without English stop
# words, ignoring terms seen in fewer than 2 documents or in more than 95%
# of them, with the vocabulary capped at 15000 entries.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    max_features=15000,
)

# The vocabulary is learned from the training split only and then reused
# unchanged on the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"✅ TF-IDF: {X_train_tfidf.shape[1]} özellik")

In [None]:
# Advanced models: every candidate classifier is seeded with 42 so repeated
# runs are comparable.
advanced_models = {
    'Logistic Regression': LogisticRegression(
        C=1.0, max_iter=1000, random_state=42
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200, max_depth=20, random_state=42
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=200, max_depth=6, random_state=42
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=200, max_depth=6, random_state=42, verbose=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, random_state=42
    ),
    # probability=True is required so that predict_proba is available below.
    'SVM': SVC(kernel='rbf', C=1.0, probability=True, random_state=42),
}

# Per-model outcome, keyed by the display name used in advanced_models.
results = {}

print("🚀 Gelişmiş modeller eğitiliyor...")

for name, model in advanced_models.items():
    print(f"\n🔄 {name} eğitiliyor...")

    # Training on the full training matrix, immediately followed by the
    # hard predictions for the held-out test matrix.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    # Second predict_proba column, kept for the ROC / precision-recall plots.
    y_pred_proba = model.predict_proba(X_test_tfidf)[:, 1]

    # 5-fold cross-validated accuracy, computed on the training matrix only.
    cv_scores = cross_val_score(
        model, X_train_tfidf, y_train, cv=5, scoring='accuracy'
    )

    # Store everything the later plotting and reporting cells read.
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = dict(
        model=model,
        accuracy=accuracy,
        cv_mean=cv_scores.mean(),
        cv_std=cv_scores.std(),
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

    print(f"✅ {name}:")
    print(f"   Doğruluk: {accuracy:.4f}")
    print(f"   CV Ortalama: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

print("\n🎯 Tüm gelişmiş modeller eğitildi!")

In [None]:
# Model performance visualisations: a single 2x3 grid of panels that the
# plotting code below fills in one by one.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Accuracy comparison: test accuracy beside mean CV accuracy per model.
model_names = list(results)
accuracies = [results[model_name]['accuracy'] for model_name in model_names]
cv_means = [results[model_name]['cv_mean'] for model_name in model_names]

acc_ax = axes[0, 0]
bar_width = 0.4
offset = 0.2  # half a bar width, so each pair straddles its tick
x_pos = np.arange(len(model_names))

bars1 = acc_ax.bar(x_pos - offset, accuracies, bar_width,
                   label='Test Doğruluğu', color='#ff6b6b')
bars2 = acc_ax.bar(x_pos + offset, cv_means, bar_width,
                   label='CV Doğruluğu', color='#4ecdc4')

acc_ax.set_title('🎯 Model Performans Karşılaştırması', fontweight='bold')
acc_ax.set_ylabel('Doğruluk Oranı')
acc_ax.set_xticks(x_pos)
acc_ax.set_xticklabels(model_names, rotation=45, ha='right')
acc_ax.legend()
acc_ax.grid(True, alpha=0.3)

# Write the value just above each test-accuracy bar.
for bar, acc in zip(bars1, accuracies):
    label_x = bar.get_x() + bar.get_width() / 2.
    label_y = bar.get_height() + 0.005
    acc_ax.text(label_x, label_y, f'{acc:.3f}',
                ha='center', va='bottom', fontsize=8)

# 2. Confusion matrix of the best model, i.e. the entry with the highest
# test accuracy (the first such entry wins a tie).
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_predictions = results[best_model_name]['predictions']

cm_ax = axes[0, 1]
class_labels = ['Sahte', 'Gerçek']
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
cm_ax.set_title(f'🔥 {best_model_name} - Confusion Matrix', fontweight='bold')
cm_ax.set_xlabel('Tahmin Edilen')
cm_ax.set_ylabel('Gerçek')

# 3. ROC curves: one line per model, with its AUC shown in the legend.
colors = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4', '#feca57', '#ff9ff3']
roc_ax = axes[0, 2]

for i, name in enumerate(model_names):
    fpr, tpr, _ = roc_curve(y_test, results[name]['probabilities'])
    roc_auc = auc(fpr, tpr)
    # The modulo lets the palette wrap around if there are more models
    # than colours.
    line_color = colors[i % len(colors)]
    roc_ax.plot(fpr, tpr, color=line_color,
                label=f'{name} (AUC = {roc_auc:.3f})')

# Dashed diagonal as the chance-level reference.
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)
roc_ax.set_title('📈 ROC Eğrileri Karşılaştırması', fontweight='bold')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
# Anchor the legend just outside the right edge of the axes.
roc_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
roc_ax.grid(True, alpha=0.3)

# 4. Precision-recall curves, drawn with the same per-model colours as the
# ROC panel.
pr_ax = axes[1, 0]

for i, name in enumerate(model_names):
    precision, recall, _ = precision_recall_curve(
        y_test, results[name]['probabilities']
    )
    pr_ax.plot(recall, precision, label=name, color=colors[i % len(colors)])

pr_ax.set_title('📊 Precision-Recall Eğrileri', fontweight='bold')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.legend()
pr_ax.grid(True, alpha=0.3)

# 5. Feature importance (for the best model).
# Tree ensembles expose `feature_importances_`; linear models expose `coef_`
# instead, and some models expose neither. Previously the panel stayed an
# empty axes unless `feature_importances_` existed; now coefficients are
# used as a fallback and an explicit placeholder is drawn otherwise.
importance_ax = axes[1, 1]
best_estimator = results[best_model_name]['model']

if hasattr(best_estimator, 'feature_importances_'):
    importances = np.asarray(best_estimator.feature_importances_)
    importance_label = 'Önem Skoru'
elif hasattr(best_estimator, 'coef_'):
    coef = best_estimator.coef_
    # Some estimators return a sparse coefficient matrix; densify it first.
    if hasattr(coef, 'toarray'):
        coef = coef.toarray()
    # One row per class at most: average the absolute weights over the rows
    # so that a single score per feature remains.
    importances = np.atleast_2d(np.abs(coef)).mean(axis=0)
    importance_label = 'Önem Skoru (|katsayı|)'
else:
    importances = None
    importance_label = None

importance_ax.set_title(f'🌟 En Önemli Özellikler ({best_model_name})', fontweight='bold')

if importances is not None:
    feature_names = tfidf.get_feature_names_out()

    # argsort is ascending, so the last 20 positions are the 20 largest
    # scores; barh then draws the largest one at the top.
    top_indices = np.argsort(importances)[-20:]
    top_features = [feature_names[i] for i in top_indices]
    top_importances = importances[top_indices]

    importance_ax.barh(range(len(top_features)), top_importances, color='orange')
    importance_ax.set_yticks(range(len(top_features)))
    importance_ax.set_yticklabels(top_features)
    importance_ax.set_xlabel(importance_label)
else:
    # Say so explicitly instead of leaving a blank, unexplained panel.
    importance_ax.text(0.5, 0.5, 'Bu model için özellik önemi mevcut değil',
                       ha='center', va='center',
                       transform=importance_ax.transAxes)
    importance_ax.set_xticks([])
    importance_ax.set_yticks([])

# 6. Model comparison panel. The title calls it a radar chart, but it is
# drawn as a plain line plot: one line per model across the two metrics.
metrics = ['Doğruluk', 'CV Ortalama']
model_metrics = np.array(
    [[results[name]['accuracy'], results[name]['cv_mean']] for name in model_names]
)

# Min-max normalise each metric column to [0, 1] across the models.
# When every model has the same value for a metric the range is zero;
# dividing by it would yield NaN and an empty plot, so such a column is
# divided by 1 instead and ends up as all zeros.
metric_min = model_metrics.min(axis=0)
metric_range = model_metrics.max(axis=0) - metric_min
safe_range = np.where(metric_range > 0, metric_range, 1.0)
model_metrics_norm = (model_metrics - metric_min) / safe_range

for i, name in enumerate(model_names):
    axes[1,2].plot(metrics, model_metrics_norm[i], 'o-', label=name, color=colors[i % len(colors)])

axes[1,2].set_title('📡 Model Performans Radar', fontweight='bold')
axes[1,2].legend()
axes[1,2].grid(True, alpha=0.3)

# Finalise the layout, write the figure to disk, and show it inline.
import os

# savefig does not create missing directories, so make sure the target
# folder exists; exist_ok keeps this a no-op when it is already there.
os.makedirs('../analysis', exist_ok=True)

plt.tight_layout()
plt.savefig('../analysis/gelismis_model_analizi.png', dpi=300, bbox_inches='tight')
plt.show()

# Short summary of the winning model.
print(f"\n🏆 En iyi model: {best_model_name}")
print(f"🎯 Test doğruluğu: {results[best_model_name]['accuracy']:.4f}")
print(f"🔄 CV doğruluğu: {results[best_model_name]['cv_mean']:.4f} (±{results[best_model_name]['cv_std']:.4f})")

In [None]:
# Detailed model report: accuracy figures plus the per-class
# precision / recall / F1 table for every trained model.
print("📋 DETAYLI MODEL PERFORMANS RAPORU")
print("=" * 60)

# Display names for labels 0 and 1, in that order.
class_labels = ['Sahte', 'Gerçek']

for name in model_names:
    print(f"\n🤖 {name}:")
    print("-" * 40)
    print(f"Test Doğruluğu: {results[name]['accuracy']:.4f}")
    print(f"CV Doğruluğu: {results[name]['cv_mean']:.4f} (±{results[name]['cv_std']:.4f})")
    print("\nClassification Report:")
    report_text = classification_report(
        y_test, results[name]['predictions'], target_names=class_labels
    )
    print(report_text)

# Save the best model together with the fitted TF-IDF vectorizer; both are
# needed to score new text later.
import os
import joblib

# joblib.dump does not create missing directories. Create the target folder
# first so a long training run is not lost to a FileNotFoundError.
os.makedirs('../models', exist_ok=True)

best_model = results[best_model_name]['model']
# The file name is derived from the display name: lower-cased, with spaces
# replaced by underscores.
joblib.dump(best_model, f'../models/en_iyi_model_{best_model_name.lower().replace(" ", "_")}.pkl')
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')

print(f"\n💾 En iyi model kaydedildi: {best_model_name}")
print("💾 TF-IDF vektörizer kaydedildi")