# 🤖 Sahte Haber Tespiti - Makine Öğrenmesi Modeli

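Bu notebook'ta, TF-IDF özellikleri üzerinde dört sınıflandırıcıyı (Logistic Regression, Random Forest, Naive Bayes ve SVM) eğitip karşılaştırarak sahte haber tespiti için bir makine öğrenmesi modeli geliştireceğiz.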

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# Notebook-wide configuration: warning filter and default figure size.
# NOTE(review): this silences every warning category for the whole session,
# including ones raised by pandas / scikit-learn in later cells — confirm that
# is intended before relying on a "clean" run.
warnings.filterwarnings('ignore')

# Default size (in inches) for figures that do not pass their own figsize.
plt.rcParams.update({'figure.figsize': (12, 8)})

print("✅ ML kütüphaneleri hazır!")

In [None]:
# Load the two source CSVs and tag each row with its class:
# label 0 = fake news, label 1 = real news.
fake_news = pd.read_csv('../data/Fake.csv').assign(label=0)
real_news = pd.read_csv('../data/True.csv').assign(label=1)

# Stack both sources, shuffle reproducibly (fixed seed), renumber the rows,
# then drop rows whose title or text is missing.
df = (
    pd.concat([fake_news, real_news], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .dropna(subset=['title', 'text'])
)

# Also discard rows whose title or text is only whitespace.
title_not_blank = df['title'].str.strip().ne('')
text_not_blank = df['text'].str.strip().ne('')
df = df[title_not_blank & text_not_blank]

fake_count = int((df['label'] == 0).sum())
real_count = int((df['label'] == 1).sum())

print(f"📊 Model için hazır veri: {len(df):,} haber")
print(f"📈 Sahte: {fake_count:,}")
print(f"📈 Gerçek: {real_count:,}")

In [None]:
# Build the model input by joining each article's title and body text.
# `assign` returns a new frame instead of writing a column into `df` in place:
# `df` was produced by boolean filtering, and item assignment on such a frame is
# the chained-assignment pattern pandas flags with SettingWithCopyWarning (which
# the notebook-wide warning filter would hide). It also makes this cell safe to
# re-run.
df = df.assign(combined_text=df['title'] + ' ' + df['text'])

# Features (raw text) and target (0 = fake, 1 = real).
X = df['combined_text']
y = df['label']

# Hold out 20% for testing; stratify so both splits keep the class ratio, and
# fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"🔄 Eğitim seti: {len(X_train):,} örnek")
print(f"🔄 Test seti: {len(X_test):,} örnek")

In [None]:
# TF-IDF vectorisation of the raw text.
print("🔤 TF-IDF vektörizasyon yapılıyor...")

# Vectoriser settings: cap the vocabulary at 10,000 terms, drop English stop
# words, lowercase the input, and use unigrams plus bigrams.
vectorizer_options = {
    'max_features': 10000,
    'stop_words': 'english',
    'lowercase': True,
    'ngram_range': (1, 2),
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Learn the vocabulary/IDF weights on the training split only, then apply the
# already-fitted vectoriser to the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_features = X_train_tfidf.shape[1]
print(f"✅ TF-IDF tamamlandı: {n_features} özellik")

In [None]:
# Train every candidate classifier on the TF-IDF features and record its
# test-set predictions, class-1 probabilities and accuracy.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Naive Bayes', MultinomialNB()),
    # probability=True is required so that SVC exposes predict_proba below.
    ('SVM', SVC(probability=True, random_state=42)),
])

# Maps model name -> dict with the fitted model and its test-set outputs.
results = {}

print("🤖 Modeller eğitiliyor...")

for name, model in models.items():
    print(f"\n🔄 {name} eğitiliyor...")

    # Fit on the training split.
    model.fit(X_train_tfidf, y_train)

    # Hard class predictions and the probability assigned to class 1
    # (second column of predict_proba) on the test split.
    y_pred = model.predict(X_test_tfidf)
    class_probabilities = model.predict_proba(X_test_tfidf)
    y_pred_proba = class_probabilities[:, 1]

    accuracy = accuracy_score(y_test, y_pred)

    # Keep everything later cells need for plots and reports.
    results[name] = dict(
        model=model,
        accuracy=accuracy,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

    print(f"✅ {name} Doğruluk: {accuracy:.4f}")

print("\n🎯 Tüm modeller eğitildi!")

In [None]:
# Model performance dashboard (2x2): accuracy bars, confusion matrix of the
# most accurate model, ROC curves for all models, and the top Random Forest
# feature importances. The figure is also written to ../analysis/.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# --- Accuracy comparison ---
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]

bars = axes[0,0].bar(model_names, accuracies, color=['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4'])
axes[0,0].set_title('🎯 Model Doğruluk Karşılaştırması', fontweight='bold')
axes[0,0].set_ylabel('Doğruluk Oranı')
# Keep the zoomed-in 0.8+ view when every model is in that band, but widen the
# lower bound if any model scores below it so its bar is not cut off. The upper
# bound leaves headroom for the value labels drawn just above each bar.
acc_floor = min(0.8, max(0.0, min(accuracies, default=0.8) - 0.05))
axes[0,0].set_ylim(acc_floor, 1.03)
axes[0,0].tick_params(axis='x', rotation=45)

# Print each accuracy value just above its bar.
for bar, acc in zip(bars, accuracies):
    axes[0,0].text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.005,
                  f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

# --- Confusion matrix of the most accurate model ---
best_model_name = max(results.keys(), key=lambda k: results[k]['accuracy'])
best_predictions = results[best_model_name]['predictions']

# Label encoding used when the data was loaded: 0 = fake, 1 = real.
class_names = ['Sahte', 'Gerçek']
cm = confusion_matrix(y_test, best_predictions, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0,1],
            xticklabels=class_names, yticklabels=class_names)
axes[0,1].set_title(f'🔥 {best_model_name} - Confusion Matrix', fontweight='bold')
axes[0,1].set_xlabel('Tahmin')
axes[0,1].set_ylabel('Gerçek')

# --- ROC curves, one per model, from the stored class-1 probabilities ---
for name in model_names:
    fpr, tpr, _ = roc_curve(y_test, results[name]['probabilities'])
    roc_auc = auc(fpr, tpr)
    axes[1,0].plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line (TPR == FPR).
axes[1,0].plot([0, 1], [0, 1], 'k--')
axes[1,0].set_title('📈 ROC Eğrileri', fontweight='bold')
axes[1,0].set_xlabel('False Positive Rate')
axes[1,0].set_ylabel('True Positive Rate')
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# --- Feature importance (Random Forest only) ---
if 'Random Forest' in results:
    rf_model = results['Random Forest']['model']
    feature_names = tfidf.get_feature_names_out()
    importances = rf_model.feature_importances_

    # Top 15 features; argsort is ascending, so the most important one ends up
    # last and is drawn at the top of the horizontal bar chart.
    top_indices = np.argsort(importances)[-15:]
    top_features = [feature_names[i] for i in top_indices]
    top_importances = importances[top_indices]

    axes[1,1].barh(range(len(top_features)), top_importances, color='orange')
    axes[1,1].set_yticks(range(len(top_features)))
    axes[1,1].set_yticklabels(top_features)
    axes[1,1].set_xlabel('Önem Skoru')
    axes[1,1].set_title('🌟 En Önemli Özellikler (Random Forest)', fontweight='bold')
else:
    # No Random Forest result: hide the unused panel instead of showing an
    # empty frame.
    axes[1,1].set_axis_off()

plt.tight_layout()

# Make sure the output directory exists; savefig does not create it and would
# otherwise fail with FileNotFoundError on a fresh checkout.
output_path = Path('../analysis/model_performansi.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"\n🏆 En iyi model: {best_model_name} (Doğruluk: {results[best_model_name]['accuracy']:.4f})")

In [None]:
# Detailed per-model report: accuracy plus the full classification report.
print("📋 DETAYLI MODEL RAPORU")
print("=" * 50)

# Iterate over `results` directly so this cell only depends on the training
# cell, not on helper variables created while plotting.
for name, result in results.items():
    print(f"\n🤖 {name}:")
    print("-" * 30)
    print(f"Doğruluk: {result['accuracy']:.4f}")
    print("\nClassification Report:")
    # Name the classes (0 = fake, 1 = real) instead of showing bare 0/1 rows,
    # and use four decimals to match the accuracy line above.
    print(classification_report(
        y_test,
        result['predictions'],
        labels=[0, 1],
        target_names=['Sahte', 'Gerçek'],
        digits=4,
    ))