In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_recall_curve
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack

# Load the spreadsheet: 600 text columns (a1..a600) plus the target variable.
data = pd.read_excel(r"file_path.xlsx")

# Force every column label to a string so the name-based lookups below work
# regardless of the label types the spreadsheet produced.
data.columns = data.columns.astype(str)

# Names of the text columns: a1, a2, ..., a600.
text_columns = ["a" + str(i) for i in range(1, 601)]

# Everything except the label and the text columns is kept as a plain feature.
non_feature_columns = ["target", *text_columns]
X = data.drop(columns=non_feature_columns)
y = data["target"]

# TF-IDF over the text columns.
# Each row's text cells are joined into one space-separated document. Cells
# that were missing are blanked out: casting NaN straight to str would put the
# literal word "nan" into every affected document and into the vocabulary.
text_frame = data[text_columns]
combined_text = (
    text_frame.astype(str)
    .mask(text_frame.isna(), "")
    .agg(" ".join, axis=1)
)

# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split further down, so test documents influence the vocabulary
# and the IDF weights. Consider fitting on the training rows only.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(combined_text)

# One-hot encode the features that are not in text format.
# The "combined_text" exclusion is kept from the original; it only has an
# effect if the spreadsheet itself provides a column with that name.
categorical_columns = [col for col in X.columns if col not in ["combined_text"]]

# Cast every value to str and label the originally missing cells "missing".
# Filling NaN with a string inside a numeric column would leave that column
# with mixed str/number values, which OneHotEncoder rejects with a TypeError.
categorical_frame = X[categorical_columns]
categorical_values = categorical_frame.astype(str).mask(
    categorical_frame.isna(), "missing"
)

# NOTE(review): the encoder is fitted on every row, before the train/test
# split; handle_unknown="ignore" would allow fitting on training rows only.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_categorical = encoder.fit_transform(categorical_values)

# Merge: place the TF-IDF block and the one-hot block side by side.
feature_blocks = [tfidf_features, encoded_categorical]
X_processed = hstack(feature_blocks)

# Original note: "3 positives..." -- presumably the positive class is very
# small (TODO confirm), hence the stratified split.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Balancing: oversample the minority class of the training split with SMOTE.
# SMOTE interpolates between a minority sample and its nearest minority
# neighbours, so k_neighbors must be smaller than the minority class size.
# The library default (5) raises a ValueError when the training split holds
# only a handful of minority samples, so the value is capped here.
minority_count = int(pd.Series(y_train).value_counts().min())
if minority_count >= 2:
    smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count - 1))
    X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)
else:
    # A single minority sample has no neighbour to interpolate with; train on
    # the original split instead of failing inside SMOTE.
    smote = None
    print("SMOTE skipped: the minority class has fewer than 2 training samples.")
    X_train_sm, y_train_sm = X_train, y_train

# Weight adjustment (automatic): let the classifier derive the class weights.
class_weights = "balanced"

# RBF-kernel SVM; probability=True is required for predict_proba below.
model = SVC(
    kernel="rbf",
    class_weight=class_weights,
    probability=True,
    random_state=42,
)
model.fit(X_train_sm, y_train_sm)

# Keep the second probability column, i.e. the score of model.classes_[1].
test_probabilities = model.predict_proba(X_test)
y_probs = test_probabilities[:, 1]

# Threshold selection (automatic): pick the cut-off that maximises F1.
# NOTE(review): the threshold is tuned on the same test split that is scored
# below, so the reported metrics are optimistic. Consider tuning it on a
# separate validation split.
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# F1 at every curve point. Where precision + recall is 0 the score is defined
# as 0 instead of 0/0 = NaN; np.argmax would otherwise return the NaN position.
pr_sum = precision + recall
fscore = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall carry one more entry than thresholds (the final point has
# no threshold), so that entry is excluded to keep ix a valid threshold index.
ix = np.argmax(fscore[:-1])
optimal_threshold = thresholds[ix]

# Samples whose score reaches the threshold are labelled 1, all others 0.
y_pred = (y_probs >= optimal_threshold).astype(int)

# Prediction and scoring on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

# ROC-AUC is computed from the raw scores, not the thresholded labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs)
print("ROC-AUC Score:", roc_auc)

# zero_division=1 is the value reported if the F1 score is undefined.
f1 = f1_score(y_true=y_test, y_pred=y_pred, zero_division=1)
print("F1 Score:", f1)