In [None]:
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset used for the customer-return classification experiment.
customer_return_df = pd.read_csv('hyphomz_indian_cleaning_data.csv')

# Fail fast with an actionable message when the CSV does not have the columns
# this script reads. Without this check the first missing column surfaces as a
# bare `KeyError: '<name>'` with no hint about what the file actually contains.
required_columns = [
    'service_type', 'city_tier', 'bhk', 'area_sqft', 'has_heavy_furniture',
    'customer_rating', 'last_cleaning_days_ago', 'customer_returned',
]
missing_columns = [
    column for column in required_columns
    if column not in customer_return_df.columns
]
if missing_columns:
    raise KeyError(
        f"Missing expected column(s) {missing_columns} in "
        f"'hyphomz_indian_cleaning_data.csv'; "
        f"columns found: {list(customer_return_df.columns)}"
    )

# Encode the two columns that are label-encoded into integer codes. A separate
# encoder is kept per column so each mapping can be inverted later.
le_service = LabelEncoder()
le_city = LabelEncoder()
customer_return_df['service_type_enc'] = le_service.fit_transform(customer_return_df['service_type'])
customer_return_df['city_tier_enc'] = le_city.fit_transform(customer_return_df['city_tier'])

# Model inputs (encoded columns replace their raw counterparts) and the target.
features = [
    'service_type_enc',
    'bhk',
    'area_sqft',
    'has_heavy_furniture',
    'customer_rating',
    'city_tier_enc',
    'last_cleaning_days_ago',
]
X = customer_return_df[features]
y = customer_return_df['customer_returned']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500),
    # probability=True makes SVC expose predict_proba, which the ROC AUC
    # computation further down relies on.
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
}

def print_classification_metrics(y_true, y_pred, y_proba, task_name):
    """Print a classification report for one set of predictions.

    Prints accuracy, macro- and weighted-averaged precision/recall/F1, the
    confusion matrix and, when it can be computed, the ROC AUC score.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_proba: Per-class scores indexed as ``y_proba[:, 1]`` for the ROC AUC
            computation (so a 2-D array whose column 1 is the score used), or
            ``None`` when no scores are available.
        task_name: Heading printed above the metrics.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n=== {task_name} ===")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # Each of these metrics is reported under both averaging schemes, in the
    # order: metric (macro), metric (weighted).
    averaged_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, metric in averaged_metrics:
        for average in ("macro", "weighted"):
            score = metric(y_true, y_pred, average=average)
            print(f"{label} ({average}): {score:.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    # No scores supplied: nothing to compute, and no error to report.
    if y_proba is None:
        print("ROC AUC Score: Not available")
        return

    try:
        roc_auc = roc_auc_score(y_true, y_proba[:, 1])
    except (ValueError, IndexError, TypeError) as exc:
        # ValueError: roc_auc_score rejected the inputs; IndexError/TypeError:
        # y_proba cannot be indexed as [:, 1]. The report stays best-effort,
        # but the reason is shown instead of being discarded.
        print(f"ROC AUC Score: Not available ({exc})")
    else:
        print(f"ROC AUC Score: {roc_auc:.4f}")

# Train every candidate model on the training split and print its metrics on
# the held-out test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
    elif hasattr(model, "decision_function"):
        decision_scores = model.decision_function(X_test)
        # Squash the decision scores through a sigmoid as an approximate
        # probability, then lay them out as two columns [1 - p, p] so that
        # column 1 is what print_classification_metrics reads.
        # NOTE(review): this layout assumes 1-D (binary) decision scores.
        y_proba = expit(decision_scores)
        y_proba = np.vstack([1 - y_proba, y_proba]).T
    else:
        # The model exposes no scores at all; ROC AUC will be reported as
        # not available.
        y_proba = None

    print_classification_metrics(y_test, y_pred, y_proba, f"Customer Return Prediction - {name}")


KeyError: 'service_type'