In [1]:
# importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import numpy as np
import scipy.special

# reading data
data = pd.read_csv("hyphomz_indian_cleaning_data.csv")

# encoding text to numbers
le1 = LabelEncoder()
le2 = LabelEncoder()
data["service_type_enc"] = le1.fit_transform(data["service_type"])
data["city_tier_enc"] = le2.fit_transform(data["city_tier"])

# selecting input and output
X = data[["service_type_enc", "bhk", "area_sqft", "has_heavy_furniture", "customer_rating", "city_tier_enc", "last_cleaning_days_ago"]]
y = data["customer_returned"]

# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# trying different models
rf = RandomForestClassifier()
lr = LogisticRegression(max_iter=500)
svm = SVC(probability=True)
knn = KNeighborsClassifier()

models = [("Random Forest", rf), ("Logistic Regression", lr), ("SVM", svm), ("KNN", knn)]

# checking results
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)
    elif hasattr(model, "decision_function"):
        score = model.decision_function(X_test)
        prob = scipy.special.expit(score)
        y_prob = np.vstack([1 - prob, prob]).T
    else:
        y_prob = None
    
    print("========", name, "========")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision (macro):", precision_score(y_test, y_pred, average="macro"))
    print("Precision (weighted):", precision_score(y_test, y_pred, average="weighted"))
    print("Recall (macro):", recall_score(y_test, y_pred, average="macro"))
    print("Recall (weighted):", recall_score(y_test, y_pred, average="weighted"))
    print("F1 (macro):", f1_score(y_test, y_pred, average="macro"))
    print("F1 (weighted):", f1_score(y_test, y_pred, average="weighted"))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    if y_prob is not None:
        try:
            auc = roc_auc_score(y_test, y_prob[:, 1])
            print("ROC AUC:", auc)
        except:
            print("ROC AUC: error or not available")
    else:
        print("No probabilities available")


KeyError: 'service_type'