In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Data preprocess


def preprocess_dataset(df):
    # Handle missing values
    for column in df.columns:
        if df[column].dtype == 'object':  # Categorical data
            df[column].fillna(df[column].mode()[0], inplace=True)
        else:
            df[column].fillna(df[column].median(), inplace=True)
    
    # One-hot encode categorical columns
    df = pd.get_dummies(df)
    
    return df


# Feature Selection
def feature_selector(X, y, num_feats):
    selector = SelectKBest(f_classif, k=num_feats)
    X_new = selector.fit_transform(X, y)
    return X_new

user_data = pd.read_csv('https://raw.githubusercontent.com/subashgandyer/datasets/main/great_customers.csv')

traindf = preprocess_dataset(user_data)

X = traindf.drop(columns=["great_customer_class"])
y = traindf["great_customer_class"]

X_new = feature_selector(X, y, 10)
scaler = StandardScaler()
X_new = scaler.fit_transform(X_new)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Initialize models
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(random_state=42, probability=True)
logistic_model = LogisticRegression(random_state=42)
nb_model = GaussianNB()
knn_model = KNeighborsClassifier()

# Train and evaluate individual models
models = [rf_model, svm_model, logistic_model, nb_model, knn_model]
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    
    print(f"{model.__class__.__name__} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"ROC-AUC: {roc_auc:.2f}")
    print(f"Confusion Matrix:\n{confusion}")

# Ensemble Learning (Voting Classifier)
ensemble_model = VotingClassifier(estimators=[("rf", rf_model), ("svm", svm_model), ("lr", logistic_model), ("nb", nb_model), ("knn", knn_model)], voting="soft")
ensemble_model.fit(X_train, y_train)
y_ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_ensemble_pred)
ensemble_precision = precision_score(y_test, y_ensemble_pred)
ensemble_recall = recall_score(y_test, y_ensemble_pred)
ensemble_f1 = f1_score(y_test, y_ensemble_pred)
ensemble_roc_auc = roc_auc_score(y_test, y_ensemble_pred)
ensemble_confusion = confusion_matrix(y_test, y_ensemble_pred)

print("Ensemble Model Metrics:")
print(f"Accuracy: {ensemble_accuracy:.2f}")
print(f"Precision: {ensemble_precision:.2f}")
print(f"Recall: {ensemble_recall:.2f}")
print(f"F1-Score: {ensemble_f1:.2f}")
print(f"ROC-AUC: {ensemble_roc_auc:.2f}")
print(f"Confusion Matrix:\n{ensemble_confusion}")


RandomForestClassifier Metrics:
Accuracy: 0.91
Precision: 0.51
Recall: 0.40
F1-Score: 0.45
ROC-AUC: 0.68
Confusion Matrix:
[[2380   96]
 [ 146   98]]
SVC Metrics:
Accuracy: 0.93
Precision: 0.81
Recall: 0.24
F1-Score: 0.37
ROC-AUC: 0.62
Confusion Matrix:
[[2462   14]
 [ 186   58]]
LogisticRegression Metrics:
Accuracy: 0.93
Precision: 0.71
Recall: 0.28
F1-Score: 0.40
ROC-AUC: 0.63
Confusion Matrix:
[[2448   28]
 [ 176   68]]
GaussianNB Metrics:
Accuracy: 0.89
Precision: 0.41
Recall: 0.55
F1-Score: 0.47
ROC-AUC: 0.74
Confusion Matrix:
[[2286  190]
 [ 110  134]]
KNeighborsClassifier Metrics:
Accuracy: 0.92
Precision: 0.64
Recall: 0.35
F1-Score: 0.45
ROC-AUC: 0.66
Confusion Matrix:
[[2428   48]
 [ 159   85]]
Ensemble Model Metrics:
Accuracy: 0.93
Precision: 0.69
Recall: 0.39
F1-Score: 0.50
ROC-AUC: 0.69
Confusion Matrix:
[[2434   42]
 [ 149   95]]
