# Analysis Notebook

This notebook is generated from the provided Python script.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from hyperopt.pyll.base import scope
from sklearn.feature_selection import RFE
import seaborn as sns

## Background

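We are analyzing customer happiness based on survey data. The notebook loads `ACME-HappinessSurvey2020.csv`, uses the `Y` column as the target, trains a BernoulliNB and an ExtraTreesClassifier model, and compares them by recall on class 0 and by their confusion matrices on a held-out test split.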

In [None]:
# Set Dir
# NOTE(review): this is an absolute, machine-specific path, and os.chdir changes
# the working directory for every later cell. It is kept unchanged because other
# cells may rely on it; consider a configurable data directory instead.
path_dir = r'E:\My Stuff\Projects\Apziva\pG736HzU7DLB8fEa'
os.chdir(path_dir)

# Load Data
df = pd.read_csv("ACME-HappinessSurvey2020.csv")
# Registry of trained estimators keyed by model name; the training cells fill it.
fitted_models = {}

# Prepare Data
# Select the target by name and use every other column as a feature, so the
# target can never leak into X if 'Y' is not the first column of the CSV.
X, y = shuffle(df.drop(columns='Y'), df['Y'], random_state=3921)
# test_size=0.55 holds out 55% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.55, random_state=3921)
# Binarizer maps values greater than the threshold to 1 and all others to 0.
binarizer = Binarizer(threshold=3)
X_train_binary = binarizer.fit_transform(X_train)
# The held-out split is only transformed: preprocessing is fitted on training data alone.
X_test_binary = binarizer.transform(X_test)

In [None]:
# Compute Class Weights
# 'balanced' weights are inversely proportional to class frequency in y_train.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Key the weights by the actual class label, not by position: enumerate() is only
# correct when the labels happen to be exactly 0..n-1. tolist() yields plain
# Python keys, so for 0/1 labels the mapping is the same {0: w0, 1: w1} as before.
class_weight_dict = dict(zip(classes.tolist(), class_weights))
# One weight per training row, aligned with the order of y_train.
sample_weights = np.array([class_weight_dict[label] for label in y_train])

In [None]:
# Train BernoulliNB Model
# binarize=3 makes the estimator threshold the raw features itself, which is why
# the unbinarized X_train is passed to fit().
# NOTE(review): per the scikit-learn docs, an explicit class_prior means the
# priors are not adjusted to the data, so fit_prior=True has no further effect.
bernoulli_params = {
    'alpha': 0.5,
    'binarize': 3,
    'fit_prior': True,
    'class_prior': [0.55, 0.45],
}
model = BernoulliNB(**bernoulli_params)
# fit() returns the estimator itself, so the registry stores the same object as `model`.
fitted_models['BernoulliNB'] = model.fit(X_train, y_train, sample_weight=sample_weights)

In [None]:
# Train ExtraTreesClassifier Model
# NOTE(review): per the scikit-learn docs, class_weight is multiplied with the
# sample_weight passed to fit(), so both weightings apply together here.
extra_trees_params = dict(
    n_estimators=100,
    max_depth=2,
    min_samples_split=3,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=3921,
    class_weight={0: 1.025, 1: 0.975},
)
model = ExtraTreesClassifier(**extra_trees_params)
# fit() returns the estimator itself, so the registry stores the same object as `model`.
fitted_models['ExtraTreesClassifier'] = model.fit(X_train, y_train, sample_weight=sample_weights)

In [None]:
# Evaluate Performance
def get_recall_score(model_name):
    """Score one fitted model on the held-out split.

    Looks up ``model_name`` in the module-level ``fitted_models`` registry,
    predicts on the module-level ``X_test`` and computes recall against
    ``y_test`` with class 0 treated as the positive label.

    Parameters
    ----------
    model_name : hashable
        Key of the estimator in ``fitted_models``.

    Returns
    -------
    tuple
        ``(model_name, recall)``, a pair that can be fed straight into ``dict()``.

    Raises
    ------
    KeyError
        If ``model_name`` is not present in ``fitted_models``.
    """
    predictions = fitted_models[model_name].predict(X_test)
    class0_recall = recall_score(y_test, predictions, pos_label=0)
    return model_name, class0_recall

# Build {model name: class-0 recall} for every fitted model, in registry order.
recall_results = dict(get_recall_score(name) for name in fitted_models)
print(recall_results)

In [None]:
# Plot Confusion Matrices
# One heatmap panel per fitted model, side by side.
fig_cm, axes = plt.subplots(1, len(fitted_models), figsize=(12, 5))
# plt.subplots returns a bare Axes (not an array) when there is only one panel;
# normalise to a 1-D array so the zip below works for any number of models.
axes = np.atleast_1d(axes)
for ax, (name, model) in zip(axes, fitted_models.items()):
    y_pred = model.predict(X_test)
    # Rows are the true classes, columns are the predicted classes.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(name)
    # Label the axes so each panel can be read on its own.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
plt.tight_layout()
plt.show()