In [None]:
#Imports
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split,RepeatedKFold,GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [None]:
# Read the insurance CSV from the working directory into `data`
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="insurance[1].csv")
data.head(n=5)

In [None]:
# DataFrame.info() writes its summary (columns, non-null counts, dtypes) to
# stdout itself and returns None, so it must not be wrapped in print() --
# doing so appends a stray "None" line to the output.
data.info()

In [None]:
# Count the missing values in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Histogram of the `charges` column using 15 bins.
fig, ax = plt.subplots()
ax.hist(data['charges'], bins=15, color='skyblue')
ax.set_xlabel('Charges')
ax.set_title('Distribution of Insurance Charges($)')
plt.show()

In [None]:
# Box plot of `charges`; the single box needs no x tick, so ticks are removed.
fig, ax = plt.subplots()
ax.boxplot(data['charges'])
ax.set_xticks([])
ax.set_ylabel('Charges')
ax.set_title('Boxplot of Charges($)')
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for `charges`.
data.loc[:, 'charges'].describe()

In [None]:
# Encode the categorical feature columns (smoker, sex, region) as integer codes.
# OrdinalEncoder is scikit-learn's transformer for feature columns; LabelEncoder
# is documented as being for the target `y` only. Both assign codes in sorted
# category order, so the resulting codes are the same as before.
categorical_cols = ["smoker", "sex", "region"]
encoder = OrdinalEncoder(dtype=np.int64)
data[categorical_cols] = encoder.fit_transform(data[categorical_cols])

# For each column, show which original category every code stands for
# (this mapping is otherwise lost once the column is overwritten),
# followed by how many rows carry each code.
for col, categories in zip(categorical_cols, encoder.categories_):
    print(f"{col} codes: {dict(enumerate(categories))}")
    print(data[col].value_counts())

In [None]:
# 20th, 40th, 60th and 80th percentiles of `charges`.
quintile_ranks = [20, 40, 60, 80]
percentiles = np.percentile(data['charges'], quintile_ranks)
print(percentiles)

In [None]:
# Bin `charges` into five ordinal classes: 0 (lowest) through 4 (highest).
# The cut points are the 20th/40th/60th/80th percentiles, computed from the
# data here instead of being pasted in as rounded literals, so they remain
# exact and stay correct if the CSV changes.
charge_cutoffs = np.percentile(data['charges'], [20, 40, 60, 80])

# np.digitize with its default right=False assigns class i when
# cutoffs[i-1] <= x < cutoffs[i] -- the same half-open intervals as a chain
# of `x < cutoff` tests (a value equal to a cut point goes to the upper class).
data['charge_class'] = np.digitize(data['charges'], charge_cutoffs)

In [None]:
# Train/tune/test split: 80% of the rows go to training; the remaining 20%
# holdout is then halved into a tuning set and a final test set.
X = data.drop(columns=['charges', 'charge_class'])
y = data['charge_class']

# The intermediate 20% gets its own name so that X_test / y_test only ever
# refer to the final test split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.8, random_state=13
)
X_tune, X_test, y_tune, y_test = train_test_split(
    X_holdout, y_holdout, train_size=0.50, random_state=72
)

# 10-fold cross-validation repeated 5 times, seeded for reproducibility.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=10)

# Both metrics are recorded for every candidate; refit='f1_macro' below makes
# macro-averaged F1 the metric that selects (and refits) the best estimator.
scoring = ['f1_macro', 'balanced_accuracy']
param = {"max_depth": [1, 2, 3, 4, 5]}
cl = DecisionTreeClassifier(random_state=1000)

# Search over tree depth on the training split only; this may take a few seconds.
search = GridSearchCV(cl, param, scoring=scoring, n_jobs=-1, cv=kf, refit='f1_macro')
model = search.fit(X_train, y_train)
best = model.best_estimator_
print(best)

In [None]:
# Show the selected tree together with the hyper-parameters that won the
# search and their mean cross-validated score on the refit metric (f1_macro).
print(best)
print("Best params:", model.best_params_)
print("Mean CV f1_macro:", round(model.best_score_, 4))

In [None]:
# Render the selected decision tree as a Graphviz diagram.
dot_data = export_graphviz(
    best,
    out_file=None,                   # return the DOT source as a string
    feature_names=list(X.columns),   # plain list: some scikit-learn releases reject a pandas Index here
    class_names=['Lowest', 'Low', 'Medium', 'High', 'Highest'],  # names for classes 0..4, in ascending order
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Rank the features by the fitted tree's `feature_importances_`, largest first.
importance_table = pd.DataFrame(
    {'importance': best.feature_importances_},
    index=X.columns,
)
varimp = importance_table.sort_values(by='importance', ascending=False)
print(varimp)

In [None]:
# Confusion matrix of the selected tree on the tuning split.
# from_estimator() draws the figure itself; wrapping it in print() only added
# the display object's repr to the output, so the plot is shown directly.
# (ConfusionMatrixDisplay is imported in the notebook's import cell.)
ConfusionMatrixDisplay.from_estimator(
    best,
    X_tune,
    y_tune,
    display_labels=['0', '1', '2', '3', '4'],
    colorbar=False,
)
plt.show()

In [None]:
# Predictions from the currently selected tree on both held-out splits.
# (classification_report is imported in the notebook's import cell.)
y_pred_tune = best.predict(X_tune)
y_pred_test = best.predict(X_test)

# Report the tuning split alongside the test split so the tune-set
# predictions are actually inspected rather than computed and discarded.
print("Classification Report (Tune Set):")
print(classification_report(y_tune, y_pred_tune, digits=4))
print("Classification Report (Test Set):")
print(classification_report(y_test, y_pred_test, digits=4))


In [None]:
# Second search: a wider hyper-parameter grid evaluated on the SAME data split.
# X_train / y_train, the `kf` cross-validation object and the `scoring` list
# are reused from the first search cell. Re-creating them here from
# copy-pasted seeds gave identical objects but risked the two searches
# silently drifting onto different splits if only one copy were edited.
param_grid = {
    'max_depth': range(3, 8),
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 5, 7],
    'criterion': ['gini', 'entropy']
}
cl = DecisionTreeClassifier(random_state=1000)

# Every candidate is scored on both metrics; refit='f1_macro' makes
# macro-averaged F1 the metric that selects (and refits) the best estimator.
# Fitted on the training split only; this may take a few seconds.
search = GridSearchCV(cl, param_grid, scoring=scoring, n_jobs=-1, cv=kf, refit='f1_macro')
model = search.fit(X_train, y_train)
best = model.best_estimator_
print(best)

In [None]:
# Confusion matrix of the re-tuned tree on the tuning split.
# from_estimator() draws the figure itself; wrapping it in print() only added
# the display object's repr to the output, so the plot is shown directly.
ConfusionMatrixDisplay.from_estimator(
    best,
    X_tune,
    y_tune,
    display_labels=['0', '1', '2', '3', '4'],
    colorbar=False,
)
plt.show()