# This notebook is based on the following excellent notebook by Landfall:
https://www.kaggle.com/landfallmotto/fetal-health-data-profile-boruta-model-stacking

## This Notebook is primarily designed to compare BorutaSHAP's results against Auto_ViML's results

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Fetal Health Dataset Notebook**


I tried to demonstrate multiple methods in a single notebook:

- XGBoost multiclass mode
- XGBoost built-in feature importance
- XGBoost feature importance with Shap
- Feature Selection with BorutaShap
- Multiple model running and selection
- Model Stacking with sklearn
- Data profiling and Visualization
- Scaling data
- Finding best parameter with GridSearchCV
   
    
    

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter
from yellowbrick.classifier import ROCAUC
from yellowbrick.features import Rank1D, Rank2D
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, learning_curve, cross_validate, train_test_split, KFold
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Use the explicit %pip magic so the install targets the running kernel's environment
# and does not depend on IPython automagic being enabled.
%pip install git+https://github.com/AutoViML/Auto_ViML.git

In [None]:
# Read the fetal-health CSV, print its (rows, columns) shape, then preview the first rows.
csv_path = '/kaggle/input/fetal-health-classification/fetal_health.csv'
data = pd.read_csv(csv_path)
print(data.shape)
data.head()

# **Target Variable**

The target variable distribution looks imbalanced.
Some options are over-/under-sampling the data or weighting the classes.

In [None]:
# Bar chart of how many rows fall in each fetal_health class.
# The variable is passed by keyword: seaborn deprecated positional vectors in 0.11,
# and from 0.12 the first positional parameter is `data`, not `x`.
sns.countplot(x='fetal_health', data=data)

# **Features in Dataset**

In [None]:
# Keep a handle on the column index and print it for reference.
cols = data.columns
print(cols)

# **Missing Values**

Looks like we don't have any missing values.

In [None]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

In [None]:
# Names of the predictor columns used to build the feature matrix.
features = [
    'baseline value',
    'accelerations',
    'fetal_movement',
    'uterine_contractions',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
    'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability',
    'histogram_width',
    'histogram_min',
    'histogram_max',
    'histogram_number_of_peaks',
    'histogram_number_of_zeroes',
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    'histogram_variance',
    'histogram_tendency',
]

# Feature matrix: the predictor columns only.
X = pd.DataFrame(data=data, columns=features)

# Target as a one-column frame of integers, shifted down by one so the labels
# start at 0 (the original author needed 0-based labels to run XGBoost).
y = pd.DataFrame(data=data, columns=['fetal_health']).astype(int) - 1

X.head()


In [None]:
# Predictor columns plus the (unshifted) target column, kept together in one frame.
all_features = [*features, 'fetal_health']
new_data = data[all_features]
new_data.shape

# Feature Selection: Comparing BorutaSHAP with Auto_ViML

# **Train Test Split: First let's fix the train and test sets**

In [None]:
# Hold out 30% of the rows for testing and keep 70% for training, as requested by the
# dataset author in https://www.kaggle.com/andrewmvd/fetal-health-classification/tasks?taskId=2410
# The split is stratified on y and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Split the combined features+target frame with the same seed, test size and
# stratification labels as the X/y split (presumably so both splits pick the same rows — verify).
train, test = train_test_split(
    new_data,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

(train.shape, test.shape)

In [None]:
### Let's import Auto_ViML
from autoviml.Auto_ViML import Auto_ViML

In [None]:
### Run Auto_ViML on the train/test frames with feature reduction, stacking and the
### imbalanced-class flag switched on; binning, KMeans featurizing and polynomial terms are off.
### NOTE(review): the earlier comment here said "Using CatBoost Model", but the call passes
### Boosting_Flag=True, which the Auto_ViML README describes as XGBoost — confirm which is intended.
target = 'fetal_health'
m, feats, trainm, testm = Auto_ViML(
    train,
    target,
    test,
    sample_submission='',
    scoring_parameter='balanced_accuracy',
    KMeans_Featurizer=False,
    hyper_param='RS',
    feature_reduction=True,
    Boosting_Flag=True,
    Binning_Flag=False,
    Add_Poly=0,
    Stacking_Flag=True,
    Imbalanced_Flag=True,
    verbose=1,
)

In [None]:
#### The 11 features that BorutaSHAP selected as important (hard-coded here) ####
boruta = [
    'histogram_min',
    'histogram_mean',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_short_term_variability',
    'uterine_contractions',
    'histogram_variance',
    'histogram_mode',
    'prolongued_decelerations',
    'abnormal_short_term_variability',
    'mean_value_of_long_term_variability',
    'accelerations',
]
len(boruta)

In [None]:
### Features that Auto_ViML returned as important (14 in the original author's run) ###
n_selected = len(feats)
print(n_selected)
feats

In [None]:
def left_subtract(l1,l2):
    """Return the items of ``l1`` that do not occur in ``l2``.

    The order of ``l1`` is kept and repeated items are not collapsed;
    membership is tested with ``in`` against ``l2``.
    """
    return [item for item in l1 if item not in l2]
# Features chosen by Auto_ViML that BorutaSHAP did not choose.
only_in_autoviml = left_subtract(feats, boruta)
only_in_autoviml

In [None]:
# Features chosen by BorutaSHAP that Auto_ViML did not choose.
only_in_boruta = left_subtract(boruta, feats)
only_in_boruta

In [None]:
# Original class labels -> zero-based codes, and the inverse lookup (code -> original label).
dictio = {1.0: 0, 2.0: 1, 3.0: 2}
reverse_dictio = {code: label for label, code in dictio.items()}
reverse_dictio

In [None]:
# Ground-truth labels come straight from the held-out frame.
y_true = test[target]

# Predict on the Auto_ViML-transformed test frame using only the selected features,
# flatten the result, then translate it back through reverse_dictio.
# NOTE(review): assumes the model emits the 0/1/2 codes that reverse_dictio expects — confirm.
raw_pred = m.predict(testm[feats]).ravel()
y_pred = pd.Series(raw_pred).map(reverse_dictio).values

(y_true.shape, y_pred.shape)

In [None]:
from autoviml.Auto_ViML import print_regression_metrics, print_classification_metrics

In [None]:
# Report Auto_ViML's classification metrics for the true test labels vs. the mapped predictions.
# NOTE(review): the meaning of the third positional argument (False) is not visible in this
# notebook — confirm against the Auto_ViML source before changing it.
print_classification_metrics(y_true, y_pred,False)

# **Model**

# **Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

In [None]:
# Confusion matrix of true test labels against the mapped predictions.
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
cm

In [None]:
def plot_confusion_matrix(cm, classes, normalized=True, cmap='bone'):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Note: this definition reuses the name ``plot_confusion_matrix`` that the
    notebook also imports from ``sklearn.metrics``; once this cell runs, the
    name refers to this function.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion-matrix counts.
    classes : sequence
        Tick labels applied to both the x and y axes.
    normalized : bool, default True
        If True, cell colours show each count divided by its row sum;
        if False, cell colours show the raw counts. The numbers written in
        the cells are always the raw counts.
    cmap : str, default 'bone'
        Colormap handed to ``sns.heatmap``.
    """
    cm = np.asarray(cm)
    plt.figure(figsize=[7, 6])
    norm_cm = cm
    if normalized:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with a zero total are left at 0 instead of producing NaN from 0/0.
        norm_cm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
    # Draw in both modes. Previously this call was nested under `if normalized:`,
    # so normalized=False produced an empty figure.
    sns.heatmap(norm_cm, annot=cm, fmt='g', xticklabels=classes, yticklabels=classes, cmap=cmap)
# Tick labels for the three fetal-health classes, in matrix order.
class_names = ['Normal', 'Suspect', 'Pathological']
plot_confusion_matrix(cm, class_names)

# **Classification Report**

In [None]:
# Text report of scikit-learn's classification metrics for the test predictions.
report = classification_report(y_true, y_pred)
print(report)

# **F1 Score**

In [None]:
# average=None returns one F1 score per class instead of a single aggregate.
per_class_f1 = f1_score(y_true, y_pred, average=None)
per_class_f1