In [None]:
# Import required base libraries.

import sys
import random
import time
import pickle

import IPython

import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import sklearn as sk
import plotly

from IPython import display
from pathlib import Path

# Parent of the current working directory.
main_path = Path("..")

# Collect the interpreter and library versions, then emit them in a single call.
version_report = [
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"Matplotlib version: {matplotlib.__version__}",
    f"Numpy version: {np.__version__}",
    f"Scipy version: {sp.__version__}",
    f"IPython version: {IPython.__version__}",
    f"Sklearn version: {sk.__version__}",
]
print("\n".join(version_report))


In [None]:
# Now import classification models and initialize visualisation tools.

# Algorithms.
from sklearn import (
    tree, linear_model, neighbors, naive_bayes, ensemble,
    discriminant_analysis, gaussian_process, neural_network,
    multiclass,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Preprocessing tools.
from sklearn.preprocessing import (
    OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler,
)
from sklearn.metrics import (
    classification_report, f1_score, precision_score,recall_score,
)

# Statistics
from scipy.stats import boxcox, zscore

# Collinearity analysis.
from statsmodels.stats.outliers_influence import  variance_inflation_factor

# Model selection tools.
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Copy models.
from copy import deepcopy
from sklearn import base

# Visualization tools.
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import express as px

# Seed passed as `random_state` to the estimators and splitters further down.
rnd_state = 42

import os

# Print the full path of every file found under /kaggle/input.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the fetal-health CSV and preview its first ten rows.
dataset_csv = "/kaggle/input/fetal-health-classification/fetal_health.csv"
data = pd.read_csv(dataset_csv, encoding="utf-8")
data.head(10)

In [None]:
# Let's see whether the classes are balanced.
data["fetal_health"].value_counts()

# Classes 2.0 and 3.0 are minorities.
# Predicting 1 for every row would already be right about 77% of the time.

In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# Summary statistics, one row per column.
data.describe().T

In [None]:
# Count missing values per column.
data.isna().sum()

# No null values.

In [None]:
# Remove exact duplicate rows and renumber the index from 0.
data = data.drop_duplicates(ignore_index=True)

data.info()

# 13 rows deleted.

In [None]:
# List the column names before renaming them.
data.columns

In [None]:
# Rename columns: remove the space from "baseline value" and call the target "class".
column_renames = {"baseline value": "baseline_value", "fetal_health": "class"}
data = data.rename(columns=column_renames)

# Feature names are all columns except the target.
columns = [name for name in data.columns if name != "class"]

data.head(5)

OK, we now know that the data consists entirely of numeric values.
There is no categorical column that needs to be converted to numeric values.

__Note: the class labels mean 1 -> Normal, 2 -> Suspect, 3 -> Pathological. There is no greater-than / less-than relation between them,
so I used dummy encoding.__

## See distributions for each feature.

In [None]:
# Histogram of baseline_value, coloured by class.
px.histogram(data_frame=data, x="baseline_value", color="class")

# Looks normally distributed.

In [None]:
# Histogram of accelerations, coloured by class.
px.histogram(data_frame=data, x="accelerations", color="class")

# Right skewed.

In [None]:
# Histogram of fetal_movement, coloured by class.
px.histogram(data_frame=data, x="fetal_movement", color="class")

# Right skewed.

In [None]:
# Histogram of uterine_contractions, coloured by class.
px.histogram(data_frame=data, x="uterine_contractions", color="class")

# Close to normal.

In [None]:
# Histogram of light_decelerations, coloured by class.
px.histogram(data_frame=data, x="light_decelerations", color="class")

# Right skewed.

In [None]:
# Histogram of prolongued_decelerations, coloured by class.
px.histogram(data_frame=data, x="prolongued_decelerations", color="class")

# Could perhaps be turned into a categorical feature.

In [None]:
# Histogram of abnormal_short_term_variability, coloured by class.
px.histogram(data_frame=data, x="abnormal_short_term_variability", color="class")

# Bimodal, also close to uniform.

In [None]:
# Histogram of mean_value_of_short_term_variability, coloured by class.
px.histogram(data_frame=data, x="mean_value_of_short_term_variability", color="class")

# Right skewed.

In [None]:
# Histogram of percentage_of_time_with_abnormal_long_term_variability, coloured by class.
px.histogram(data_frame=data, x="percentage_of_time_with_abnormal_long_term_variability", color="class")

# Right skewed.

In [None]:
# Histogram of mean_value_of_long_term_variability, coloured by class.
px.histogram(data_frame=data, x="mean_value_of_long_term_variability", color="class")

# Right skewed, but no transformation required.

In [None]:
# Histogram of histogram_width, coloured by class.
px.histogram(data_frame=data, x="histogram_width", color="class")

# Bimodal, and it looks mixed.

In [None]:
# Histogram of histogram_min, coloured by class.
px.histogram(data_frame=data, x="histogram_min", color="class")

# Uniform.

In [None]:
# Histogram of histogram_max, coloured by class.
px.histogram(data_frame=data, x="histogram_max", color="class")

# Normal.

In [None]:
# Histogram of histogram_number_of_peaks, coloured by class.
px.histogram(data_frame=data, x="histogram_number_of_peaks", color="class")

In [None]:
# Histogram of histogram_number_of_zeroes, coloured by class.
px.histogram(data_frame=data, x="histogram_number_of_zeroes", color="class")

# Right skewed, but I won't transform it.

In [None]:
# Histogram of histogram_mode, coloured by class.
px.histogram(data_frame=data, x="histogram_mode", color="class")

# Left-skewed normal.

In [None]:
# Histogram of histogram_mean, coloured by class.
px.histogram(data_frame=data, x="histogram_mean", color="class")

# Left-skewed normal. I won't transform it.

In [None]:
# Histogram of histogram_median, coloured by class.
px.histogram(data_frame=data, x="histogram_median", color="class")

# Left-skewed normal. Mode, mean and median look alike.

In [None]:
# Histogram of histogram_variance, coloured by class.
px.histogram(data_frame=data, x="histogram_variance", color="class")

# Highly right skewed. This shows less variance in histograms.

In [None]:
# Histogram of histogram_tendency, coloured by class.
px.histogram(data_frame=data, x="histogram_tendency", color="class")

# Highly right skewed. This shows less variance in histograms.
# NOTE(review): this remark is word-for-word the one written under histogram_variance;
# confirm it really describes histogram_tendency.

## Discriminant Analysis detects collinearity between independent variables.

In [None]:
# Transferred code for Variance Inflation Factor (VIF)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer


class ReduceVIF(BaseEstimator, TransformerMixin):
    """Iteratively drop the feature with the highest Variance Inflation Factor.

    Parameters
    ----------
    thresh : float, default=5.0
        Features are dropped, one at a time, while the largest VIF is above
        this value. Values between 5 and 10 are usually considered "okay";
        above 10 is too high.
    impute : bool, default=False
        The statsmodels VIF function fails on NaN values. When True, missing
        values are imputed before the VIFs are computed.
    impute_strategy : str, default='median'
        Strategy handed to ``SimpleImputer`` when ``impute`` is True.

    Attributes
    ----------
    imputer_ : SimpleImputer
        Fitted imputer; only present after ``fit`` when ``impute`` is True.
    """

    def __init__(self, thresh=5.0, impute=False, impute_strategy='median'):
        # Only store the constructor arguments: `get_params()` / `clone()` read
        # every __init__ parameter back as an attribute of the same name.
        self.thresh = thresh
        self.impute = impute
        self.impute_strategy = impute_strategy

    def fit(self, X, y=None):
        """Fit the optional imputer on ``X``; ``y`` is ignored. Returns ``self``."""
        print('ReduceVIF fit')
        if self.impute:
            # `sklearn.preprocessing.Imputer` no longer exists; SimpleImputer replaces it.
            self.imputer_ = SimpleImputer(strategy=self.impute_strategy).fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` (a DataFrame) without the columns whose VIF exceeds ``thresh``."""
        print('ReduceVIF transform')
        if self.impute:
            # The imputer returns a bare array: restore both the column names and the index.
            X = pd.DataFrame(self.imputer_.transform(X), columns=X.columns, index=X.index)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        """Drop the highest-VIF column of ``X`` until no VIF is above ``thresh``.

        With fewer than two columns there is nothing to regress against, so the
        frame is returned unchanged instead of failing on an empty VIF list.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        # NOTE(review): no intercept column is added before calling
        # variance_inflation_factor; confirm this matches the intended VIF definition.
        while X.shape[1] > 1:
            values = X.values
            vif = [variance_inflation_factor(values, position) for position in range(values.shape[1])]

            max_vif = max(vif)
            # Written as `not >` so a NaN maximum stops the loop, as `max_vif > thresh` did.
            if not max_vif > thresh:
                break

            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
            X = X.drop([X.columns[maxloc]], axis=1)
        return X

In [None]:
# Run the VIF filter over every feature column and list the columns that survive.
transformer = ReduceVIF()

feature_frame = data[columns]
target = data["class"]
correlated_features_dropped_df = transformer.fit_transform(feature_frame, target)

correlated_features_dropped_df.columns

In [None]:
# Detect important features.
# The raw labels are 1.0 / 2.0 / 3.0, but recent XGBoost releases only accept class
# labels numbered 0..n_classes-1, so encode them before fitting. The encoding does
# not affect the feature importances.
feature_selector = XGBClassifier(random_state=rnd_state)
feature_selector.fit(data[columns], LabelEncoder().fit_transform(data["class"]))

# Create a dataframe for visualization and selection.
# "Is_Correlated" is left empty here and filled in by the next cell.
feature_selection_df = pd.DataFrame(columns=["Feature_Name", "Importance", "Cumulative_Importance", "Is_Correlated"])
feature_selection_df["Feature_Name"] = list(columns)
feature_selection_df["Importance"] = feature_selector.feature_importances_
# Most important feature first, so the cumulative sum reads top-down.
feature_selection_df = feature_selection_df.sort_values(by="Importance", ascending=False)
feature_selection_df["Cumulative_Importance"] = feature_selection_df["Importance"].cumsum()
feature_selection_df

In [None]:
# A feature counts as "correlated" when the VIF filter removed it, i.e. its name is
# absent from the columns that survived ReduceVIF.
kept_after_vif = list(correlated_features_dropped_df.columns)
feature_selection_df.loc[:, "Is_Correlated"] = ~feature_selection_df["Feature_Name"].isin(kept_after_vif)

feature_selection_df = feature_selection_df.reset_index(drop=True)
feature_selection_df

* __mean_value_of_short_term_variability__ has an importance value of .25. It can't be discarded.  
* __histogram_mean__ can't be discarded.  
* __abnormal_short_term_variability__ has a VIF of 5.66.
* __histogram_max__, __histogram_mode__ and __baseline_value__ have high VIF values.
* __histogram_width__ has an __inf__ VIF value.
* __histogram_min__ and __histogram_median__ have high VIF values.

In [None]:
# Keep the hand-picked features.
# NOTE(review): these are row positions in feature_selection_df, so they depend on
# the row order of that frame at the time this cell runs.
selected_positions = [0, 1, 2, 3, 4, 5, 6, 10, 11, 14, 15, 17]
columns = feature_selection_df["Feature_Name"].iloc[selected_positions].tolist()
columns

## Test models and pick the best one.

In [None]:
# Candidate classifiers, all compared with the same stratified 3-fold cross-validation.
# NOTE(review): the features are passed in unscaled; confirm that is intended for the
# distance- and gradient-based models (KNN, Gaussian process, MLP, logistic regression).
ml_algorithms = [

    # Ensemble Methods.
    ensemble.AdaBoostClassifier(random_state=rnd_state),
    ensemble.BaggingClassifier(random_state=rnd_state),
    ensemble.ExtraTreesClassifier(random_state=rnd_state),
    ensemble.GradientBoostingClassifier(random_state=rnd_state),
    ensemble.RandomForestClassifier(random_state=rnd_state),
    XGBClassifier(random_state=rnd_state),
    LGBMClassifier(random_state=rnd_state),

    # Gaussian Processes.
    gaussian_process.GaussianProcessClassifier(random_state=rnd_state),

    # Generalized Linear Methods.
    linear_model.LogisticRegressionCV(random_state=rnd_state),

    # Naive Bayes.
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor.
    neighbors.KNeighborsClassifier(),

    # Trees.
    tree.DecisionTreeClassifier(random_state=rnd_state, max_depth=16, min_samples_leaf=1, min_samples_split=.2),
    tree.ExtraTreeClassifier(random_state=rnd_state),

    # Discriminant Analysis.
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    # Neural Networks.
    neural_network.MLPClassifier(max_iter=300, random_state=rnd_state),

    # Stacked Methods.
    ensemble.StackingClassifier(
        estimators=[
            ("adaboost", ensemble.AdaBoostClassifier(random_state=rnd_state)),
            ("gradient", ensemble.GradientBoostingClassifier(random_state=rnd_state)),
            ("knn", neighbors.KNeighborsClassifier(n_jobs=-1))
        ],
        final_estimator=linear_model.LogisticRegression(random_state=rnd_state)
    )
]

cv_split = model_selection.StratifiedKFold(n_splits=3, shuffle=True, random_state=rnd_state)

ml_columns = [
    "Algorithm_Name",
    "Algorithm_Parameters",
    "Train_Balanced_Accuracy_Mean",
    "Test_Balanced_Accuracy_Mean",
    "Train_F1_Weighted_Mean",
    "Test_F1_Weighted_Mean",
    "Train_AUC_ROC_OVR_Weighted_Mean",
    "Test_AUC_ROC_OVR_Weighted_Mean",
    "Train_Balanced_Accuracy",
    "Test_Balanced_Accuracy",
    "Test_Accuracy_3*STD",
    "Time (Mean)",
]

data_used = data[columns]
# Encode the 1.0 / 2.0 / 3.0 labels as 0 / 1 / 2: recent XGBoost releases reject class
# labels that are not numbered from 0, which would make its folds fail. The metrics
# used below do not depend on how the classes are numbered.
labels = LabelEncoder().fit_transform(data["class"])

# Collect one record per algorithm and build the frame once, instead of growing an
# empty DataFrame cell by cell with `.loc`.
ml_records = []
for alg in ml_algorithms:
    # Cross validation.
    cv_results = model_selection.cross_validate(
        alg,
        data_used,
        labels,
        cv=cv_split,
        return_train_score=True,
        scoring=[
            "balanced_accuracy",
            "f1_weighted",
            "roc_auc_ovr_weighted",
        ],
    )

    train_balanced = cv_results["train_balanced_accuracy"]
    test_balanced = cv_results["test_balanced_accuracy"]

    ml_records.append({
        "Algorithm_Name": alg.__class__.__name__,
        "Algorithm_Parameters": str(alg.get_params()),
        "Train_Balanced_Accuracy_Mean": train_balanced.mean(),
        "Test_Balanced_Accuracy_Mean": test_balanced.mean(),
        "Train_F1_Weighted_Mean": cv_results["train_f1_weighted"].mean(),
        "Test_F1_Weighted_Mean": cv_results["test_f1_weighted"].mean(),
        "Train_AUC_ROC_OVR_Weighted_Mean": cv_results["train_roc_auc_ovr_weighted"].mean(),
        "Test_AUC_ROC_OVR_Weighted_Mean": cv_results["test_roc_auc_ovr_weighted"].mean(),
        # Per-fold scores, kept as text for display.
        "Train_Balanced_Accuracy": str(train_balanced),
        "Test_Balanced_Accuracy": str(test_balanced),
        # Worst case scenario.
        "Test_Accuracy_3*STD": test_balanced.std() * 3,
        "Time (Mean)": cv_results["fit_time"].mean(),
    })

# Best algorithm first: rank by test AUC, then test weighted F1, then test balanced accuracy.
ml_compare = pd.DataFrame(ml_records, columns=ml_columns).sort_values(
    by=["Test_AUC_ROC_OVR_Weighted_Mean", "Test_F1_Weighted_Mean", "Test_Balanced_Accuracy_Mean"],
    ascending=False,
)
ml_compare

# NOTE: Average precision can be added for evaluation.

In [None]:
# Show the best-ranked algorithm: row 0 after ml_compare was sorted (descending) by
# test AUC, then test weighted F1, then test balanced accuracy.
ml_compare.iloc[0]

In [None]:
# Create a stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data[columns],
    data["class"],
    test_size=.3,
    train_size=.7,
    random_state=rnd_state,
    shuffle=True,
    stratify=data["class"].values
)

# One-vs-rest LightGBM with balanced class weights (otherwise default parameters).
classifier = multiclass.OneVsRestClassifier(LGBMClassifier(class_weight="balanced", random_state=rnd_state))
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
# Class probabilities are needed for the one-vs-rest ROC AUC.
probabilities = classifier.predict_proba(X_test)

# Every metric below is computed on the held-out test set for THIS classifier.
# AUC and F1 used to be copied from the cross-validation table (ml_compare.iloc[0]),
# i.e. they described a different model and a different evaluation protocol.
precision = precision_score(y_test, predictions, average="weighted")
recall = recall_score(y_test, predictions, average="weighted")
auc_roc = metrics.roc_auc_score(y_test, probabilities, multi_class="ovr", average="weighted")
f1_weighted = f1_score(y_test, predictions, average="weighted")

print(f"AUC ROC: {auc_roc}, F1 - Score: {f1_weighted}, Precision Score: {precision}, Recall Score: {recall}")

In [None]:
# Per-class precision / recall / F1 on the test set, with readable class names.
class_names = ["Normal", "Suspect", "Pathological"]
report = classification_report(y_test, predictions, target_names=class_names)
print(report)