# Fetal Health Classification
Reduction of child mortality is reflected in several of the United Nations' Sustainable Development Goals and is a key indicator of human progress.
The UN expects that, by 2030, countries will end preventable deaths of newborns and children under 5 years of age, with all countries aiming to reduce under‑5 mortality to at least as low as 25 per 1,000 live births.

Parallel to the notion of child mortality is, of course, maternal mortality, which accounted for 295,000 deaths during and following pregnancy and childbirth (as of 2017). The vast majority of these deaths (94%) occurred in low-resource settings, and most could have been prevented.

Dataset from Kaggle: [Fetal Health Classification](https://www.kaggle.com/andrewmvd/fetal-health-classification/tasks?taskId=2410)


#### What is a Cardiotocogram (CTG) exam?
Cardiotocograms (CTGs) are a simple and cost-accessible option for assessing fetal health, allowing healthcare professionals to take action to prevent child and maternal mortality. The equipment works by sending ultrasound pulses and reading their response, thus shedding light on fetal heart rate (FHR), fetal movements, uterine contractions and more.
([see more about Cardiotocography](https://patient.info/pregnancy/cardiotocography)).

## 1. Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
# NOTE(review): plot_confusion_matrix is not used in the cells visible here and
# was removed in scikit-learn 1.2 -- confirm, then drop it from this import.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, auc
from numpy import interp
from statistics import mean
from sklearn.metrics import average_precision_score

# Silence all warnings for the rest of the notebook (this also hides
# deprecation notices from the libraries above).
import warnings
warnings.filterwarnings('ignore')

## 2. Load Dataset

In [None]:
# Read the fetal-health CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../input/fetal-health-classification/fetal_health.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Summary statistics for each numeric column.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Store the target column as integers; later cells binarize it with classes 1, 2, 3.
df['fetal_health'] = df['fetal_health'].astype('int')

In [None]:
# Bar chart of how many rows fall into each fetal_health class.
# The Series is passed as `x=` explicitly: newer seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=df['fetal_health']);
plt.xlabel('fetal health outcome')
plt.ylabel('count')

# Same information as exact numbers, to quantify the class imbalance.
print(df['fetal_health'].value_counts())

In [None]:
# Horizontal box plot of every column, drawn on one shared axis.
sns.boxplot(data=df, orient='h');

In [None]:
# Distribution of 'baseline value' within each fetal_health class.
sns.violinplot(x=df['fetal_health'], y=df['baseline value']);

In [None]:
# Distribution of 'accelerations' within each fetal_health class.
sns.violinplot(x=df['fetal_health'], y=df['accelerations']);

In [None]:
# Distribution of 'fetal_movement' within each fetal_health class.
sns.violinplot(x=df['fetal_health'], y=df['fetal_movement']);

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df
# (the target column is included).
plt.figure(figsize=(20, 10))
sns.heatmap(data=df.corr(), annot=True)
plt.show()

## 4. Model Building 

### Train-test Split

In [None]:
# Separate the feature columns from the target labels.
X = df.drop(['fetal_health'], axis=1)
y = df['fetal_health'].values

# Hold out 30% of the rows, stratified on the label so both parts keep the
# class proportions. The fixed seed makes the split (and every score computed
# from it below) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print('Shape of X_train:', X_train.shape)
print('Shape of X_test:', X_test.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_test:', y_test.shape)

In [None]:
scaler = StandardScaler()

# Learn the per-feature mean and variance from the training rows only...
X_train_scaled = scaler.fit_transform(X_train)
# ...and apply those same statistics to the held-out rows. Refitting on the
# test set would scale it differently from the data the models were trained
# on and would leak test-set information into preprocessing.
X_test_scaled = scaler.transform(X_test)

### Logistic Regression

In [None]:
# Logistic regression wrapped in a one-vs-rest classifier, fitted on the
# scaled training features.
# NOTE(review): multi_class='multinomial' is set on the inner estimator even
# though it is wrapped in OneVsRestClassifier -- confirm this is intended.
lr = OneVsRestClassifier(LogisticRegression(multi_class='multinomial', solver='newton-cg'))
lr.fit(X_train_scaled, y_train)

# Accuracy on the same rows the model was fitted on.
y_pred_train = lr.predict(X_train_scaled)
y_train_acc = accuracy_score(y_pred_train, y_train)
print('Logistic Regression Training Accuracy Score: {:.4f}'.format(y_train_acc))

# Accuracy on the held-out rows; y_pred_lr is reused by the report cell.
y_pred_lr = lr.predict(X_test_scaled)
accuracy = accuracy_score(y_pred_lr, y_test)
print('Logistic Regression Accuracy Score: {:.4f}'.format(accuracy))

In [None]:
# Per-class precision / recall / F1 for the logistic-regression test predictions.
print('Logistic Regression Model')
print(classification_report(y_test, y_pred_lr))

# Confusion matrix divided by its grand total, so each cell is that cell's
# share of all test samples (not a per-row rate).
cf_matrix = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True, fmt='.2%', cmap='Blues')
plt.title('Logistic Regression', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### XGBoost

In [None]:
# XGBoost with default parameters wrapped in a one-vs-rest classifier, fitted
# on the scaled training features.
xgb = OneVsRestClassifier(XGBClassifier())
xgb.fit(X_train_scaled, y_train)

# Accuracy on the same rows the model was fitted on.
y_pred_train = xgb.predict(X_train_scaled)
y_train_acc = accuracy_score(y_pred_train, y_train)
print('XGBoost Training Accuracy Score: {:.4f}'.format(y_train_acc))

# Accuracy on the held-out rows; y_pred_xgb is reused by the report cell.
y_pred_xgb = xgb.predict(X_test_scaled)
accuracy = accuracy_score(y_pred_xgb, y_test)
print('XGBoost Accuracy Score: {:.4f}'.format(accuracy))

In [None]:
# Per-class precision / recall / F1 for the XGBoost test predictions.
print('XGBoost Model')
print(classification_report(y_test, y_pred_xgb))

# Confusion matrix divided by its grand total, so each cell is that cell's
# share of all test samples (not a per-row rate).
cf_matrix = confusion_matrix(y_test, y_pred_xgb)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True, fmt='.2%', cmap='Greens')
plt.title('XGBoost', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 5. Stratified KFold Cross-validation

In [None]:
def get_skf_test_pred(X, y, train_index, test_index, model):
    """
    Fit ``model`` on one fold's training rows and predict its test rows.

    Parameters
    ----------
    X : array-like
        Feature table; a NumPy array or a pandas DataFrame.
    y : array-like
        Labels aligned row-by-row with ``X``.
    train_index, test_index : array of int
        Positional row indices of the fold, e.g. as yielded by
        ``StratifiedKFold.split``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit is overwritten.

    Returns
    -------
    tuple
        ``(y_test, y_pred_skf)``: the true labels of the test rows and the
        model's predictions for them.
    """
    # Coerce to arrays so the fold indices are always positional; indexing a
    # DataFrame with an integer array would be read as a column lookup.
    features = np.asarray(X)
    targets = np.asarray(y)

    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    model.fit(X_train, y_train)
    y_pred_skf = model.predict(X_test)
    return y_test, y_pred_skf

### Logistic Regression with SKF

In [None]:
def skf_accuracy(X, y, model, n_splits=5, random_state=None):
    """
    Return the mean accuracy of ``model`` over a stratified K-fold split.

    Features are standardised inside every fold: the scaler is fitted on that
    fold's training rows only and then applied to its test rows, so the
    held-out rows never influence preprocessing. The accuracy of each fold is
    printed as it is computed.

    Parameters
    ----------
    X : array-like
        Numeric feature table (NumPy array or pandas DataFrame).
    y : array-like
        Class labels aligned row-by-row with ``X``.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refitted on every fold.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an integer for repeatable folds.

    Returns
    -------
    float
        Mean of the per-fold accuracies.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    f_accuracy_list = []
    for fold_no, (train_index, test_index) in enumerate(skf.split(features, targets), start=1):
        # Build a fold-specific scaled copy of the data. The train and test
        # indices together cover every row, so X_fold is fully populated.
        fold_scaler = StandardScaler()
        X_fold = np.empty_like(features)
        X_fold[train_index] = fold_scaler.fit_transform(features[train_index])
        X_fold[test_index] = fold_scaler.transform(features[test_index])

        y_test, y_pred_skf = get_skf_test_pred(X_fold, targets, train_index, test_index, model)
        f_accuracy = accuracy_score(y_test, y_pred_skf)
        f_accuracy_list.append(f_accuracy)
        print(f'Fold {fold_no} Accuracy: {f_accuracy:.4f}')
    return mean(f_accuracy_list)

print(f'Mean Stratified KFold Accuracy: {skf_accuracy(X, y, lr):.4f}')

In [None]:
# Convert the feature DataFrame to a NumPy array: sum_cm passes X straight to
# get_skf_test_pred, which selects rows with X[train_index].
X = X.to_numpy()

In [None]:
def sum_cm(X, y, model, n_splits=5, random_state=None):
    """
    Collect one confusion matrix per fold of a stratified K-fold split.

    Features are standardised inside every fold (scaler fitted on the fold's
    training rows only), matching how the per-fold accuracy is computed
    elsewhere in this notebook.

    Parameters
    ----------
    X : array-like
        Numeric feature table (NumPy array or pandas DataFrame).
    y : array-like
        Class labels aligned row-by-row with ``X``.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refitted on every fold.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling; pass an integer for repeatable folds.

    Returns
    -------
    list of ndarray
        One confusion matrix per fold (rows: actual, columns: predicted), all
        with the same shape and label order.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y)
    # Pin the label order so every fold yields a matrix of identical shape,
    # even if a class happens to be missing from a fold's predictions.
    labels = np.unique(targets)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    confusion_list = []
    for train_index, test_index in skf.split(features, targets):
        # Fold-specific scaled copy; train and test indices cover every row.
        fold_scaler = StandardScaler()
        X_fold = np.empty_like(features)
        X_fold[train_index] = fold_scaler.fit_transform(features[train_index])
        X_fold[test_index] = fold_scaler.transform(features[test_index])

        y_test, y_pred_skf = get_skf_test_pred(X_fold, targets, train_index, test_index, model)
        confusion_list.append(confusion_matrix(y_test, y_pred_skf, labels=labels))
    return confusion_list

def get_mean_cm(confusion_list):
    """
    Return the element-wise mean of a list of confusion matrices.

    Parameters
    ----------
    confusion_list : sequence of array-like
        Confusion matrices that all share one shape (any number of classes).

    Returns
    -------
    ndarray of float
        Matrix of the same shape holding the mean of each cell.

    Raises
    ------
    ValueError
        If ``confusion_list`` is empty, since the mean is undefined.
    """
    if len(confusion_list) == 0:
        raise ValueError('confusion_list must contain at least one confusion matrix')
    # Stack into one (n_matrices, n_classes, n_classes) array and average over
    # the first axis; this works for any number of classes.
    return np.mean(np.asarray(confusion_list, dtype=float), axis=0)

In [None]:
# Per-fold confusion matrices for logistic regression, averaged cell by cell.
confusion_list = sum_cm(X, y, lr)
confusion_mean = get_mean_cm(confusion_list)

# Divide by the grand total so each cell is shown as a share of all samples.
sns.heatmap(confusion_mean/np.sum(confusion_mean), annot=True, fmt='.2%', cmap='Blues')
plt.title('Logistic Regression with SKF', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### XGBoost with SKF

In [None]:
# Mean stratified K-fold accuracy for the XGBoost model (per-fold values are
# printed by skf_accuracy itself).
print(f'Mean Stratified KFold Accuracy: {skf_accuracy(X, y, xgb):.4f}')

In [None]:
# Per-fold confusion matrices for XGBoost, averaged cell by cell.
confusion_list = sum_cm(X, y, xgb)
confusion_mean = get_mean_cm(confusion_list)

# Divide by the grand total so each cell is shown as a share of all samples.
sns.heatmap(confusion_mean/np.sum(confusion_mean), annot=True, fmt='.2%', cmap='Greens')
plt.title('XGBoost with SKF', fontsize=15)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 6. ROC Curve and AUC

### Logistic Regression

In [None]:
# Shared 5-fold splitter (shuffled, unseeded) used by the ROC / PR calls below.
skf = StratifiedKFold(n_splits=5, shuffle=True)
# NOTE(review): the scaler is fitted on every row before the folds are formed,
# so each fold's held-out rows contribute to the scaling statistics -- confirm
# this is acceptable for the reported curves.
X_scaled = scaler.fit_transform(X)

In [None]:
def ROC_and_PR_curve(model, skf, X_scaled, class_index=1):
    """
    Plot cross-validated ROC and precision-recall curves for one class.

    The notebook-level label vector ``y`` is binarized into one indicator
    column per class (classes 1, 2, 3). For every fold produced by ``skf`` the
    model is refitted on the indicator targets, and the ROC and
    precision-recall curves of the column selected by ``class_index`` are
    drawn. The left panel also shows the mean ROC curve across folds; the mean
    ROC AUC and the mean weighted F1 score are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba`` that accepts
        a 2-D indicator matrix as target. It is refitted in place per fold.
    skf : cross-validation splitter
        Object exposing ``split(X, y)`` that yields train/test index arrays.
    X_scaled : ndarray
        Feature matrix, indexed positionally with the fold indices.
    class_index : int, default 1
        Column of the binarized labels (0, 1 or 2) whose curves are plotted;
        the default keeps the column this function has always used.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # NOTE: reads the global ``y`` defined earlier in the notebook.
    y_binarized = preprocessing.label_binarize(y, classes=[1, 2, 3])

    plt.figure(figsize=(15, 7))
    plt.subplot(1, 2, 1)

    tprs = []
    mean_fpr = np.linspace(0, 1, 100)
    all_f1_score = []
    precision_dict = {}
    recall_dict = {}

    # argmax(1) turns the indicator rows back into a single class id per
    # sample, which is what the splitter stratifies on.
    for idx, (train_index, test_index) in enumerate(skf.split(X_scaled, y_binarized.argmax(1))):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y_binarized[train_index], y_binarized[test_index]

        model = model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        prediction_proba = model.predict_proba(X_test)

        # Each ROC fold
        fpr, tpr, thresholds = roc_curve(y_test[:, class_index], prediction_proba[:, class_index])
        # Resample onto the common FPR grid so folds can be averaged, and pin
        # the first point to the origin: several ROC points can share fpr == 0,
        # and interpolation would otherwise start the curve above zero.
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC Fold %d (AUC = %0.2f)' % (idx, roc_auc))

        # f1 Score (computed over all indicator columns, not just class_index)
        f1score = f1_score(y_test, y_pred, average='weighted')
        all_f1_score.append(f1score)

        # Precision and Recall
        precision_dict[idx], recall_dict[idx], _ = precision_recall_curve(
            y_test[:, class_index], prediction_proba[:, class_index]
        )

    # Mean ROC; every ROC curve ends at (1, 1), so pin the last point there.
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Mean ROC: ', mean_auc)
    plt.plot(mean_fpr, mean_tpr, lw=2, alpha=1, color='blue', label='Mean ROC(AUC=%0.2f)' % (mean_auc))

    # f1 Score
    mean_f1_score = mean(all_f1_score)
    print('Mean f1-score: ', mean_f1_score)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='gray', label='Chance')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.grid()
    plt.legend(loc='lower right')

    # PR curve
    plt.subplot(1, 2, 2)

    for idx, precision in precision_dict.items():
        plt.plot(recall_dict[idx], precision, lw=2, alpha=0.3, label='Fold %d' % idx)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.grid()
    plt.legend(loc='best')
    plt.show()

ROC_and_PR_curve(lr, skf, X_scaled);

### XGBoost 

In [None]:
# Same ROC / precision-recall plots for the XGBoost model, reusing the splitter
# and scaled features defined above.
ROC_and_PR_curve(xgb, skf, X_scaled);