## Imports

In [1]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, \
                            recall_score, accuracy_score

%matplotlib inline

Import the `fetal_health.csv` data set, which is available
[here, via kaggle](https://www.kaggle.com/datasets/andrewmvd/fetal-health-classification)  
<sup>kaggle data set was obtained via this research - 
[Ayres de Campos et al. (2000) SisPorto 2.0 A Program for Automated Analysis of Cardiotocograms. J Matern Fetal Med 5:311-318](https://onlinelibrary.wiley.com/doi/10.1002/1520-6661(200009/10)9:5%3C311::AID-MFM12%3E3.0.CO;2-9)</sup>

In [2]:
fetal_df = pd.read_csv('./data/fetal_health.csv')

In [21]:
fetal_df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [18]:
print(fetal_df.info(), \
    f"""\n\n
    Data shape: {fetal_df.shape}
    Total nulls in data set: {fetal_df.isna().sum().sum()}
    """)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

There are no nulls in this data set ... may not need to impute any data

all data is float64, may have some categories to pull out (such as
fetal health)

In [30]:
# display the columns that seem like they could be made
# into categorical variables, possibly at least 

print(f"""
{fetal_df.value_counts('fetal_health')}

{fetal_df.value_counts('histogram_number_of_peaks')}

{fetal_df.value_counts('histogram_number_of_zeroes')}
""")


fetal_health
1.0    1655
2.0     295
3.0     176
dtype: int64

histogram_number_of_peaks
1.0     357
2.0     331
3.0     269
4.0     258
5.0     210
6.0     158
7.0     145
0.0     107
8.0     106
9.0      67
10.0     49
11.0     28
12.0     22
13.0     10
14.0      5
16.0      2
15.0      1
18.0      1
dtype: int64

histogram_number_of_zeroes
0.0     1624
1.0      366
2.0      108
3.0       21
5.0        2
4.0        2
10.0       1
8.0        1
7.0        1
dtype: int64



The only column that seems to be able to be broken down into
a categorical value is the fetal_health column. 

### Checking the values

In [40]:
print(fetal_df.value_counts('fetal_health') , '\n\n' , fetal_df.value_counts('fetal_health', normalize=True))

fetal_health
1.0    1655
2.0     295
3.0     176
dtype: int64 

 fetal_health
1.0    0.778457
2.0    0.138758
3.0    0.082785
dtype: float64
