In [1]:
# Create reference for CSV file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Read CSV file into a dataframe
celiac_path = "resources/celiac_disease_lab_data.csv"

celiac_df = pd.read_csv(celiac_path, low_memory = False)

# Print the first 5 rows
celiac_df.head(5)

Unnamed: 0,Age,Gender,Diabetes,Diabetes Type,Diarrhoea,Abdominal,Short_Stature,Sticky_Stool,Weight_loss,IgA,IgG,IgM,Marsh,cd_type,Disease_Diagnose
0,10,Male,Yes,Type 1,inflammatory,yes,PSS,no,no,1.3,10.0,1.0,marsh type 0,potential,yes
1,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.5,12.5,1.3,marsh type 3a,atypical,yes
2,8,Female,Yes,Type 1,watery,yes,Variant,yes,yes,0.4,8.0,0.5,marsh type 1,latent,yes
3,10,Male,Yes,Type 1,watery,yes,PSS,no,no,0.98,9.0,0.66,marsh type 3a,silent,yes
4,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.0,10.5,1.1,marsh type 1,latent,yes


In [3]:
# Split the features and target data and copy the dataframe of X
y = celiac_df['Disease_Diagnose']
X = celiac_df.drop(columns='Disease_Diagnose')
celiac_sans_y_df = X

celiac_sans_y_df.head(20)

Unnamed: 0,Age,Gender,Diabetes,Diabetes Type,Diarrhoea,Abdominal,Short_Stature,Sticky_Stool,Weight_loss,IgA,IgG,IgM,Marsh,cd_type
0,10,Male,Yes,Type 1,inflammatory,yes,PSS,no,no,1.3,10.0,1.0,marsh type 0,potential
1,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.5,12.5,1.3,marsh type 3a,atypical
2,8,Female,Yes,Type 1,watery,yes,Variant,yes,yes,0.4,8.0,0.5,marsh type 1,latent
3,10,Male,Yes,Type 1,watery,yes,PSS,no,no,0.98,9.0,0.66,marsh type 3a,silent
4,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.0,10.5,1.1,marsh type 1,latent
5,8,Female,Yes,Type 1,fatty,yes,Variant,yes,yes,1.1,9.5,1.0,marsh type 3a,silent
6,9,Male,Yes,Type 1,watery,yes,Variant,yes,yes,2.1,11.4,1.0,marsh type 2,typical
7,5,Female,Yes,Type 1,fatty,yes,PSS,yes,yes,0.8,12.0,0.98,marsh type 1,latent
8,6,Female,Yes,Type 1,fatty,yes,PSS,yes,yes,1.5,8.0,1.1,marsh type 3b,silent
9,4,Male,Yes,Type 1,watery,yes,Variant,yes,yes,0.42,11.5,1.0,marsh type 2,typical


In [4]:
# Encode the features dataset's categorical variables using get_dummies
X = pd.get_dummies(X)

# Review the features DataFrame
X.head(20)

Unnamed: 0,Age,IgA,IgG,IgM,Gender_Female,Gender_Male,Diabetes_Yes,Diabetes_no,Diabetes Type_Type 1,Diabetes Type_Type 2,...,Marsh_marsh type 3a,Marsh_marsh type 3b,Marsh_marsh type 3c,Marsh_none,cd_type_atypical,cd_type_latent,cd_type_none,cd_type_potential,cd_type_silent,cd_type_typical
0,10,1.3,10.0,1.0,False,True,True,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1,9,1.5,12.5,1.3,False,True,True,False,True,False,...,True,False,False,False,True,False,False,False,False,False
2,8,0.4,8.0,0.5,True,False,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
3,10,0.98,9.0,0.66,False,True,True,False,True,False,...,True,False,False,False,False,False,False,False,True,False
4,9,1.0,10.5,1.1,False,True,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
5,8,1.1,9.5,1.0,True,False,True,False,True,False,...,True,False,False,False,False,False,False,False,True,False
6,9,2.1,11.4,1.0,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,True
7,5,0.8,12.0,0.98,True,False,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
8,6,1.5,8.0,1.1,True,False,True,False,True,False,...,False,True,False,False,False,False,False,False,True,False
9,4,0.42,11.5,1.0,False,True,True,False,True,False,...,False,False,False,False,False,False,False,False,False,True


In [5]:
X.dtypes


Age                         int64
IgA                       float64
IgG                       float64
IgM                       float64
Gender_Female                bool
Gender_Male                  bool
Diabetes_Yes                 bool
Diabetes_no                  bool
Diabetes Type_Type 1         bool
Diabetes Type_Type 2         bool
Diarrhoea_fatty              bool
Diarrhoea_inflammatory       bool
Diarrhoea_watery             bool
Abdominal_no                 bool
Abdominal_yes                bool
Short_Stature_DSS            bool
Short_Stature_PSS            bool
Short_Stature_Variant        bool
Sticky_Stool_no              bool
Sticky_Stool_yes             bool
Weight_loss_no               bool
Weight_loss_yes              bool
Marsh_marsh type 0           bool
Marsh_marsh type 1           bool
Marsh_marsh type 2           bool
Marsh_marsh type 3a          bool
Marsh_marsh type 3b          bool
Marsh_marsh type 3c          bool
Marsh_none                   bool
cd_type_atypic

In [6]:
X.info

<bound method DataFrame.info of       Age   IgA   IgG   IgM  Gender_Female  Gender_Male  Diabetes_Yes  \
0      10  1.30  10.0  1.00          False         True          True   
1       9  1.50  12.5  1.30          False         True          True   
2       8  0.40   8.0  0.50           True        False          True   
3      10  0.98   9.0  0.66          False         True          True   
4       9  1.00  10.5  1.10          False         True          True   
...   ...   ...   ...   ...            ...          ...           ...   
2201    3  1.10   7.0  0.60          False         True          True   
2202   11  1.40  12.0  1.10           True        False          True   
2203   15  0.34  10.0  0.60          False         True          True   
2204    7  2.10   8.5  2.10           True        False         False   
2205   12  1.90  11.1  1.10           True        False          True   

      Diabetes_no  Diabetes Type_Type 1  Diabetes Type_Type 2  ...  \
0           False    

In [7]:
X.describe()

Unnamed: 0,Age,IgA,IgG,IgM
count,2206.0,2206.0,2206.0,2206.0
mean,12.768812,1.427384,10.052901,1.236963
std,7.43425,1.110042,2.047683,0.447216
min,1.0,0.34,5.0,0.5
25%,8.0,1.0,8.7,0.98
50%,10.0,1.1,10.0,1.1
75%,15.0,1.8,12.0,1.5
max,35.0,9.0,15.3,2.7


In [8]:
X.isna().sum()

Age                       0
IgA                       0
IgG                       0
IgM                       0
Gender_Female             0
Gender_Male               0
Diabetes_Yes              0
Diabetes_no               0
Diabetes Type_Type 1      0
Diabetes Type_Type 2      0
Diarrhoea_fatty           0
Diarrhoea_inflammatory    0
Diarrhoea_watery          0
Abdominal_no              0
Abdominal_yes             0
Short_Stature_DSS         0
Short_Stature_PSS         0
Short_Stature_Variant     0
Sticky_Stool_no           0
Sticky_Stool_yes          0
Weight_loss_no            0
Weight_loss_yes           0
Marsh_marsh type 0        0
Marsh_marsh type 1        0
Marsh_marsh type 2        0
Marsh_marsh type 3a       0
Marsh_marsh type 3b       0
Marsh_marsh type 3c       0
Marsh_none                0
cd_type_atypical          0
cd_type_latent            0
cd_type_none              0
cd_type_potential         0
cd_type_silent            0
cd_type_typical           0
dtype: int64

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2206 entries, 0 to 2205
Data columns (total 35 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     2206 non-null   int64  
 1   IgA                     2206 non-null   float64
 2   IgG                     2206 non-null   float64
 3   IgM                     2206 non-null   float64
 4   Gender_Female           2206 non-null   bool   
 5   Gender_Male             2206 non-null   bool   
 6   Diabetes_Yes            2206 non-null   bool   
 7   Diabetes_no             2206 non-null   bool   
 8   Diabetes Type_Type 1    2206 non-null   bool   
 9   Diabetes Type_Type 2    2206 non-null   bool   
 10  Diarrhoea_fatty         2206 non-null   bool   
 11  Diarrhoea_inflammatory  2206 non-null   bool   
 12  Diarrhoea_watery        2206 non-null   bool   
 13  Abdominal_no            2206 non-null   bool   
 14  Abdominal_yes           2206 non-null   

In [10]:
y.value_counts()

Disease_Diagnose
yes    1843
no      363
Name: count, dtype: int64