In [1]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

In [2]:
# Location of the celiac disease lab data, relative to the working directory
celiac_path = "resources/celiac_disease_lab_data.csv"

# Load the CSV into a DataFrame; low_memory=False is passed through to the parser
celiac_df = pd.read_csv(filepath_or_buffer=celiac_path, low_memory=False)

# Preview the first five rows (five is head()'s default row count)
celiac_df.head()

Unnamed: 0,Age,Gender,Diabetes,Diabetes Type,Diarrhoea,Abdominal,Short_Stature,Sticky_Stool,Weight_loss,IgA,IgG,IgM,Marsh,cd_type,Disease_Diagnose
0,10,Male,Yes,Type 1,inflammatory,yes,PSS,no,no,1.3,10.0,1.0,marsh type 0,potential,yes
1,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.5,12.5,1.3,marsh type 3a,atypical,yes
2,8,Female,Yes,Type 1,watery,yes,Variant,yes,yes,0.4,8.0,0.5,marsh type 1,latent,yes
3,10,Male,Yes,Type 1,watery,yes,PSS,no,no,0.98,9.0,0.66,marsh type 3a,silent,yes
4,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.0,10.5,1.1,marsh type 1,latent,yes


In [3]:
# Separate the target column from the feature columns
y = celiac_df['Disease_Diagnose']
X = celiac_df.drop(columns='Disease_Diagnose')

# Keep an independent copy of the features. Plain assignment would only bind a
# second name to the same DataFrame, so an in-place change made through either
# name would silently appear in the other.
celiac_sans_y_df = X.copy()

# Preview the first 20 feature rows
celiac_sans_y_df.head(20)

Unnamed: 0,Age,Gender,Diabetes,Diabetes Type,Diarrhoea,Abdominal,Short_Stature,Sticky_Stool,Weight_loss,IgA,IgG,IgM,Marsh,cd_type
0,10,Male,Yes,Type 1,inflammatory,yes,PSS,no,no,1.3,10.0,1.0,marsh type 0,potential
1,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.5,12.5,1.3,marsh type 3a,atypical
2,8,Female,Yes,Type 1,watery,yes,Variant,yes,yes,0.4,8.0,0.5,marsh type 1,latent
3,10,Male,Yes,Type 1,watery,yes,PSS,no,no,0.98,9.0,0.66,marsh type 3a,silent
4,9,Male,Yes,Type 1,fatty,yes,PSS,no,no,1.0,10.5,1.1,marsh type 1,latent
5,8,Female,Yes,Type 1,fatty,yes,Variant,yes,yes,1.1,9.5,1.0,marsh type 3a,silent
6,9,Male,Yes,Type 1,watery,yes,Variant,yes,yes,2.1,11.4,1.0,marsh type 2,typical
7,5,Female,Yes,Type 1,fatty,yes,PSS,yes,yes,0.8,12.0,0.98,marsh type 1,latent
8,6,Female,Yes,Type 1,fatty,yes,PSS,yes,yes,1.5,8.0,1.1,marsh type 3b,silent
9,4,Male,Yes,Type 1,watery,yes,Variant,yes,yes,0.42,11.5,1.0,marsh type 2,typical


In [4]:
# Hold out part of the rows for testing; random_state=42 pins the shuffle so
# the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the label -> integer mapping from the training target only, then apply
# that same mapping to both the training and the testing target
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Show the encoded training labels
y_train_encoded

array([1, 1, 1, ..., 1, 1, 1])

In [5]:
# One-hot encode every feature column. Categories that were not seen during
# fitting are ignored at transform time (handle_unknown='ignore'), and the
# result is a dense integer array (sparse_output=False, dtype='int').
# NOTE(review): the output header shows Age_1, Age_3, ... so numeric columns are
# being treated as categories too - confirm this is intended.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')

# Fit on the training features only, then label the columns with the
# encoder-generated feature names
train_matrix = ohe.fit_transform(X_train)
X_train_encoded = pd.DataFrame(train_matrix, columns=ohe.get_feature_names_out())
X_train_encoded

Unnamed: 0,Age_1,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Age_9,Age_10,Age_11,...,Marsh_marsh type 3a,Marsh_marsh type 3b,Marsh_marsh type 3c,Marsh_none,cd_type_atypical,cd_type_latent,cd_type_none,cd_type_potential,cd_type_silent,cd_type_typical
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1649,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1650,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1651,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1652,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Build a logistic regression classifier (random_state=1) and train it on the
# one-hot encoded training features with the encoded training labels.
# fit() returns the estimator itself, so creation and fitting can be chained.
lr_model = LogisticRegression(random_state=1).fit(X_train_encoded, y_train_encoded)

# Display the fitted estimator
lr_model

In [8]:
# Score the model on the same encoded data it was trained on
lr_train_accuracy = lr_model.score(X_train_encoded, y_train_encoded)
print('Train Accuracy: %.3f' % lr_train_accuracy)

Train Accuracy: 0.997


In [9]:
# Apply the already-fitted encoder to the test features (transform only, so the
# category set learned from the training data is reused rather than refit)
test_matrix = ohe.transform(X_test)
X_test_encoded = pd.DataFrame(test_matrix, columns=ohe.get_feature_names_out())
X_test_encoded

Unnamed: 0,Age_1,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Age_9,Age_10,Age_11,...,Marsh_marsh type 3a,Marsh_marsh type 3b,Marsh_marsh type 3c,Marsh_none,cd_type_atypical,cd_type_latent,cd_type_none,cd_type_potential,cd_type_silent,cd_type_typical
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
548,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
549,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Encode the test labels with the label encoder that was fitted on the training
# labels, then display the resulting integer array
y_test_encoded = le.transform(y_test)
y_test_encoded

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
# Score the model on the held-out, encoded test data
lr_test_accuracy = lr_model.score(X_test_encoded, y_test_encoded)
print('Test Accuracy: %.3f' % lr_test_accuracy)

Test Accuracy: 0.995


In [12]:
#!pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler



In [13]:
# Instantiate the RandomUnderSampler instance, seeded with random_state=1
rus = RandomUnderSampler(random_state=1)

In [14]:
# Resample the raw (not yet one-hot encoded) training split with the
# under-sampler; this returns the resampled features and matching labels
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [15]:
# Count the rows per class in the resampled training labels
y_resampled.value_counts()

Disease_Diagnose
no     266
yes    266
Name: count, dtype: int64

In [16]:
# One-hot encode the resampled features with the encoder fitted earlier
# (transform only), then label the columns with the encoder's feature names
resampled_matrix = ohe.transform(X_resampled)
X_resampled_encoded = pd.DataFrame(resampled_matrix, columns=ohe.get_feature_names_out())

In [17]:
# Encode the resampled labels with the already-fitted label encoder and display them
y_resampled_encoded = le.transform(y_resampled)
y_resampled_encoded

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [18]:
# Create a second logistic regression classifier (random_state=1), kept separate
# from lr_model so it can be trained on the resampled data
lr_sample_model = LogisticRegression(random_state=1)

In [19]:
# Fit the second logistic regression on the resampled, encoded training data
lr_sample_model.fit(X_resampled_encoded, y_resampled_encoded)

In [20]:
# Score the resampled-data model on the resampled data it was trained on
lr_sample_train_accuracy = lr_sample_model.score(X_resampled_encoded, y_resampled_encoded)
print('Train Accuracy: %.3f' % lr_sample_train_accuracy)

Train Accuracy: 0.994


In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Instantiate an initial RandomForestClassifier instance. It is seeded with
# random_state=1, like the logistic regression models in this notebook, so the
# forest and every score derived from it are the same on each run.
model = RandomForestClassifier(random_state=1)
# Fit this initial forest on the resampled, one-hot encoded training data
model.fit(X_resampled_encoded, y_resampled_encoded)

In [26]:
# Predict the test labels with the forest that was fitted on the resampled data
y_predict = model.predict(X_test_encoded)

In [27]:
# Display the predicted test labels
y_predict

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [29]:
# Instantiate an initial RamdonForestClassifier instance
model_encoded = RandomForestClassifier()
# Fit the initial model based the training data
model.fit(X_train_encoded, y_train_encoded)

In [31]:
y_predict_test_encoded = model.predict(X_test_encoded)
y_predict_test_encoded

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [33]:
from sklearn.metrics import classification_report

In [34]:
# First report: predictions from the forest fitted on the resampled data (y_predict)
print(classification_report(y_test_encoded, y_predict))
# Second report: predictions from the forest fitted on the full training data
# (y_predict_test_encoded)
print(classification_report(y_test_encoded, y_predict_test_encoded))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97        97
           1       0.99      1.00      0.99       455

    accuracy                           0.99       552
   macro avg       0.99      0.98      0.98       552
weighted avg       0.99      0.99      0.99       552

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        97
           1       0.99      1.00      1.00       455

    accuracy                           0.99       552
   macro avg       1.00      0.98      0.99       552
weighted avg       0.99      0.99      0.99       552



In [None]:
# Show the dtype of each feature column
X.dtypes

In [None]:
# Print the frame summary. info is a method, so it has to be called; the bare
# attribute only displays the bound method's repr instead of the summary.
X.info()

In [None]:
# Summary statistics for the feature columns
X.describe()

In [None]:
# Count the missing values in each feature column
X.isna().sum()

In [None]:
# Print the frame summary for the features
X.info()

In [None]:
# Count the rows per class in the full target column
y.value_counts()