In [1]:
# File Paths
FILE_PATH = 'hypothyroid.csv'

In [2]:
# All the imports
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Classification Algorithms to Use - 
# 1. Logistic Regression
# 2. KNN
# 3. Kernel SVM
# 5. Random Forest 
# 6. XGBoost
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from  xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler




In [3]:
# Reading the Data
thyroid_dataset = pd.read_csv(FILE_PATH)

In [4]:
thyroid_dataset

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,P
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,P
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,P
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,P
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,f,?,other,P
3768,68,F,f,f,f,f,f,f,f,f,...,t,124,t,1.08,t,114,f,?,SVI,P
3769,74,F,f,f,f,f,f,f,f,f,...,t,112,t,1.07,t,105,f,?,other,P
3770,72,M,f,f,f,f,f,f,f,f,...,t,82,t,0.94,t,87,f,?,SVI,P


In [5]:
thyroid_dataset.rename(columns={'binaryClass':'Labels'},inplace=True)

In [6]:

thyroid_dataset.describe().T


Unnamed: 0,count,unique,top,freq
age,3772,94,59,95
sex,3772,3,F,2480
on thyroxine,3772,2,f,3308
query on thyroxine,3772,2,f,3722
on antithyroid medication,3772,2,f,3729
sick,3772,2,f,3625
pregnant,3772,2,f,3719
thyroid surgery,3772,2,f,3719
I131 treatment,3772,2,f,3713
query hypothyroid,3772,2,f,3538


In [7]:
thyroid_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on thyroxine               3772 non-null   object
 3   query on thyroxine         3772 non-null   object
 4   on antithyroid medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid surgery            3772 non-null   object
 8   I131 treatment             3772 non-null   object
 9   query hypothyroid          3772 non-null   object
 10  query hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [8]:
thyroid_dataset["Labels"] = thyroid_dataset["Labels"].map({"P":0,"N":1})
thyroid_dataset = thyroid_dataset.replace({"t":1,"f":0})

In [9]:
thyroid_dataset['sex'].isnull().sum()

0

In [10]:
thyroid_dataset["TBG"].value_counts()


?    3772
Name: TBG, dtype: int64

In [11]:
del thyroid_dataset["TBG"]

In [12]:
thyroid_dataset = thyroid_dataset.replace({"?":np.NAN})
thyroid_dataset.isnull().sum()


age                            1
sex                          150
on thyroxine                   0
query on thyroxine             0
on antithyroid medication      0
sick                           0
pregnant                       0
thyroid surgery                0
I131 treatment                 0
query hypothyroid              0
query hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH measured                   0
TSH                          369
T3 measured                    0
T3                           769
TT4 measured                   0
TT4                          231
T4U measured                   0
T4U                          387
FTI measured                   0
FTI                          385
TBG measured                   0
referral source                0
Labels                         0
dtype: int64

In [13]:
thyroid_dataset["sex"].value_counts()


F    2480
M    1142
Name: sex, dtype: int64

In [14]:
thyroid_dataset = thyroid_dataset.replace({"F":1,"M":0})

In [15]:
thyroid_dataset["referral source"].value_counts()


other    2201
SVI      1034
SVHC      386
STMW      112
SVHD       39
Name: referral source, dtype: int64

In [16]:
del thyroid_dataset["referral source"]

In [17]:
thyroid_dataset["T3 measured"].value_counts()

1    3003
0     769
Name: T3 measured, dtype: int64

In [18]:
thyroid_dataset["TT4 measured"].value_counts()

1    3541
0     231
Name: TT4 measured, dtype: int64

In [19]:
thyroid_dataset["FTI measured"].value_counts()

1    3387
0     385
Name: FTI measured, dtype: int64

In [20]:
thyroid_dataset["TBG measured"].value_counts()


0    3772
Name: TBG measured, dtype: int64

In [21]:
thyroid_dataset["Labels"].value_counts()

0    3481
1     291
Name: Labels, dtype: int64

In [22]:
thyroid_dataset.dtypes


age                           object
sex                          float64
on thyroxine                   int64
query on thyroxine             int64
on antithyroid medication      int64
sick                           int64
pregnant                       int64
thyroid surgery                int64
I131 treatment                 int64
query hypothyroid              int64
query hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH measured                   int64
TSH                           object
T3 measured                    int64
T3                            object
TT4 measured                   int64
TT4                           object
T4U measured                   int64
T4U                           object
FTI measured                   int64
FTI                           object
TBG measured                   int64
L

In [23]:
# Converting the remaining columns to numeric types
col_Names = thyroid_dataset.columns[thyroid_dataset.dtypes == 'object']
thyroid_dataset[col_Names] = thyroid_dataset[col_Names].apply(pd.to_numeric, errors='coerce')
thyroid_dataset.dtypes



age                          float64
sex                          float64
on thyroxine                   int64
query on thyroxine             int64
on antithyroid medication      int64
sick                           int64
pregnant                       int64
thyroid surgery                int64
I131 treatment                 int64
query hypothyroid              int64
query hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH measured                   int64
TSH                          float64
T3 measured                    int64
T3                           float64
TT4 measured                   int64
TT4                          float64
T4U measured                   int64
T4U                          float64
FTI measured                   int64
FTI                          float64
TBG measured                   int64
L

In [24]:
thyroid_dataset.isnull().sum()

age                            1
sex                          150
on thyroxine                   0
query on thyroxine             0
on antithyroid medication      0
sick                           0
pregnant                       0
thyroid surgery                0
I131 treatment                 0
query hypothyroid              0
query hyperthyroid             0
lithium                        0
goitre                         0
tumor                          0
hypopituitary                  0
psych                          0
TSH measured                   0
TSH                          369
T3 measured                    0
T3                           769
TT4 measured                   0
TT4                          231
T4U measured                   0
T4U                          387
FTI measured                   0
FTI                          385
TBG measured                   0
Labels                         0
dtype: int64

In [25]:
# Columns where null values exist
thyroid_dataset.columns[thyroid_dataset.isnull().sum() > 0]

Index(['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI'], dtype='object')

In [26]:
# Replacing the null values with the means
imputer = SimpleImputer(strategy='mean')
na_cols = thyroid_dataset.columns[thyroid_dataset.isnull().sum() > 0]
for col_name in na_cols:
    thyroid_dataset[col_name] = imputer.fit_transform(thyroid_dataset[[col_name]])

In [27]:
# thyroid_dataset[(thyroid_dataset['sex'] > 0.0) & (thyroid_dataset['sex'] < 1.0)]

In [28]:
thyroid_dataset

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,Labels
0,41.0,1.0,0,0,0,0,0,0,0,0,...,1,2.5000,1,125.000000,1,1.140,1,109.000000,0,0
1,23.0,1.0,0,0,0,0,0,0,0,0,...,1,2.0000,1,102.000000,0,0.995,0,110.469649,0,0
2,46.0,0.0,0,0,0,0,0,0,0,0,...,0,2.0135,1,109.000000,1,0.910,1,120.000000,0,0
3,70.0,1.0,1,0,0,0,0,0,0,0,...,1,1.9000,1,175.000000,0,0.995,0,110.469649,0,0
4,70.0,1.0,0,0,0,0,0,0,0,0,...,1,1.2000,1,61.000000,1,0.870,1,70.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,1.0,0,0,0,0,0,0,0,0,...,0,2.0135,0,108.319345,0,0.995,0,110.469649,0,0
3768,68.0,1.0,0,0,0,0,0,0,0,0,...,1,2.1000,1,124.000000,1,1.080,1,114.000000,0,0
3769,74.0,1.0,0,0,0,0,0,0,0,0,...,1,1.8000,1,112.000000,1,1.070,1,105.000000,0,0
3770,72.0,0.0,0,0,0,0,0,0,0,0,...,1,2.0000,1,82.000000,1,0.940,1,87.000000,0,0


In [29]:
thyroid_dataset.columns

Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
       'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
       'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
       'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
       'FTI measured', 'FTI', 'TBG measured', 'Labels'],
      dtype='object')

In [30]:
X = thyroid_dataset.iloc[:, :-1]
y = thyroid_dataset['Labels']

In [31]:
X = sm.add_constant(X)
results = sm.OLS(y,X).fit()
results.summary()

0,1,2,3
Dep. Variable:,Labels,R-squared:,0.252
Model:,OLS,Adj. R-squared:,0.247
Method:,Least Squares,F-statistic:,48.57
Date:,"Sun, 01 May 2022",Prob (F-statistic):,1.4100000000000001e-213
Time:,15:36:00,Log-Likelihood:,179.17
No. Observations:,3772,AIC:,-304.3
Df Residuals:,3745,BIC:,-136.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0334,0.058,0.578,0.563,-0.080,0.147
age,-4.813e-05,0.000,-0.242,0.809,-0.000,0.000
sex,0.0343,0.009,3.943,0.000,0.017,0.051
on thyroxine,-0.0530,0.012,-4.339,0.000,-0.077,-0.029
query on thyroxine,0.0377,0.034,1.109,0.268,-0.029,0.104
on antithyroid medication,-0.0469,0.036,-1.299,0.194,-0.118,0.024
sick,-0.0130,0.020,-0.660,0.509,-0.052,0.026
pregnant,-0.0573,0.035,-1.654,0.098,-0.125,0.011
thyroid surgery,-0.0964,0.032,-2.989,0.003,-0.160,-0.033

0,1,2,3
Omnibus:,2079.207,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14879.706
Skew:,2.602,Prob(JB):,0.0
Kurtosis:,11.221,Cond. No.,1.03e+16


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [33]:
# Using standard Scaler to scale the values uniformly
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [34]:
X_train.shape

(2829, 28)

In [35]:
X_test.shape

(943, 28)

In [None]:
# 1. Logistic Regression
# 2. KNN
# 3. Kernel SVM
# 4. Naive Bayes
# 5. Random Forest 
# 6. XGBoost?

        
models_to_train = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(learning_rate=0.01)
}

for _, mod in models_to_train.items():
    mod.fit(X_train, y_train)


In [None]:
for _, mod in models_to_train.items():
     print(f"Accuracy Score for {_} is : ",mod.score(X_test,y_test)*100,"%")

In [None]:
# model = XGBClassifier(learning_rate=0.01).fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"Accuracy Score for xgboost is : ",model.score(X_test,y_test)*100,"%")