# hypo-thyroid prediction

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv(".\Datasets\hypothyroid.csv")
data.head(3)

Unnamed: 0,age,sex,on thyroxine,TSH,T3 measured,T3,TT4,binaryClass
0,41,F,f,1,t,3,125,P
1,23,F,f,4,t,2,102,P
2,46,M,f,1,f,?,109,P


In [3]:
data.shape

(3772, 8)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   age           3772 non-null   object
 1   sex           3772 non-null   object
 2   on thyroxine  3772 non-null   object
 3   TSH           3772 non-null   object
 4   T3 measured   3772 non-null   object
 5   T3            3772 non-null   object
 6   TT4           3772 non-null   object
 7   binaryClass   3772 non-null   object
dtypes: object(8)
memory usage: 235.9+ KB


In [5]:
data.describe()

Unnamed: 0,age,sex,on thyroxine,TSH,T3 measured,T3,TT4,binaryClass
count,3772,3772,3772,3772,3772,3772,3772,3772
unique,94,3,2,92,2,12,239,2
top,59,F,f,1,t,2,?,P
freq,95,2480,3308,928,3003,1796,231,3481


In [6]:
data.isnull().sum()

age             0
sex             0
on thyroxine    0
TSH             0
T3 measured     0
T3              0
TT4             0
binaryClass     0
dtype: int64

In [7]:
data['sex'].unique()

array(['F', 'M', '?'], dtype=object)

In [8]:
data['T3'].unique()

array(['3', '2', '?', '1', '4', '0', '6', '5', '7', '11', '9', '8'],
      dtype=object)

In [9]:
data['TT4'].value_counts()

TT4
?      231
101     71
93      67
98      63
103     63
      ... 
4        1
30       1
55       1
216      1
258      1
Name: count, Length: 239, dtype: int64

In [10]:
for col in data.select_dtypes(include=['object']).columns:
    data[col].replace("?",data[col].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].replace("?",data[col].mode()[0],inplace=True)


In [11]:
data['TT4'].replace("?",101,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TT4'].replace("?",101,inplace=True)


In [12]:
label_encoder = LabelEncoder()
data['sex'] = label_encoder.fit_transform(data['sex'])
data['sex'].unique()

array([0, 1])

In [13]:
data['on thyroxine'] = label_encoder.fit_transform(data['on thyroxine'])
data['on thyroxine'].unique()

array([0, 1])

In [14]:
data['T3 measured'] = label_encoder.fit_transform(data['T3 measured'])
data['T3 measured'].unique()

array([1, 0])

In [15]:
data['binaryClass'] = label_encoder.fit_transform(data['binaryClass'])
data['binaryClass'].unique()

array([1, 0])

In [16]:
data.head(3)

Unnamed: 0,age,sex,on thyroxine,TSH,T3 measured,T3,TT4,binaryClass
0,41,0,0,1,1,3,125,1
1,23,0,0,4,1,2,102,1
2,46,1,0,1,0,2,109,1


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   age           3772 non-null   object
 1   sex           3772 non-null   int64 
 2   on thyroxine  3772 non-null   int64 
 3   TSH           3772 non-null   object
 4   T3 measured   3772 non-null   int64 
 5   T3            3772 non-null   object
 6   TT4           3772 non-null   object
 7   binaryClass   3772 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 235.9+ KB


In [18]:
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype(int)

In [19]:
data['binaryClass'].value_counts()

binaryClass
1    3481
0     291
Name: count, dtype: int64

### 1 ---> Hypo-thyroid positive
### 0 ---> Negative

In [20]:
data.to_csv(".\Datasets\preprocessed_hypothyroid_data.csv",index=False)

In [21]:
x = data.iloc[:,:-1]
y = data['binaryClass']

In [22]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

## Model Training

In [23]:
model = LogisticRegression()

In [24]:
model.fit(x_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model Evaluation

In [25]:
# accuracy on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction,y_train)
print("Accuracy on Training data :",training_data_accuracy)

Accuracy on Training data : 0.957905203844879


In [26]:
# accuracy on testing data
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(x_test_prediction,y_test)
print("Accuracy on Testing data :",testing_data_accuracy)

Accuracy on Testing data : 0.9576158940397351


## Predictive model

In [27]:
input_data = (41,0,0,1,1,3,125)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0] == 0):
    print("The Person does not have hypo-thyroid")
else:
    print("The Person has hypo-thyroid")

[1]
The Person has hypo-thyroid




## Saving the trained model

In [28]:
import pickle

In [29]:
filename = ".\Models\hypothyroid_prediction_model.sav"
pickle.dump(model,open(filename,"wb"))

In [30]:
# loading the saved model
loaded_model = pickle.load(open(filename,"rb"))

In [31]:
loaded_model.predict(input_data_reshaped)



array([1])