In [235]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
import re

In [236]:
# Notebook display settings: never truncate the column list, and allow up to
# 10000 rows to be rendered before pandas abbreviates a frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 10000

### 3.1 Defining paths for Format-1 and Format-2 file folders

In [237]:
# Folder holding the Format-1 raw files (relative to the notebook's working directory).
file_path_1 = 'raw_thyroid_dataset/format_1_data'

In [238]:
# Folder holding the Format-2 raw files (relative to the notebook's working directory).
file_path_2 = 'raw_thyroid_dataset/format_2_data'

- The function below returns the names of the data files in a folder (the files whose name ends in `.test`)

In [239]:
def fetch_file_names(dir_path, extension='.test'):
    """Return the names of the data files found directly inside ``dir_path``.

    Parameters
    ----------
    dir_path : str or path-like
        Directory to scan (not recursive).
    extension : str, default '.test'
        Only entries whose name ends with this suffix are kept.

    Returns
    -------
    list of str
        Matching names (without the directory part), sorted alphabetically.
        ``os.listdir`` returns entries in arbitrary order, so sorting keeps
        the merge order -- and with it the row order of the combined
        dataset -- identical from run to run and machine to machine.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``dir_path`` cannot be listed.
    """
    return sorted(name for name in os.listdir(dir_path) if name.endswith(extension))

- Below function will be used for combining all the individual datasets

In [240]:
def merge_dataset(dir_path, data_file_names_list):
    """Read every listed header-less CSV file and stack the rows into one frame.

    Parameters
    ----------
    dir_path : str or path-like
        Directory that contains the files.
    data_file_names_list : list of str
        File names inside ``dir_path``, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame or None
        All rows in list order, with integer column labels (the files are
        read with ``header=None``) and a fresh 0..n-1 index. ``None`` is
        returned for an empty list, which is what the original
        implementation did implicitly.
    """
    if not data_file_names_list:
        return None

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame for every additional file.
    frames = [
        pd.read_csv(os.path.join(dir_path, file_name), header=None)
        for file_name in data_file_names_list
    ]
    return pd.concat(frames, axis=0, ignore_index=True)

### 3.2 Combining all the Format-1 datasets in a single file

In [241]:
# List the data files of the Format-1 folder, then stack them into one frame.
file_names=fetch_file_names(file_path_1)
df=merge_dataset(file_path_1,file_names)

In [242]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,35,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,f,?,other,negative.|219
1,63,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,3.5,t,2.5,t,108,t,0.96,t,113,f,?,SVI,negative.|2059
2,25,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.6,t,2.4,t,61,t,0.82,t,75,f,?,SVHD,negative.|399
3,53,F,f,f,f,f,f,f,f,t,f,f,f,f,f,f,t,0.25,t,2.1,t,145,t,1.03,t,141,f,?,other,negative.|1911
4,92,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,1.3,t,120,t,0.84,t,143,f,?,SVI,negative.|487


- Format-1 data has 1944 instances and 30 attributes

#### 3.2.1 Fetching Feature names of dataset

- Below function will fetch the features names from the given source path

In [243]:
def fetch_feature_names(src_path):
    """Extract the attribute names listed in a ``.names`` description file.

    The attribute section is taken to start at the first line that begins
    with ``age:``. From there on, every line of the form
    ``<name>: <description>`` contributes ``<name>``.

    Parameters
    ----------
    src_path : str or path-like
        Path of the ``.names`` file.

    Returns
    -------
    list of str
        Attribute names in file order, starting with ``'age'``.

    Raises
    ------
    ValueError
        If the file has no line starting with ``age:``.
    """
    with open(src_path) as file:
        lines = file.readlines()

    # Match on the prefix only, so the amount of whitespace between the name
    # and its description does not matter.
    start = next((idx for idx, line in enumerate(lines) if line.startswith('age:')), None)
    if start is None:
        raise ValueError(f"no 'age:' attribute line found in {src_path}")

    features_list = []
    for line in lines[start:]:
        name, separator, _ = line.partition(':')
        # Lines without a colon (e.g. blank lines) do not describe an attribute.
        if separator:
            features_list.append(name.strip())
    return features_list

In [244]:
# Read the attribute names from the allhyper description file in the Format-1 folder.
feature_list=fetch_feature_names(f"{file_path_1}/allhyper.names")

#### The df dataset has 30 columns, but the feature-name list contains only 29 names
- Appending `disease` to the list as the name of the 30th column

In [245]:
# Name for the last column. This mutates the list in place, so running the
# cell a second time appends a second 'disease' entry.
feature_list.append('disease')

- Assigning features names to the header of dataset

In [246]:
# Replace the integer column labels with the feature names; pandas raises a
# ValueError if the list length differs from the number of columns.
df.columns=feature_list

In [247]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH measured,TSH,T3 measured,T3,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,disease
0,35,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,f,?,other,negative.|219
1,63,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,3.5,t,2.5,t,108,t,0.96,t,113,f,?,SVI,negative.|2059
2,25,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.6,t,2.4,t,61,t,0.82,t,75,f,?,SVHD,negative.|399
3,53,F,f,f,f,f,f,f,f,t,f,f,f,f,f,f,t,0.25,t,2.1,t,145,t,1.03,t,141,f,?,other,negative.|1911
4,92,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,1.3,t,120,t,0.84,t,143,f,?,SVI,negative.|487


### 3.3 Combining all Format-2 datasets in a single file

In [248]:
# Same two steps for the Format-2 folder. `file_names` is reused and now holds
# the Format-2 listing; merge_dataset returns None when that listing is empty.
file_names=fetch_file_names(file_path_2)
df2=merge_dataset(file_path_2,file_names)

In [249]:
# Nothing is rendered for this cell when df2 is None (no Format-2 file was merged).
df2

- The Format-2 folder yielded no data, so the final dataset is the Format-1 dataset

In [250]:
# NOTE: this binds a second name to the same DataFrame object (no copy is
# made), so changes made through `final_dataset` are visible through `df`
# and vice versa.
final_dataset= df

#### Replacing whitespace with underscore in feature names of Format-1 dataset

In [251]:
# Swap every space in the column labels for an underscore (literal match, no regex).
df.columns = df.columns.str.replace(" ", "_", regex=False)

###  Shape of dataset

In [252]:
final_dataset.shape

(1944, 30)

### Dataset information

In [253]:
# Print the distinct values of every column, one "<column>:<values>" entry per column.
for col in final_dataset.columns:
    distinct_values = final_dataset[col].unique()
    print(f"{col}:{distinct_values}")

age:[35 63 25 53 92 67 60 48 27 73 19 72 16 54 39 38 33 45 21 51 64 40 71 49
 79 20 59 37 42 46 50 69 30 31 89 77 68 65 24 75 80 23 11 18 62 76 22 70
 47 56 26 28 83 74 78 58 61 55 41 85 86 32 43 17 57 66 34 14 52 93 36 81
 84 15 12 44 29 82 87 88  7]
sex:['F' 'M' '?']
on_thyroxine:['f' 't']
query_on_thyroxine:['f' 't']
on_antithyroid_medication:['f' 't']
sick:['f' 't']
pregnant:['f' 't']
thyroid_surgery:['f' 't']
I131_treatment:['f' 't']
query_hypothyroid:['f' 't']
query_hyperthyroid:['f' 't']
lithium:['f' 't']
goitre:['f' 't']
tumor:['f' 't']
hypopituitary:['f']
psych:['f' 't']
TSH_measured:['f' 't']
TSH:['?' '3.5' '4.6' '0.25' '0.7' '0.81' '1.2' '27' '2.8' '2.6' '4.4' '3.1'
 '1.1' '4.5' '0.14' '0.06' '7.4' '2.3' '2' '5.8' '0.78' '4' '1' '0.52'
 '0.4' '0.3' '0.1' '1.9' '17' '1.3' '0.2' '0.005' '0.015' '12' '1.5' '2.9'
 '0.05' '0.42' '1.4' '5' '0.5' '0.9' '2.1' '0.15' '1.8' '7.6' '1.6' '0.79'
 '3.7' '0.77' '0.64' '2.7' '0.98' '0.025' '0.93' '0.58' '0.34' '3.4'
 '0.86' '0.47' '1.7' '0.

### 3.8 Dataset Information

- age: continuous values
- sex:['F' 'M' '?']
- on_thyroxine:['f' 't']
- query_on_thyroxine:['f' 't']
- on_antithyroid_medication:['f' 't']
- sick:['f' 't']
- pregnant:['f' 't']
- thyroid_surgery:['f' 't']
- I131_treatment:['f' 't' nan]
- query_hypothyroid:['f' 't']
- query_hyperthyroid:['f' 't']
- lithium:['f' 't']
- goitre:['f' 't']
- tumor:['f' 't']
- hypopituitary:['f' 't' nan]
- psych:['f' 't' nan]
- TSH_measured:['t' 'f']
- TSH: continuous values
- T3_measured:['t' 'f']
- T3: continuous values
- TT4_measured:['t' 'f']
- TT4: continuous values
- T4U_measured:['t' 'f']
- T4U: continuous values
- FTI_measured:['t' 'f']
- FTI: continuous values
- TBG_measured:['f' 't']
- TBG: continuous values
- referral_source:['SVHC' 'other' 'SVI' 'STMW' 'SVHD' 'WEST' nan]
- disease: multiclass categorical values

## 4) Fixing the dataset

- Because the features are stored with the object datatype, some data checks are difficult to implement, so the dataset needs to be fixed first

### 4.1 Replacing ? with NaN in features of dataset

In [254]:
# The raw files use "?" as the missing-value marker; turn it into NaN in every column.
for col in final_dataset.columns:
    is_placeholder = final_dataset[col] == "?"
    final_dataset[col] = np.where(is_placeholder, np.nan, final_dataset[col])

#### Checking first 5 values of dataset

In [255]:
final_dataset.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,disease
0,35.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,,f,,f,,f,,f,,f,,other,negative.|219
1,63.0,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,3.5,t,2.5,t,108.0,t,0.96,t,113.0,f,,SVI,negative.|2059
2,25.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.6,t,2.4,t,61.0,t,0.82,t,75.0,f,,SVHD,negative.|399
3,53.0,F,f,f,f,f,f,f,f,t,f,f,f,f,f,f,t,0.25,t,2.1,t,145.0,t,1.03,t,141.0,f,,other,negative.|1911
4,92.0,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,1.3,t,120.0,t,0.84,t,143.0,f,,SVI,negative.|487


### 4.2 Fixing Numerical features

In [256]:
# Columns that hold measurements and are handled as floats below.
num_features = [
    'age',
    'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG',
]

#### 4.2.1 Typecasting object features to float datatype

In [257]:
# Cast all numeric columns to float in one step; NaN entries stay NaN.
final_dataset[num_features] = final_dataset[num_features].astype(float)

### 4.3 Fixing Categorical features

In [258]:
# Columns that hold category labels (the last entry is the target column).
cat_features = ['sex', 'referral_source', 'disease']

#### 4.3.1 Extracting the disease categories by splitting each value at the "." mark (this drops the trailing "|id" part)

In [259]:
# Keep only the text in front of the first "." -- this removes the trailing
# ".|<id>" part of values such as "negative.|219". The .str accessor does the
# split and the pick in one vectorised step (no list-valued intermediate
# column) and leaves missing values as NaN instead of raising.
final_dataset['disease'] = final_dataset['disease'].str.split('.').str[0]

#### Extracting the disease categories by splitting the character through "[" mark

In [260]:
# Keep only the text in front of the first "[". Done through the .str
# accessor so that no list-valued intermediate column is stored and missing
# values stay NaN instead of raising.
final_dataset['disease'] = final_dataset['disease'].str.split('[').str[0]

#### Checking the frequency of each class in disease feature

In [261]:
final_dataset['disease'].value_counts()

disease
negative                   1848
compensated hypothyroid      40
primary hypothyroid          31
hyperthyroid                 17
goitre                        5
T3 toxic                      2
secondary toxic               1
Name: count, dtype: int64

##### 4.3.2 Applying mapping over Alphabetic characters in disease categories

In [262]:
# Lookup table: single-character diagnosis code -> readable class name.
disease_symbols = {
    '-': 'negative',
    'A': 'hyperthyroid',
    'B': 'T3 toxic',
    'C': 'toxic goitre',
    'D': 'secondary toxic',
    'E': 'hypothyroid',
    'F': 'primary hypothyroid',
    'G': 'compensated hypothyroid',
    'H': 'secondary hypothyroid',
    'I': 'increased binding protein',
    'J': 'decreased binding protein',
    'K': 'concurrent non-thyroidal illness',
    'L': 'consistent with replacement therapy',
    'M': 'underreplaced',
    'N': 'overreplaced',
    'O': 'antithyroid drugs',
    'P': 'I131 treatment',
    'Q': 'surgery',
    'R': 'discordant assay results',
    'S': 'elevated TBG',
    'T': 'elevated thyroid hormones',
}

###### creating replica of disease feature and keeping only alphabetical values in disease feature replica

In [263]:
# Helper column: keeps a disease value only where it equals its own
# upper-cased form (i.e. it contains no lower-case letters); every other row
# is set to NaN.
disease_col = final_dataset['disease']
final_dataset['disease2'] = np.where(disease_col == disease_col.str.upper(), disease_col, np.nan)

###### applying mapping over alphabetic symbols in disease feature

In [264]:
# Spell out each coded diagnosis (rows where `disease2` is set): every
# character is looked up in `disease_symbols`, and "|" is kept as a " | "
# separator. A character that is missing from the mapping raises KeyError.
coded_rows = final_dataset['disease2'].notna()

map_lis = []
for code in final_dataset.loc[coded_rows, 'disease']:
    pieces = [" | " if symbol == "|" else disease_symbols[symbol] + " " for symbol in code]
    map_lis.append("".join(pieces).strip())

final_dataset.loc[coded_rows, 'disease'] = map_lis

###### verifying the mapping at disease feature

In [265]:
final_dataset[final_dataset['disease2'].notna()]

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,disease,disease2


###### verifying the frequency of each class

In [266]:
final_dataset['disease'].value_counts()

disease
negative                   1848
compensated hypothyroid      40
primary hypothyroid          31
hyperthyroid                 17
goitre                        5
T3 toxic                      2
secondary toxic               1
Name: count, dtype: int64

###### Insights
- 7 Rare classes in disease features
###### Some of the classes are relative to each other:
- antithyroid drugs increased binding protein , antithyroid drugs
- replacement therapy class has related classes like: (consistent with replacement therapy decreased binding protein, overreplacement, underreplacement)
- goitre class has related classes like: (secondary toxic  | discordant assay results, toxic goitre, toxic goitre  | increased binding protein, secondary toxic)
- increased binding proteins has related classes like: (antithyroid drugs increased binding protein, compensated hypothyroid increased binding protein)
- decreased binding protein has related classes like: (consistent with replacement therapy decreased binding protein)

##### dropping disease2 replica from dataset

In [267]:
# The helper column has served its purpose; remove it from the frame in place.
final_dataset.drop(columns=['disease2'], inplace=True)

### 4.4 Fixing the boolean features

In [268]:
# Every column that is neither numeric nor categorical is treated as a t/f flag.
non_bool_columns = set(num_features) | set(cat_features)
bool_features = [col for col in final_dataset.columns if col not in non_bool_columns]

#### Showing all boolean features

In [269]:
bool_features

['on_thyroxine',
 'query_on_thyroxine',
 'on_antithyroid_medication',
 'sick',
 'pregnant',
 'thyroid_surgery',
 'I131_treatment',
 'query_hypothyroid',
 'query_hyperthyroid',
 'lithium',
 'goitre',
 'tumor',
 'hypopituitary',
 'psych',
 'TSH_measured',
 'T3_measured',
 'TT4_measured',
 'T4U_measured',
 'FTI_measured',
 'TBG_measured']

In [270]:
# Encode the flag columns as integers: 'y'/'t' -> 1 and 'n'/'f' -> 0.
# A value that is not a key of the mapping becomes NaN.
bool_dict={'y':1, 'n':0, 'f':0, 't':1}

final_dataset[bool_features] = final_dataset[bool_features].apply(
    lambda flag_col: flag_col.map(bool_dict)
)

## 5) Data checks to perform

- Check Missing values
- Check data type
- Check the number of unique values of each column
- Check statistics of data set

### 5.1 Checking Missing values

In [271]:
final_dataset.isnull().sum()

age                             0
sex                            80
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH_measured                    0
TSH                           170
T3_measured                     0
T3                            368
TT4_measured                    0
TT4                            94
T4U_measured                    0
T4U                           180
FTI_measured                    0
FTI                           180
TBG_measured                    0
TBG                          1944
referral_source                 0
disease       

### 5.2 Checking datatypes

In [272]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1944 entries, 0 to 1943
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        1944 non-null   float64
 1   sex                        1864 non-null   object 
 2   on_thyroxine               1944 non-null   int64  
 3   query_on_thyroxine         1944 non-null   int64  
 4   on_antithyroid_medication  1944 non-null   int64  
 5   sick                       1944 non-null   int64  
 6   pregnant                   1944 non-null   int64  
 7   thyroid_surgery            1944 non-null   int64  
 8   I131_treatment             1944 non-null   int64  
 9   query_hypothyroid          1944 non-null   int64  
 10  query_hyperthyroid         1944 non-null   int64  
 11  lithium                    1944 non-null   int64  
 12  goitre                     1944 non-null   int64  
 13  tumor                      1944 non-null   int64

##### float datatype features: 
- age
- TSH
- T3
- TT4
- T4U
- FTI
- TBG

##### object datatype features:
- sex
- referral_source
- disease

##### int datatype features:
 -  on_thyroxine
 -   query_on_thyroxine
 -   on_antithyroid_medication
 -   sick
 -   pregnant
 -  thyroid_surgery
 -  I131_treatment
 -   query_hypothyroid
 -  query_hyperthyroid
 -  lithium
 -  goitre 
 -  tumor 
 -  hypopituitary
 -  psych
 -  TSH_measured
 -  T3_measured
 -  TT4_measured
 -  T4U_measured
 -  FTI_measured
 -  TBG_measured

#### Dropping "TBG" feature

In [273]:
# TBG has no observed value at all, so the column is removed.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
final_dataset.drop(columns=["TBG"], inplace=True, errors="ignore")

In [274]:
# Keep the list of numeric columns in step with the frame. The membership
# test makes the cell safe to run again (list.remove raises ValueError when
# the entry is no longer present).
if 'TBG' in num_features:
    num_features.remove('TBG')

### 5.3 Checking the number of unique values in each feature

In [275]:
final_dataset.nunique()

age                           81
sex                            2
on_thyroxine                   2
query_on_thyroxine             2
on_antithyroid_medication      2
sick                           2
pregnant                       2
thyroid_surgery                2
I131_treatment                 2
query_hypothyroid              2
query_hyperthyroid             2
lithium                        2
goitre                         2
tumor                          2
hypopituitary                  1
psych                          2
TSH_measured                   2
TSH                          203
T3_measured                    2
T3                            54
TT4_measured                   2
TT4                          173
T4U_measured                   2
T4U                          109
FTI_measured                   2
FTI                          161
TBG_measured                   1
referral_source                5
disease                        7
dtype: int64

### 5.4 Check statistics of data set

In [276]:
final_dataset.describe()

Unnamed: 0,age,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured
count,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1944.0,1774.0,1944.0,1576.0,1944.0,1850.0,1944.0,1764.0,1944.0,1764.0,1944.0
mean,51.423868,0.13786,0.010288,0.009259,0.038066,0.012346,0.014403,0.011317,0.073045,0.065844,0.004115,0.009259,0.02572,0.0,0.050412,0.912551,6.262835,0.8107,1.981269,0.951646,106.189622,0.907407,0.986735,0.907407,109.565533,0.0
std,18.961818,0.344842,0.100933,0.095803,0.191404,0.110452,0.119177,0.105804,0.260278,0.248072,0.064034,0.095803,0.15834,0.0,0.218849,0.282564,31.629429,0.391848,0.834766,0.214568,36.121177,0.289935,0.198276,0.289935,33.659989,0.0
min,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.0,0.2,0.0,2.9,0.0,0.25,0.0,2.8,0.0
25%,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5125,1.0,1.5,1.0,87.0,1.0,0.87,1.0,92.0,0.0
50%,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.3,1.0,2.0,1.0,103.0,1.0,0.97,1.0,106.0,0.0
75%,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.9,1.0,2.3,1.0,123.0,1.0,1.08,1.0,124.0,0.0
max,93.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,530.0,1.0,8.5,1.0,372.0,1.0,2.32,1.0,291.0,0.0


###### Insights
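- min value is 0 for every boolean feature; the numeric features (age, TSH, T3, TT4, T4U, FTI) all have a non-zero minimum
- high difference in standard deviation of each feature
- count is lower for TSH, T3, TT4, T4U and FTI than for the other features because of their missing values (TBG, which was missing in every row, has already been dropped)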

## 1) Imputation of Missing values

### 1.1 Imputing numerical features

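- Missing values in numerical features like 'TSH', 'T3', 'TT4', 'T4U', 'FTI' can be treated as "report not ready", which is represented by a 0 value according to the database theory ('TBG' has already been dropped above).
- That zero-imputation (every numerical feature except 'age') is kept in the next cell only as commented-out code; the missing values are actually filled by the KNN imputer in section 1.1.1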

In [277]:
# for features in num_features:
#     if features!='age':
#         final_dataset[features]=final_dataset[features].fillna(0)

#### 1.1.1 Imputing age numerical feature

- Imputing the numerical features (`num_features`, which includes age) with the KNN imputer, which fills each missing value with the mean of that feature over the k nearest neighbours


#### 1.1.1.1 Importing KNN imputer library

In [278]:
from sklearn.impute import KNNImputer

#### 1.1.1.2 Creating instance of KNNImputer and fit the numerical features

In [279]:
# Fit a KNN imputer (library defaults) on the numeric columns and compute the
# filled-in values for those same rows.
# NOTE(review): the columns are on very different scales at this point, which
# presumably lets the large-valued ones dominate the neighbour distances -- confirm
# whether scaling before imputation is wanted.
knn_imp = KNNImputer()

numeric_block = final_dataset[num_features]
imp_val = knn_imp.fit(numeric_block).transform(numeric_block)

#### 1.1.1.3 Assigning the imputed values back to the numerical features

In [280]:
# Write the imputed array back over the numeric columns; its columns are in
# the same order as the `num_features` selection handed to the imputer.
final_dataset[num_features]=imp_val

### 1.2 Imputing categorical features

#### 1.2.1 Assigning the missing values with most frequent category

In [281]:
# Fill missing category labels with the most frequent label of the column.
for features in cat_features:
    modes = final_dataset[features].mode()
    print(modes)
    if modes.empty:
        # Column holds no observed value at all: there is nothing to fill with.
        continue
    # mode() returns one entry per tied value (sorted), so pick a single
    # scalar explicitly. Passing the whole Series to np.where only works
    # while there is exactly one mode and fails to broadcast otherwise.
    final_dataset[features] = final_dataset[features].fillna(modes.iloc[0])

0    F
Name: sex, dtype: object
0    other
Name: referral_source, dtype: object
0    negative
Name: disease, dtype: object


- Checking the null values in dataset

In [282]:
final_dataset.isnull().sum()

age                          0
sex                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          0
FTI_measured                 0
FTI                          0
TBG_measured                 0
referral_source              0
disease                      0
dtype: int64

## 2) Performing encoding

### 2.1 Performing encoding on categorical features

- Below code will be used to assign a numeric digit to each unique value in each feature of categorical feature

In [283]:
# Give every distinct label of a categorical column an integer code (1, 2, ...)
# in order of first appearance.
# NOTE(review): the codes therefore depend on the row order of this dataset;
# if another split is encoded by a separate run, its codes may presumably
# differ -- confirm that the same mapping is used everywhere.
for col in cat_features:
    if col == 'disease':
        continue  # the target column is encoded separately further down
    categories = final_dataset[col].unique()
    cat_encode = dict(zip(categories, range(1, len(categories) + 1)))
    print(cat_encode)
    final_dataset[col] = final_dataset[col].map(cat_encode)

{'F': 1, 'M': 2}
{'other': 1, 'SVI': 2, 'SVHD': 3, 'SVHC': 4, 'STMW': 5}


## 3) Handling Rare Classes in disease feature and also classes related to another one

### 3.1 Checking the frequency of each class in disease feature

In [284]:
final_dataset['disease'].value_counts()

disease
negative                   1848
compensated hypothyroid      40
primary hypothyroid          31
hyperthyroid                 17
goitre                        5
T3 toxic                      2
secondary toxic               1
Name: count, dtype: int64

### 3.2 Creating dictionary to map extra classes with their relative classes

In [285]:
# Group label -> fine-grained class names that are collapsed into it.
# Order matters for the mapping cell below: groups are applied one after the
# other, so 'discordant' first becomes 'discordant assay results' and is then
# picked up by the later 'miscellaneous' group.
disease_features = {
    'hyperthyroid': ['hyperthyroid', 'T3 toxic', 'toxic goitre', 'secondary toxic', 'goitre'],
    'hypothyroid': ['hypothyroid', 'primary hypothyroid', 'compensated hypothyroid', 'secondary hypothyroid'],
    'antithyroid treatment': ['antithyroid drugs', 'I131 treatment', 'surgery'],
    # 'underreplaced' / 'overreplaced' are the spellings produced by the
    # `disease_symbols` lookup (codes M and N); without them those decoded
    # values would never be grouped.
    'replacement therapy': ['consistent with replacement therapy',
                            'underreplacement', 'overreplacement',
                            'underreplaced', 'overreplaced'],
    'discordant assay results': ['discordant'],
    'binding protein': ['increased binding protein', 'decreased binding protein'],
    'miscellaneous': ['discordant assay results'],
    'negative': ['concurrent non-thyroidal illness'],
}
# 'consistent with replacement therapy decreased binding protein'

### 3.3 Mapping extra classes with their relative classes

In [286]:
# Collapse every fine-grained class into its group label. Groups are applied
# in dictionary order, so a label written by an earlier group can still be
# picked up by a later one, exactly as with the previous per-value loop.
# Iterating over items() gives the label directly; the former reverse lookup
# (finding the key by comparing member lists) returned the first key with an
# equal list and re-scanned the column once per member.
for group_label, members in disease_features.items():
    in_group = final_dataset['disease'].isin(members)
    final_dataset.loc[in_group, 'disease'] = group_label

### Checking the frequency of each class in disease feature

In [287]:
final_dataset['disease'].value_counts()

disease
negative        1848
hypothyroid       71
hyperthyroid      25
Name: count, dtype: int64

#### Combining other categories rather than negative, hyperthyroid or hypothyroid 

In [288]:
# Remaining group labels that are folded into a single 'other' class.
other_categories = {
    'other': ['binding protein', 'replacement therapy', 'antithyroid treatment'],
}

In [289]:
# Relabel every row whose class belongs to a group with that group's label.
# Iterating over items() gives the label directly; the former reverse lookup
# (finding the key by comparing member lists) returned the first key with an
# equal list and re-scanned the column once per member.
for group_label, members in other_categories.items():
    in_group = final_dataset['disease'].isin(members)
    final_dataset.loc[in_group, 'disease'] = group_label

In [290]:
final_dataset['disease'].value_counts()

disease
negative        1848
hypothyroid       71
hyperthyroid      25
Name: count, dtype: int64

### 3.2 Encoding disease features

In [291]:
# disease_encoded={cat:idx for idx, cat in enumerate(final_dataset['disease'].unique())}

# final_dataset['disease']=final_dataset['disease'].map(disease_encoded)


# Binary target: 0 for 'negative', 1 for every other class.
final_dataset['disease'] = (final_dataset['disease'] != 'negative').astype(int)

In [292]:
# disease_encoded

In [293]:
# disease_encoding={'negative': 1, 'increased binding protein': 2, 'decreased binding protein': 3, 
#                   'hyperthyroid': 4, 'hypothyroid': 5, 'replacement therapy': 6, 'underreplacement': 7, 
#                   'overreplacement': 8, 'discordant assay results': 9, 'sick': 10, 'miscellaneous': 11, 
#                   'concurrent non-thyroidal illness': 12, 'antithyroid treatment': 13}

# final_dataset['disease']=final_dataset['disease'].map(disease_encoding)

In [294]:
# final_dataset['disease']=np.where(final_dataset['disease']=='negative',0,1)

In [295]:
# Rows whose target is missing. After the 0/1 encoding above every row holds
# an integer, so this selection is expected to be empty.
final_dataset[final_dataset['disease'].isnull()]

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,referral_source,disease


### Dropping Unwanted features

In [296]:
measured_bool_features=[features for features in final_dataset.columns if '_measured' in features]

In [297]:
# Remove the indicator columns from the frame in place.
final_dataset.drop(columns=measured_bool_features, inplace=True)

# 5) Feature Selection

In [298]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest

In [299]:
# mutual_info_classif adds a small amount of random noise to continuous
# features, so its scores -- and possibly the selected columns -- change from
# run to run unless the seed is fixed.
MI_RANDOM_STATE = 42


def _seeded_mutual_info(X, y):
    """Score the features with mutual_info_classif using a fixed random seed."""
    return mutual_info_classif(X, y, random_state=MI_RANDOM_STATE)


# Keep the 6 features with the highest mutual-information score.
best_feature=SelectKBest(_seeded_mutual_info,k=6)

In [300]:
# Every column except the last one is used as a predictor; this relies on
# `disease` being the last column of the frame.
best_feature.fit(final_dataset.iloc[:,:-1],final_dataset['disease'])

In [301]:
best_feature.get_support()

array([False,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False,  True,  True,
        True, False,  True, False])

In [302]:
# Names of the predictor columns flagged by the selector. The cells that
# follow in this section do not read this variable.
feature_names=final_dataset.iloc[:,:-1].columns[best_feature.get_support()]

In [303]:
# NOTE(review): this hard-coded list has 7 columns and is not derived from the
# selector result above (k=6) -- presumably it mirrors the features used for
# training; confirm.
X=final_dataset[['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']]

In [304]:
# Target column (0/1 after the encoding step above).
Y=final_dataset['disease']

## 6) Data Transformation 

In [305]:
from sklearn.preprocessing import StandardScaler

In [306]:
# Scaler instance with the library's default settings.
scale=StandardScaler()

In [307]:
# Standardise the selected columns and wrap the result in a DataFrame with the
# original column names (the index is reset to 0..n-1 by the constructor).
# NOTE(review): the scaler is fitted on this very frame. The output file names
# suggest this is hold-out/test data, in which case the scaler fitted on the
# training data should presumably be reused instead -- confirm.
trans_x=pd.DataFrame(scale.fit_transform(X), columns=X.columns)

In [308]:
# Write the scaled features to the working directory (no index column).
trans_x.to_csv('Processed_testing_data2.csv',index=False)

In [309]:
# Write the target column to the working directory (no index column).
Y.to_csv('output_testing_data2.csv',index=False)

In [310]:
trans_x.head()

Unnamed: 0,age,sex,TSH,T3,TT4,T4U,FTI
0,-0.866378,-0.639293,1.543324,-0.349557,-1.204719,0.638624,-1.108246
1,0.610654,1.564228,-0.087373,0.643906,0.049846,-0.133249,0.112671
2,-1.393889,-0.639293,-0.051288,0.516539,-1.27223,-0.8634,-1.0472
3,0.083143,-0.639293,-0.193987,0.134438,1.090628,0.231826,0.967313
4,2.140437,-0.639293,-0.179225,-0.884499,0.387397,-0.759093,1.028359


In [311]:
trans_x.shape

(1944, 7)

In [312]:
trans_x.columns

Index(['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI'], dtype='object')