In [113]:
import pandas as pd
import statsmodels
import math
import numpy as np
from scipy.stats import shapiro, f_oneway, chi2_contingency, kruskal
import matplotlib.pyplot as plt

In [114]:
columns = [
    "age", 
    "sex",
    "on thyroxine",                 
    "query on thyroxine",
    "on antithyroid medication",
    "sick",
    "pregnant",
    "thyroid surgery",
    "I131 treatment",
    "query hypothyroid",
    "query hyperthyroid",
    "lithium",
    "goitre",
    "tumor",
    "hypopituitary",
    "psych",
    # "as",
    "TSH measured",
    "TSH",
    "T3 measured",
    "T3",
    "TT4 measured",
    "TT4",
    "T4U measured",
    "T4U",
    "FTI measured",
    "FTI",
    "TBG measured",
    "TBG",
    "referral source",
    "increased binding protein, decreased binding protein"
]

# df.Class= df.Class.str.split(".").iloc[:,0]



len(columns)

30

In [115]:
allbp = pd.read_csv("./allbp.data",na_values="?",names=columns, index_col=False)
print(allbp.shape)

allbp.head()



(2800, 30)


Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,"increased binding protein, decreased binding protein"
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative.|3733
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative.|1442
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative.|2965
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative.|806
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative.|2807


In [121]:
categorical = {
    "sex":	                        ("M", "F"),
    "on thyroxine":	                ("f", "t"),
    "query on thyroxine":           ("f", "t"),
    "on antithyroid medication":    ("f", "t"),
    "sick":	                        ("f", "t"),
    "pregnant":			            ("f", "t"),
    "thyroid surgery":		        ("f", "t"),
    "I131 treatment":			    ("f", "t"),
    "query hypothyroid":		    ("f", "t"),
    "query hyperthyroid":		    ("f", "t"),
    "lithium":			            ("f", "t"),
    "goitre":				        ("f", "t"),
    "tumor":				        ("f", "t"),
    "hypopituitary":			    ("f", "t"),
    "psych":				        ("f", "t"),
    "TSH measured":			        ("f", "t"),
    "T3 measured":			        ("f", "t"),
    "TT4 measured":			        ("f", "t"),
    "T4U measured":			        ("f", "t"),
    "FTI measured":			        ("f", "t"),
    "TBG measured":			        ("f", "t"),
    "referral source":		        ("WEST", "STMW", "SVHC", "SVI", "SVHD", "other")
}

quantitative =  [x for x in columns if not x in categorical]
quantitative.pop()
quantitative


['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']

In [117]:
for column in categorical:
    print(column)
    try: 
        s = pd.Categorical(allbp[column]).rename_categories(categorical[column])
        allbp[column] = s
    except:
        # Do nothing
        print()

print(allbp.dtypes)  

sex
on thyroxine
query on thyroxine
on antithyroid medication
sick
pregnant
thyroid surgery
I131 treatment
query hypothyroid
query hyperthyroid
lithium
goitre
tumor
hypopituitary
psych
TSH measured
T3 measured
TT4 measured
T4U measured
FTI measured
TBG measured

referral source

age                                                      float64
sex                                                     category
on thyroxine                                            category
query on thyroxine                                      category
on antithyroid medication                               category
sick                                                    category
pregnant                                                category
thyroid surgery                                         category
I131 treatment                                          category
query hypothyroid                                       category
query hyperthyroid                                      category
lithi

In [118]:
allbp.isna().sum()

age                                                        1
sex                                                      110
on thyroxine                                               0
query on thyroxine                                         0
on antithyroid medication                                  0
sick                                                       0
pregnant                                                   0
thyroid surgery                                            0
I131 treatment                                             0
query hypothyroid                                          0
query hyperthyroid                                         0
lithium                                                    0
goitre                                                     0
tumor                                                      0
hypopituitary                                              0
psych                                                      0
TSH measured            

In [125]:

print(allbp[quantitative].mean())
allbp[quantitative].std()

age     51.844230
TSH      4.672150
T3       2.024966
TT4    109.072401
T4U      0.997912
FTI    110.787984
TBG           NaN
dtype: float64


age    20.461160
TSH    21.449453
T3      0.824600
TT4    35.392443
T4U     0.194390
FTI    32.883986
TBG          NaN
dtype: float64

In [133]:
from locale import normalize


cate = [x for x in categorical]
allbp[cate].value_counts()


fn = lambda col: pd.Series({
    "freq": col.value_counts()[0],
    "rel_freq": col.value_counts(normalize=True)[0]
})





allbp[cate].apply(fn) 

# TBG remove later 

# allbp.describe()

Unnamed: 0,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,query hyperthyroid,...,tumor,hypopituitary,psych,TSH measured,T3 measured,TT4 measured,T4U measured,FTI measured,TBG measured,referral source
freq,1830.0,2470.0,2760.0,2766.0,2690.0,2759.0,2761.0,2752.0,2637.0,2627.0,...,2729.0,2799.0,2665.0,2516.0,2215.0,2616.0,2503.0,2505.0,2800.0,1632.0
rel_freq,0.680297,0.882143,0.985714,0.987857,0.960714,0.985357,0.986071,0.982857,0.941786,0.938214,...,0.974643,0.999643,0.951786,0.898571,0.791071,0.934286,0.893929,0.894643,1.0,0.582857


In [None]:
# Take t == cate sum() 
# count / df.shape[0]
# unstack and dropna ?