In [1]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json
import os
from cassandra.query import dict_factory
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [3]:
def connect_db():
    try:

      # This secure connect bundle is autogenerated when you download your SCB, 
      # if yours is different update the file name below
      cloud_config= {
        'secure_connect_bundle': './secure-connect-thyroid-detection-db.zip'
      }


      # This token JSON file is autogenerated when you download your token, 
      # if yours is different update the file name below
      with open(os.path.join("thyroid_detection_db_token.json")) as f:
          secrets = json.load(f)

      CLIENT_ID = secrets["clientId"]
      CLIENT_SECRET = secrets["secret"]

      auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
      cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
      session = cluster.connect()

      row= session.execute("select release_version from system.local").one()

      if row: 
        print("Connection established...")

      else:
        try:
            pass
        except Exception as e:
            print(e)
      
      return session
    
    except Exception as e:
      print(e)

In [4]:
def read_table():
    keyspace='thyroid_disease'
    table_name='raw_data'
    try:
      session=connect_db()
      session.row_factory=dict_factory
      row=session.execute(f"""SELECT * FROM {keyspace}.{table_name}""")
      return row
    except Exception as e:
      print(e)

In [5]:
data=read_table()[::]
dataframe=pd.DataFrame(data,).iloc[:,1:]

Connection established...


In [6]:
dataframe.head()

Unnamed: 0,age,disease,fti,fti_measured,goitre,hypopituitary,i131_treatment,lithium,on_antithyroid_medication,on_thyroxine,pregnant,psych,query_hyperthyroid,query_hypothyroid,query_on_thyroxine,referral_source,sex,sick,t3,t3_measured,t4u,t4u_measured,tbg,tbg_measured,thyroid_surgery,tsh,tsh_measured,tt4,tt4_measured,tumor
0,63,negative.|3080\n,97,t,f,f,f,f,f,f,f,f,f,f,f,SVI,F,f,2.2,t,1.03,t,?,f,f,0.88,t,101,t,f
1,49,negative.|947\n,88,t,f,f,f,f,f,f,f,f,f,f,f,other,F,f,1.8,t,0.91,t,?,f,f,1.9,t,81,t,f
2,45,negative.|2679\n,63,t,f,f,f,f,f,f,f,f,f,f,f,SVI,M,f,0.7,t,0.85,t,?,f,f,5.0,t,54,t,f
3,67,negative.|94\n,121,t,f,f,f,f,f,f,f,f,f,f,f,SVI,F,f,1.4,t,0.98,t,?,f,f,2.1,t,119,t,f
4,58,negative.|2568\n,85,t,f,f,f,f,f,f,f,f,f,f,f,SVI,F,f,1.2,t,0.88,t,?,f,f,0.17,t,75,t,f


In [7]:
dataframe.shape

(8571, 30)

In [8]:
dataframe.isnull().sum()

age                          0
disease                      0
fti                          0
fti_measured                 0
goitre                       0
hypopituitary                0
i131_treatment               0
lithium                      0
on_antithyroid_medication    0
on_thyroxine                 0
pregnant                     0
psych                        0
query_hyperthyroid           0
query_hypothyroid            0
query_on_thyroxine           0
referral_source              0
sex                          0
sick                         0
t3                           0
t3_measured                  0
t4u                          0
t4u_measured                 0
tbg                          0
tbg_measured                 0
thyroid_surgery              0
tsh                          0
tsh_measured                 0
tt4                          0
tt4_measured                 0
tumor                        0
dtype: int64

In [9]:
for features in dataframe.columns:
    print(f"{features}: {sum(dataframe[features]=='?')}")

age: 17
disease: 0
fti: 822
fti_measured: 0
goitre: 0
hypopituitary: 0
i131_treatment: 0
lithium: 0
on_antithyroid_medication: 0
on_thyroxine: 0
pregnant: 0
psych: 0
query_hyperthyroid: 0
query_hypothyroid: 0
query_on_thyroxine: 0
referral_source: 0
sex: 343
sick: 0
t3: 1752
t3_measured: 0
t4u: 827
t4u_measured: 0
tbg: 8559
tbg_measured: 0
thyroid_surgery: 0
tsh: 773
tsh_measured: 0
tt4: 471
tt4_measured: 0
tumor: 0


In [10]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8571 entries, 0 to 8570
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        8571 non-null   object
 1   disease                    8571 non-null   object
 2   fti                        8571 non-null   object
 3   fti_measured               8571 non-null   object
 4   goitre                     8571 non-null   object
 5   hypopituitary              8571 non-null   object
 6   i131_treatment             8571 non-null   object
 7   lithium                    8571 non-null   object
 8   on_antithyroid_medication  8571 non-null   object
 9   on_thyroxine               8571 non-null   object
 10  pregnant                   8571 non-null   object
 11  psych                      8571 non-null   object
 12  query_hyperthyroid         8571 non-null   object
 13  query_hypothyroid          8571 non-null   object
 14  query_on

In [11]:
dataframe.replace("?",np.nan,inplace=True)

In [12]:
dataframe.isnull().sum()

age                            17
disease                         0
fti                           822
fti_measured                    0
goitre                          0
hypopituitary                   0
i131_treatment                  0
lithium                         0
on_antithyroid_medication       0
on_thyroxine                    0
pregnant                        0
psych                           0
query_hyperthyroid              0
query_hypothyroid               0
query_on_thyroxine              0
referral_source                 0
sex                           343
sick                            0
t3                           1752
t3_measured                     0
t4u                           827
t4u_measured                    0
tbg                          8559
tbg_measured                    0
thyroid_surgery                 0
tsh                           773
tsh_measured                    0
tt4                           471
tt4_measured                    0
tumor         

In [13]:
num_features=['age','t4u','fti','tt4','tsh','t3','tbg']

In [14]:
dataframe[num_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8571 entries, 0 to 8570
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     8554 non-null   object
 1   t4u     7744 non-null   object
 2   fti     7749 non-null   object
 3   tt4     8100 non-null   object
 4   tsh     7798 non-null   object
 5   t3      6819 non-null   object
 6   tbg     12 non-null     object
dtypes: object(7)
memory usage: 468.9+ KB


In [15]:
for features in num_features:
    dataframe[features]=pd.to_numeric(dataframe[features],errors='coerce')

In [16]:
dataframe[num_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8571 entries, 0 to 8570
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     8553 non-null   float64
 1   t4u     7743 non-null   float64
 2   fti     7748 non-null   float64
 3   tt4     8099 non-null   float64
 4   tsh     7797 non-null   float64
 5   t3      6818 non-null   float64
 6   tbg     11 non-null     float64
dtypes: float64(7)
memory usage: 468.9 KB


In [17]:
cat_features=['sex','referral_source']

In [18]:
bool_features=[features for features in dataframe.columns if features not in num_features+cat_features and features!='disease']

In [19]:
for features in bool_features:
    print(f"{features}:{dataframe[features].unique()}")

fti_measured:['t' 'f' 'y' 't.4' 'FTI_measured']
goitre:['f' 't' 'f.10' 'goitre']
hypopituitary:['f' '' 't' 'f.12' 'hypopituitary']
i131_treatment:['f' 't' '' 'f.6' 'I131_treatment']
lithium:['f' 't' 'f.9' 'lithium']
on_antithyroid_medication:['f' 't' 'f.2' 'on_antithyroid_medication']
on_thyroxine:['f' 't' 'on_thyroxine']
pregnant:['f' 't' 'f.4' 'pregnant']
psych:['f' 't' '' 'f.13' 'psych']
query_hyperthyroid:['f' 't' 'f.8' 'query_hyperthyroid']
query_hypothyroid:['f' 't' 'f.7' 'query_hypothyroid']
query_on_thyroxine:['f' 't' 'f.1' 'query_on_thyroxine']
sick:['f' 't' 'f.3' 'sick']
t3_measured:['t' 'f' 'y' 'n' 't.1' 'T3_measured']
t4u_measured:['t' 'f' 'y' 't.3' 'T4U_measured']
tbg_measured:['f' 'n' 'y' 't' 'f.14' 'TBG_measured']
thyroid_surgery:['f' 't' 'f.5' 'thyroid_surgery']
tsh_measured:['t' 'f' 'y' 'n' 'TSH_measured']
tt4_measured:['t' 'f' 'y' 't.2' 'TT4_measured']
tumor:['f' 't' 'f.11' 'tumor']


In [20]:
bool_dict={'t':True, 'y':True,'f':False,'n':False}

for features in bool_features:
    dataframe[features]=dataframe[features].map(bool_dict)

In [21]:
for features in bool_features:
    print(f"{features}:{dataframe[features].unique()}")

fti_measured:[True False nan]
goitre:[False True nan]
hypopituitary:[False nan True]
i131_treatment:[False True nan]
lithium:[False True nan]
on_antithyroid_medication:[False True nan]
on_thyroxine:[False True nan]
pregnant:[False True nan]
psych:[False True nan]
query_hyperthyroid:[False True nan]
query_hypothyroid:[False True nan]
query_on_thyroxine:[False True nan]
sick:[False True nan]
t3_measured:[True False nan]
t4u_measured:[True False nan]
tbg_measured:[False True nan]
thyroid_surgery:[False True nan]
tsh_measured:[True False nan]
tt4_measured:[True False nan]
tumor:[False True nan]


In [22]:
dataframe[bool_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8571 entries, 0 to 8570
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   fti_measured               8569 non-null   object
 1   goitre                     8569 non-null   object
 2   hypopituitary              8418 non-null   object
 3   i131_treatment             8418 non-null   object
 4   lithium                    8569 non-null   object
 5   on_antithyroid_medication  8569 non-null   object
 6   on_thyroxine               8570 non-null   object
 7   pregnant                   8569 non-null   object
 8   psych                      8418 non-null   object
 9   query_hyperthyroid         8569 non-null   object
 10  query_hypothyroid          8569 non-null   object
 11  query_on_thyroxine         8569 non-null   object
 12  sick                       8569 non-null   object
 13  t3_measured                8569 non-null   object
 14  t4u_meas

In [23]:
for features in cat_features:
    print(dataframe[features].unique())

['F' 'M' nan 'sex']
['SVI' 'other' 'SVHC' 'STMW' 'SVHD' '' 'referral_source']


In [24]:
for features in cat_features:
    dataframe[features]=np.where(dataframe[features]==features,np.nan,dataframe[features])

In [25]:
for features in cat_features:
    print(dataframe[features].unique())

['F' 'M' nan]
['SVI' 'other' 'SVHC' 'STMW' 'SVHD' '' nan]


In [26]:
dataframe['disease']=np.where(dataframe['disease']=='disease',np.nan,dataframe['disease'])
dataframe['disease']=dataframe['disease'].str.split(".")
dataframe['disease']=[val[0].replace(".","") for val in dataframe['disease']]
dataframe['disease']=dataframe['disease'].str.split("[")
dataframe['disease']=[val[0].replace(".","") for val in dataframe['disease']]
dataframe['disease']=dataframe['disease'].str.strip()

In [27]:
dataframe['disease']

0                      negative
1                      negative
2                      negative
3                      negative
4                      negative
5                      negative
6                      negative
7                  hyperthyroid
8                      negative
9                      negative
10                     negative
11                     negative
12                     negative
13                     negative
14                     negative
15                     negative
16                     negative
17                  hypothyroid
18                  hypothyroid
19                 hyperthyroid
20                     negative
21                  hypothyroid
22                     negative
23                     negative
24                     negative
25                     negative
26                 hyperthyroid
27                     negative
28                  hypothyroid
29                     negative
30                     negative
31      

In [28]:
dataframe.head()

Unnamed: 0,age,disease,fti,fti_measured,goitre,hypopituitary,i131_treatment,lithium,on_antithyroid_medication,on_thyroxine,pregnant,psych,query_hyperthyroid,query_hypothyroid,query_on_thyroxine,referral_source,sex,sick,t3,t3_measured,t4u,t4u_measured,tbg,tbg_measured,thyroid_surgery,tsh,tsh_measured,tt4,tt4_measured,tumor
0,63.0,negative,97.0,True,False,False,False,False,False,False,False,False,False,False,False,SVI,F,False,2.2,True,1.03,True,,False,False,0.88,True,101.0,True,False
1,49.0,negative,88.0,True,False,False,False,False,False,False,False,False,False,False,False,other,F,False,1.8,True,0.91,True,,False,False,1.9,True,81.0,True,False
2,45.0,negative,63.0,True,False,False,False,False,False,False,False,False,False,False,False,SVI,M,False,0.7,True,0.85,True,,False,False,5.0,True,54.0,True,False
3,67.0,negative,121.0,True,False,False,False,False,False,False,False,False,False,False,False,SVI,F,False,1.4,True,0.98,True,,False,False,2.1,True,119.0,True,False
4,58.0,negative,85.0,True,False,False,False,False,False,False,False,False,False,False,False,SVI,F,False,1.2,True,0.88,True,,False,False,0.17,True,75.0,True,False


In [29]:
dataframe.to_csv("raw_data.csv",index=False)


In [30]:
from sklearn.model_selection import train_test_split

In [31]:
train,test= train_test_split(dataframe,test_size=0.2,random_state=0)

In [32]:
print("Train data shape:",train.shape,", Test data shape:",test.shape)

Train data shape: (6856, 30) , Test data shape: (1715, 30)


In [33]:
train.to_csv('train.csv',index=False)
test.to_csv('test.csv',index=False)