# Thyroid
## Load Dependencies

In [138]:
import pandas as pd
!#pip install GitPython
import git
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

## Copy Github Repo

In [139]:
def download_github_repository(repo_url, destination_folder):
    try:
        git.Git(destination_folder).clone(repo_url)
        print(f"Repository successfully downloaded to {destination_folder}")
    except git.exc.GitCommandError as e:
        print(f"Error downloading the repository: {e}")

In [140]:
repo_owner = 'Riverag0011'
repo_name = 'ADS505_Team-4'
repository_url = f"https://github.com/{repo_owner}/{repo_name}"
destination_folder = "/content/sample_data"
download_github_repository(repository_url, destination_folder)

Error downloading the repository: Cmd('git') failed due to: exit code(128)
  cmdline: git clone https://github.com/Riverag0011/ADS505_Team-4
  stderr: 'fatal: destination path 'ADS505_Team-4' already exists and is not an empty directory.'


## CSV to Pandas

In [141]:
csv_name = 'thyroidDF'
df = pd.read_csv(f'{destination_folder}/ADS505_Team-4/Raw Data/{csv_name}.csv')
df.columns = df.columns.str.replace(' ', '_')
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047


## Drop Columns
###### use your brain to rid df of columns that arnt helpful

In [142]:
columns_to_drop = ['patient_id', 'target', 'referral_source', 'query_hypothyroid', 'query_hyperthyroid']
df = df.drop(columns=columns_to_drop).copy()

## All datatypes


## Adjust Datatypes and drop NAs


In [145]:
percent = round(df.isna().sum()/df.shape[0]*100, 1)
sum = df.isna().sum()
dtype = df.dtypes
na = pd.DataFrame({'percent': percent, 'sum': sum, 'datatype': dtype})
na

Unnamed: 0,percent,sum,datatype
age,0.0,0,float64
sex,3.3,307,object
on_thyroxine,0.0,0,object
query_on_thyroxine,0.0,0,object
on_antithyroid_meds,0.0,0,object
sick,0.0,0,object
pregnant,0.0,0,object
thyroid_surgery,0.0,0,object
I131_treatment,0.0,0,object
lithium,0.0,0,object


## change age to float because it is continuous and then scale using Standard Scaler()

In [146]:
df['age'] = df['age'].astype(float)
scaler = StandardScaler()
df['age'] = scaler.fit_transform(df[['age']])

### 'sex' should be ant int with only 0's and 1's. missing rows are only 3 percent of data so lets drop them as well.

In [147]:
df['sex'].value_counts()

F    6073
M    2792
Name: sex, dtype: int64

In [148]:
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
df['sex'].value_counts()

0    6073
1    2792
2     307
Name: sex, dtype: int64

In [149]:
df = df[df['sex'] != 2]

In [150]:
df['sex'].value_counts()

0    6073
1    2792
Name: sex, dtype: int64

## regroup and look at df again

In [151]:
percent = round(df.isna().sum()/df.shape[0]*100, 1)
sum = df.isna().sum()
dtype = df.dtypes
na = pd.DataFrame({'percent': percent, 'sum': sum, 'datatype': dtype})
na

Unnamed: 0,percent,sum,datatype
age,0.0,0,float64
sex,0.0,0,int64
on_thyroxine,0.0,0,object
query_on_thyroxine,0.0,0,object
on_antithyroid_meds,0.0,0,object
sick,0.0,0,object
pregnant,0.0,0,object
thyroid_surgery,0.0,0,object
I131_treatment,0.0,0,object
lithium,0.0,0,object


## Lets transform the rest of the boolean features in the same manner as we did for sex. They are all currently objects so lets retrive them this way and transform them all with the label encoder in a similar manner to 'sex'

In [152]:
df['on_thyroxine'].value_counts()

f    7646
t    1219
Name: on_thyroxine, dtype: int64

In [153]:
dtype_obj = df.select_dtypes(include=['object']).columns.tolist()

In [154]:
for col in dtype_obj:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [155]:
for col in dtype_obj:
  x = df[col].value_counts()
  print(x)

0    7646
1    1219
Name: on_thyroxine, dtype: int64
0    8714
1     151
Name: query_on_thyroxine, dtype: int64
0    8749
1     116
Name: on_antithyroid_meds, dtype: int64
0    8531
1     334
Name: sick, dtype: int64
0    8762
1     103
Name: pregnant, dtype: int64
0    8734
1     131
Name: thyroid_surgery, dtype: int64
0    8701
1     164
Name: I131_treatment, dtype: int64
0    8775
1      90
Name: lithium, dtype: int64
0    8781
1      84
Name: goitre, dtype: int64
0    8629
1     236
Name: tumor, dtype: int64
0    8863
1       2
Name: hypopituitary, dtype: int64
0    8459
1     406
Name: psych, dtype: int64
1    8052
0     813
Name: TSH_measured, dtype: int64
1    6344
0    2521
Name: T3_measured, dtype: int64
1    8445
0     420
Name: TT4_measured, dtype: int64
1    8090
0     775
Name: T4U_measured, dtype: int64
1    8097
0     768
Name: FTI_measured, dtype: int64
0    8534
1     331
Name: TBG_measured, dtype: int64


## regroup

In [156]:
percent = round(df.isna().sum()/df.shape[0]*100, 1)
sum = df.isna().sum()
dtype = df.dtypes
na = pd.DataFrame({'percent': percent, 'sum': sum, 'datatype': dtype})
na

Unnamed: 0,percent,sum,datatype
age,0.0,0,float64
sex,0.0,0,int64
on_thyroxine,0.0,0,int64
query_on_thyroxine,0.0,0,int64
on_antithyroid_meds,0.0,0,int64
sick,0.0,0,int64
pregnant,0.0,0,int64
thyroid_surgery,0.0,0,int64
I131_treatment,0.0,0,int64
lithium,0.0,0,int64


## NA
### Due to the presecnce of na values and the redududancy of information the following 6 columns will be dropped.

In [157]:
df.select_dtypes(include=['float64']).columns.tolist()

['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']

In [158]:
columns_to_drop = ['TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
df = df.drop(columns=columns_to_drop).copy()

# Final DF

In [160]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,T3_measured,TT4_measured,T4U_measured,FTI_measured,TBG_measured
0,-0.037634,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,-0.037634,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0
2,-0.027499,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,-0.031722,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,-0.035100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9167,-0.014829,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
9168,-0.043547,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
9169,-0.003848,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
9170,-0.022431,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
