In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold


# Load the raw thyroid dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer="../data/thyroidDF.csv")



In [2]:
# Per-column count of missing values in the raw data (summed down the rows)
data.isna().sum(axis=0)

age                       0
sex                     307
on_thyroxine              0
query_on_thyroxine        0
on_antithyroid_meds       0
sick                      0
pregnant                  0
thyroid_surgery           0
I131_treatment            0
query_hypothyroid         0
query_hyperthyroid        0
lithium                   0
goitre                    0
tumor                     0
hypopituitary             0
psych                     0
TSH_measured              0
TSH                     842
T3_measured               0
T3                     2604
TT4_measured              0
TT4                     442
T4U_measured              0
T4U                     809
FTI_measured              0
FTI                     802
TBG_measured              0
TBG                    8823
referral_source           0
target                    0
patient_id                0
dtype: int64

In [3]:
"""###################################################################################################"""

# Get some information on the data.
# Each result is printed explicitly: a notebook cell only renders its LAST
# expression, so bare expressions in the middle of a cell are evaluated and
# silently discarded.
print(data.describe())
print(data.dtypes)
print(data.head(5))
print(data.shape)
print(data.isna().sum(axis=0))
print(data['target'].value_counts())

# Complete-case subset: only the rows that have no missing value in any column
data_comp = data.dropna()
print(data_comp.shape)

"""###################################################################################################"""

# Distribution plot before cleaning: one histogram per numeric column
# (patient_id excluded), laid out on a two-column grid and saved to disk.

# 'number' selects every numeric dtype. The strings 'int'/'float' resolve to
# the platform's default widths, which can miss 64-bit integer columns on
# platforms where the default integer is 32-bit.
numerical_columns = data.select_dtypes(include='number').columns
numerical_columns = [col for col in numerical_columns if col != 'patient_id']
num_plots = len(numerical_columns)
num_rows = (num_plots + 1) // 2  # Two plots per row, rounded up

# Size the grid from the data instead of hard-coding 4 rows. squeeze=False
# keeps `axs` two-dimensional even when there is a single row, so the
# axs[row, col] indexing below is always valid.
fig, axs = plt.subplots(num_rows, 2, figsize=(10, 3 * num_rows), squeeze=False)
for i, column in enumerate(numerical_columns):
    row = i // 2
    col = i % 2
    sns.histplot(data[column], ax=axs[row, col])
    axs[row, col].set_title(f'{column} distribution plot')

# An odd number of plots leaves the bottom-right cell unused: remove it
if num_plots % 2 == 1:
    fig.delaxes(axs[-1, -1])
plt.tight_layout()
plt.savefig('../output/displots/displots_init/displots_init.png', dpi = 600)
plt.clf()

# Upper bounds per column; any value above its bound is treated as an outlier.
# NOTE(review): the T4U bound (1000) is far larger than the others -- confirm
# it is intended and not a typo, by checking it against the T4U range in the data.
thresholds = {
    'age': 110,
    'TSH': 100,
    'T3': 10,
    'TT4': 320,
    'T4U': 1000,
    'FTI': 400,
    'TBG': 125
}
# Replace out-of-range values with the column's most frequent value
for column, threshold in thresholds.items():
    # Calculate the most frequent value in the column (NaN is ignored by mode())
    most_frequent_value = data[column].mode()[0]
    # Boolean mask of values above the threshold (NaN compares False, so it is left alone)
    mask = data[column] > threshold
    data.loc[mask, column] = most_frequent_value

# Keep the first row per patient. The explicit copy makes `data` an independent
# frame so the column assignment below does not write into a derived view.
data = data.drop_duplicates('patient_id').copy()

# Select only numerical columns for imputation
data_numerical = data[numerical_columns]

# Perform most_frequent imputation on numerical data only
imputer = SimpleImputer(strategy='most_frequent')
# fit_transform returns a bare array. Rebuild the frame on the SAME index as
# `data`: after drop_duplicates the index labels may no longer be 0..n-1, and a
# default RangeIndex would be aligned label-by-label on assignment, writing
# values to the wrong rows and leaving NaN where labels do not match.
data_imputed_numerical = pd.DataFrame(
    imputer.fit_transform(data_numerical),
    columns=data_numerical.columns,
    index=data_numerical.index,
)

# Replace original numerical columns with imputed values
data[numerical_columns] = data_imputed_numerical

<Figure size 720x864 with 0 Axes>

In [4]:
# Re-check per-column missing-value counts after the numerical imputation
data.isna().sum(axis=0)

age                      0
sex                    307
on_thyroxine             0
query_on_thyroxine       0
on_antithyroid_meds      0
sick                     0
pregnant                 0
thyroid_surgery          0
I131_treatment           0
query_hypothyroid        0
query_hyperthyroid       0
lithium                  0
goitre                   0
tumor                    0
hypopituitary            0
psych                    0
TSH_measured             0
TSH                      0
T3_measured              0
T3                       0
TT4_measured             0
TT4                      0
T4U_measured             0
T4U                      0
FTI_measured             0
FTI                      0
TBG_measured             0
TBG                      0
referral_source          0
target                   0
patient_id               0
dtype: int64

In [5]:
# `df` is an alias of `data` (same object), so the fill below also updates `data`.
df = data

# For missing sex values: fill them so that the M/F split among the filled rows
# matches the M/F split observed among the rows where sex is known.
male_count = df[df["sex"] == "M"].shape[0]
female_count = df[df["sex"] == "F"].shape[0]
ratio = male_count / female_count  # males per female among known values

sex_is_missing = df["sex"].isnull()
missing_sex_count = int(sex_is_missing.sum())

# Share of males among known values = male / (male + female) = ratio / (ratio + 1).
# (missing / (ratio + 1) would be the FEMALE share, which inverts the proportions.)
male_share = male_count / (male_count + female_count)
missing_male_count = int(round(missing_sex_count * male_share))
missing_female_count = missing_sex_count - missing_male_count

# Labels are assigned positionally: the first `missing_male_count` missing rows
# (in index order) get "M", the remaining ones get "F".
# Skip the assignment when nothing is missing so the cell can be re-run safely.
if missing_sex_count > 0:
    df.loc[sex_is_missing, "sex"] = ["M"] * missing_male_count + ["F"] * missing_female_count

In [6]:
# Final check: per-column missing-value counts after filling in `sex`
df.isna().sum(axis=0)

age                    0
sex                    0
on_thyroxine           0
query_on_thyroxine     0
on_antithyroid_meds    0
sick                   0
pregnant               0
thyroid_surgery        0
I131_treatment         0
query_hypothyroid      0
query_hyperthyroid     0
lithium                0
goitre                 0
tumor                  0
hypopituitary          0
psych                  0
TSH_measured           0
TSH                    0
T3_measured            0
T3                     0
TT4_measured           0
TT4                    0
T4U_measured           0
T4U                    0
FTI_measured           0
FTI                    0
TBG_measured           0
TBG                    0
referral_source        0
target                 0
patient_id             0
dtype: int64