In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance

# read the data
data = pd.read_csv("../data/thyroidDF.csv")

"""###################################################################################################"""

# get some information on the data
print("raw data description:\n" , data.describe())
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# dtypes and the first rows are printed explicitly to actually show up.
print("raw data dtypes:\n", data.dtypes)
print("raw data head:\n", data.head(5))
print("raw data shape:" , data.shape)
print("raw data NaN:\n", data.isna().sum(axis=0))
print("raw data target:\n",data['target'].value_counts())
# Rows that have no missing value in any column.
data_comp = data.dropna()
print("raw data dropna:", data_comp.shape)
print("raw data duplicates", data.duplicated().sum())

"""###################################################################################################"""

# Distribution plot before cleaning: one histogram per numeric column
# (patient_id is excluded), laid out two per row.
numerical_columns = data.select_dtypes(include=['int', 'float']).columns
numerical_columns = [col for col in numerical_columns if col != 'patient_id']
num_plots = len(numerical_columns)
num_rows = (num_plots + 1) // 2  # Calculate the number of rows
# Size the grid from the number of columns instead of a fixed 4x2 so that more
# than eight columns do not overflow it and fewer do not leave blank rows.
# squeeze=False keeps axs two-dimensional even when only one row is needed.
grid_rows = max(num_rows, 1)
fig, axs = plt.subplots(grid_rows, 2, figsize=(10, 3 * grid_rows), squeeze=False)
for i, column in enumerate(numerical_columns):
    row = i // 2
    col = i % 2
    sns.histplot(data[column], ax=axs[row, col], bins=100)
    axs[row, col].set_title(f'{column} distribution plot')
# Drop every grid cell that did not receive a plot.
for unused_ax in axs.ravel()[num_plots:]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.savefig('../output/displots/displots_init/displots_init.png', dpi = 600)
plt.clf()

# Upper limit per column; any value strictly above its limit is overwritten below.
thresholds = dict(
    age=110,
    TSH=20,
    T3=10,
    TT4=320,
    T4U=1000,
    FTI=400,
    TBG=50,
)
for col_name in thresholds:
    upper_limit = thresholds[col_name]
    # The replacement is the column's most frequent value, taken before any
    # value of this column has been overwritten.
    replacement = data[col_name].mode()[0]
    # Missing values compare as False here and are therefore left untouched.
    data.loc[data[col_name] > upper_limit, col_name] = replacement

# Keep a single row per patient_id (the first occurrence).
data = data.drop_duplicates('patient_id')

# Select only numerical columns for imputation
data_numerical = data[numerical_columns]

# Perform most_frequent imputation on numerical data only
imputer = SimpleImputer(strategy='most_frequent')
# The imputer returns a bare array. The frame built from it must carry the same
# row labels as `data`: drop_duplicates leaves gaps in the index, and the
# assignment below aligns on labels, so a fresh 0..n-1 index would put values
# on the wrong rows and reintroduce NaN for labels it does not contain.
data_imputed_numerical = pd.DataFrame(
    imputer.fit_transform(data_numerical),
    columns=data_numerical.columns,
    index=data_numerical.index,
)

# Replace original numerical columns with imputed values
data[numerical_columns] = data_imputed_numerical

# For missing sex values: fill them so that the male/female proportion seen
# among the known values is preserved.
male_count = int((data["sex"] == "M").sum())
female_count = int((data["sex"] == "F").sum())
known_sex_count = male_count + female_count
# Male-to-female ratio, kept for reference; guarded so a frame without any
# female rows does not raise ZeroDivisionError.
ratio = male_count / female_count if female_count else float("inf")

missing_sex_mask = data["sex"].isnull()
missing_sex_count = int(missing_sex_mask.sum())
# Share of males among the rows with a known sex. The earlier formula
# missing / (ratio + 1) is the *female* share, so it assigned the proportions
# the wrong way round. With no known values at all, split evenly.
male_share = male_count / known_sex_count if known_sex_count else 0.5
missing_male_count = int(round(missing_sex_count * male_share))
missing_female_count = missing_sex_count - missing_male_count

if missing_sex_count:
    # Values are assigned in row order: the first missing rows become "M", the rest "F".
    data.loc[missing_sex_mask, "sex"] = ["M"] * missing_male_count + ["F"] * missing_female_count

# Collapse the raw diagnosis codes in 'target' into seven grouped labels.
# Codes that appear in no group are left exactly as they are.
target_groups = {
    '1 - hyperthyroid conditions': ['A', 'B', 'C', 'AK', 'D'],
    '2 - hypothyroid conditions': ['E', 'F', 'FK', 'G', 'GI', 'GK', 'GKJ'],
    '3 - binding protein': ['I', 'J', 'C|I'],
    '4 - general health': ['K', 'KJ', 'H|K'],
    '5 - replacement therapy': ['L', 'M', 'MK', 'N', 'MI', 'LJ'],
    '6 - antithyroid treatment': ['O', 'P', 'Q', 'OI'],
    '7 - miscellaneous': ['R', 'S', 'T', 'D|R'],
}
for group_label, raw_codes in target_groups.items():
    # isin matches whole values only, so a code such as 'C|I' is compared literally.
    data.loc[data['target'].isin(raw_codes), 'target'] = group_label


# Distribution plots after cleaning: one histogram per numeric column
# (patient_id is excluded), laid out two per row.
numerical_columns = data.select_dtypes(include=['int', 'float']).columns
numerical_columns = [col for col in numerical_columns if col != 'patient_id']
num_plots = len(numerical_columns)
num_rows = (num_plots + 1) // 2  # Calculate the number of rows
# Size the grid from the number of columns instead of a fixed 4x2 so that more
# than eight columns do not overflow it and fewer do not leave blank rows.
# squeeze=False keeps axs two-dimensional even when only one row is needed.
grid_rows = max(num_rows, 1)
fig, axs = plt.subplots(grid_rows, 2, figsize=(10, 3 * grid_rows), squeeze=False)
for i, column in enumerate(numerical_columns):
    row = i // 2
    col = i % 2
    sns.histplot(data[column], ax=axs[row, col], bins=100)
    axs[row, col].set_title(f'{column} distribution plot')
# Drop every grid cell that did not receive a plot.
for unused_ax in axs.ravel()[num_plots:]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.savefig('../output/displots/displots_cleared/displots_cleaned.png', dpi = 600)
# The figure is cleared rather than closed, so it stays the current figure.
plt.clf()

# get some information on the data after cleaning
print("cleared data description:\n" , data.describe())
# A bare expression in the middle of a cell is evaluated and thrown away, so the
# dtypes and the first rows are printed explicitly to actually show up.
print("cleared data dtypes:\n", data.dtypes)
print("cleared data head:\n", data.head(5))
print("cleared data shape:" , data.shape)
# Colon placed before the newline so the labels match the raw-data report.
print("cleared data NaN:\n", data.isna().sum(axis=0))
print("cleared data target:\n",data['target'].value_counts())
# Rows that have no missing value in any column.
data_comp = data.dropna()
print("cleared data dropna:", data_comp.shape)

"""###################################################################################################"""

# Exploratory plot 1: age histogram restricted to rows whose 'tumor' flag is 't'.
has_tumor = data['tumor'] == 't'
data_with_tumor = data.loc[has_tumor]
age_bins = range(0, 100, 10)  # ten-year bins with edges 0, 10, ..., 90
sns.histplot(
    data=data_with_tumor,
    x='age',
    bins=age_bins,
    hue='tumor',
    multiple='stack',
)
plt.tight_layout()
plt.savefig("../output/data_with_tumor.png")
plt.clf()

# Exploratory plot 2: age histogram over all rows.
sns.histplot(
    data=data,
    x="age",
    color="gray",
)
plt.tight_layout()
plt.savefig("../output/age_distribution.png")
plt.clf()

# Distribution plots for every object-dtype column, four per row.
binary_columns = data.select_dtypes(include=['object']).columns
bin_plots = len(binary_columns)
bin_rows = (bin_plots + 3) // 4  # Calculate the number of rows for 4 columns

# squeeze=False keeps axs two-dimensional even when a single row is enough;
# otherwise axs[row, col] below fails on the 1-D array subplots would return.
fig, axs = plt.subplots(max(bin_rows, 1), 4, figsize=(16, 12), squeeze=False)

for i, column in enumerate(binary_columns):
    row = i // 4  # Adjust for 4 columns
    col = i % 4  # Adjust for 4 columns
    sns.histplot(data[column], ax=axs[row, col])
    axs[row, col].set_title(f'{column} distribution plot')

# Remove empty subplots (the trailing cells of the last row)
for unused_ax in axs.ravel()[bin_plots:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.savefig('../output/displots/displots_binstr/binary_displots.png', dpi=600)
plt.clf()

# Pairwise scatter/histogram grid over the float64 columns (patient_id excluded).
# NOTE(review): the name `vars` shadows the Python builtin; it is kept because
# code outside this view may rely on it.
vars = []
for col_name, col_dtype in data.dtypes.items():
    if col_dtype == 'float64' and col_name != 'patient_id':
        vars.append(col_name)
pplot = sns.pairplot(
    data=data,
    x_vars=vars,
    y_vars=vars,
)
plt.tight_layout()
plt.savefig("../output/pair_plot.png")
plt.clf()
"""###################################################################################################"""

# Expand 'sex' into one indicator column per observed value.
data = pd.get_dummies(data, columns=['sex'])
# Encode every object column whose values are exactly {'t', 'f'}.
# The column list is captured once, before `data` is rebound inside the loop.
for flag_col in list(data.columns):
    if data[flag_col].dtype != 'object':
        continue
    if set(data[flag_col].unique()) != {'t', 'f'}:
        continue
    # 't' -> 1, anything else -> 0
    data[flag_col] = data[flag_col].apply(lambda flag: int(flag == 't'))
    # NOTE(review): running get_dummies on the 0/1 column replaces it with two
    # complementary columns, '<name>_0' and '<name>_1', appended at the end.
    data = pd.get_dummies(data, columns=[flag_col])

raw data description:
                 age          TSH           T3          TT4          T4U  \
count   9172.000000  8330.000000  6568.000000  8730.000000  8363.000000   
mean      73.555822     5.218403     1.970629   108.700305     0.976056   
std     1183.976718    24.184006     0.887579    37.522670     0.200360   
min        1.000000     0.005000     0.050000     2.000000     0.170000   
25%       37.000000     0.460000     1.500000    87.000000     0.860000   
50%       55.000000     1.400000     1.900000   104.000000     0.960000   
75%       68.000000     2.700000     2.300000   126.000000     1.065000   
max    65526.000000   530.000000    18.000000   600.000000     2.330000   

               FTI         TBG    patient_id  
count  8370.000000  349.000000  9.172000e+03  
mean    113.640746   29.870057  8.529473e+08  
std      41.551650   21.080504  7.581969e+06  
min       1.400000    0.100000  8.408010e+08  
25%      93.000000   21.000000  8.504090e+08  
50%     109.000000 

<Figure size 1000x1200 with 0 Axes>

<Figure size 1000x1200 with 0 Axes>

<Figure size 1600x1200 with 0 Axes>

<Figure size 1750x1750 with 0 Axes>

In [7]:
"#################SVM##############################################"
# SVM
df = data  # same object as `data`, not a copy

# Split into target (y) and predictors (X); the identifier column and TBG are
# not used as predictors.
y = df['target']
X = df.drop(columns=['target', 'patient_id', 'TBG'])

print(y.value_counts())

# One-hot encode whatever object-typed predictor columns remain.
object_cols = X.select_dtypes(include=['object']).columns.tolist()
X = pd.get_dummies(X, columns=object_cols)

##### feature selection #####

# Candidate k values, 7 through 19. Adjust the range to trade off computational
# cost; the author's note records 17 as the best value.
k_values = range(7, 20)
# Running best, presumably updated by the search that follows.
best_f1_score, best_result = 0.0, None

-    8285
+     887
Name: target, dtype: int64
