In [None]:
# Location of the CSV file that the notebook loads.
FILE_PATH = "hypothyroid.csv"

In [None]:
# All the imports
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Classification Algorithms to Use - 
# 1. Logistic Regression
# 2. KNN
# 3. Kernel SVM
# 4. Random Forest 
# 5. XGBoost
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from  xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler




In [None]:
# Reading the Data
thyroid_dataset = pd.read_csv(FILE_PATH)

In [None]:
thyroid_dataset

In [None]:
thyroid_dataset.rename(columns={'binaryClass':'Labels'},inplace=True)

In [None]:

thyroid_dataset.describe().T


In [None]:
thyroid_dataset.info()

In [None]:
# Encode the target column: "P" becomes 0 and "N" becomes 1.
label_codes = {"P": 0, "N": 1}
thyroid_dataset["Labels"] = thyroid_dataset["Labels"].map(label_codes)

# Recode the "t"/"f" markers as 1/0 wherever they appear in the frame.
flag_codes = {"t": 1, "f": 0}
thyroid_dataset = thyroid_dataset.replace(flag_codes)

In [None]:
thyroid_dataset['sex'].isnull().sum()

In [None]:
thyroid_dataset["TBG"].value_counts()


In [None]:
del thyroid_dataset["TBG"]

In [None]:
# Turn the "?" placeholders into real missing values so pandas can count and
# impute them. np.nan is the canonical spelling; the upper-case alias used
# previously is a legacy name that is not guaranteed to exist in newer NumPy
# releases.
thyroid_dataset = thyroid_dataset.replace({"?": np.nan})

# Per-column count of missing entries after the replacement.
thyroid_dataset.isnull().sum()


In [None]:
thyroid_dataset["sex"].value_counts()


In [None]:
thyroid_dataset = thyroid_dataset.replace({"F":1,"M":0})

In [None]:
thyroid_dataset["referral source"].value_counts()


In [None]:
del thyroid_dataset["referral source"]

In [None]:
# Frequency of each distinct value in 'T3 measured'.
thyroid_dataset["T3 measured"].value_counts()

In [None]:
# Frequency of each distinct value in 'TT4 measured'.
thyroid_dataset["TT4 measured"].value_counts()

In [None]:
# Frequency of each distinct value in 'FTI measured'.
thyroid_dataset["FTI measured"].value_counts()

In [None]:
# Frequency of each distinct value in 'TBG measured'.
thyroid_dataset["TBG measured"].value_counts()


In [None]:
# Frequency of each encoded value in the target column 'Labels'.
thyroid_dataset["Labels"].value_counts()

In [None]:
# Column dtypes at this point, before the numeric conversion that follows.
thyroid_dataset.dtypes


In [None]:
# Any column still stored as 'object' is parsed as numeric; entries that
# cannot be parsed become NaN (errors='coerce').
object_mask = thyroid_dataset.dtypes == 'object'
col_Names = thyroid_dataset.columns[object_mask]
numeric_part = thyroid_dataset[col_Names].apply(pd.to_numeric, errors='coerce')
thyroid_dataset[col_Names] = numeric_part

# Show the resulting dtypes.
thyroid_dataset.dtypes



In [None]:
thyroid_dataset.isnull().sum()

In [None]:
# Columns where null values exist
thyroid_dataset.columns[thyroid_dataset.isnull().sum() > 0]

In [None]:
# Fill the missing values one column at a time.
#   * Columns whose observed values are only 0/1 (e.g. the encoded 'sex'
#     column) are filled with the most frequent value; a mean would produce
#     fractional values that are not valid category codes.
#   * Every other column is filled with its mean, as before.
# NOTE(review): the fill statistics are computed on the full dataset before the
# train/test split, so test rows influence the imputed values. Consider moving
# imputation into a pipeline that is fitted on the training split only.
na_cols = thyroid_dataset.columns[thyroid_dataset.isnull().sum() > 0]
for col_name in na_cols:
    observed = thyroid_dataset[col_name].dropna()
    # An entirely empty column is not treated as binary.
    is_binary = (not observed.empty) and bool(observed.isin([0, 1]).all())
    imputer = SimpleImputer(strategy='most_frequent' if is_binary else 'mean')
    filled = imputer.fit_transform(thyroid_dataset[[col_name]])
    # fit_transform returns a single-column 2-D result; flatten it before
    # assigning it back to the column.
    thyroid_dataset[col_name] = np.ravel(filled)

In [None]:
# thyroid_dataset[(thyroid_dataset['sex'] > 0.0) & (thyroid_dataset['sex'] < 1.0)]

In [None]:
thyroid_dataset

In [None]:
thyroid_dataset.columns

In [None]:
# Separate target and features by column name. Selecting "every column except
# the last" only works while 'Labels' happens to be the final column; dropping
# it by name cannot leak the target into X or discard a real feature if the
# column order changes.
y = thyroid_dataset['Labels']
X = thyroid_dataset.drop(columns=['Labels'])

In [None]:
# Exploratory OLS fit of the label on all features.
# The intercept column is added to a separate design matrix so that X itself
# stays untouched; otherwise the extra 'const' column would be carried into the
# train/test split and fed to every classifier as a feature.
# NOTE(review): y holds the 0/1-encoded label, so this is a linear fit on a
# binary target; a logistic model may be the more appropriate diagnostic.
X_with_const = sm.add_constant(X)
results = sm.OLS(y, X_with_const).fit()
results.summary()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Using standard Scaler to scale the values uniformly
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Shape of the scaled training feature matrix.
X_train.shape

In [None]:
# Shape of the scaled test feature matrix.
X_test.shape

In [None]:
# Classifiers compared in this notebook:
#   1. Logistic Regression
#   2. KNN
#   3. Kernel SVM
#   4. Random Forest
#   5. XGBoost
# (Naive Bayes appeared in the original list but is not trained here.)
models_to_train = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    # Seeded so that repeated runs build the same forest; 42 matches the seed
    # used for the train/test split.
    'RandomForest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(learning_rate=0.01)
}

# Fit every model on the scaled training split. Only the estimators are needed
# here, so iterate over the values rather than binding the unused key.
for mod in models_to_train.values():
    mod.fit(X_train, y_train)


In [None]:
# Report test-set accuracy (as a percentage) for each fitted model.
# The key is bound to a descriptive name: `_` conventionally marks an unused
# value and, in IPython, refers to the previous cell output, so it should not
# be used as a real variable.
for model_name, mod in models_to_train.items():
    print(f"Accuracy Score for {model_name} is : ", mod.score(X_test, y_test) * 100, "%")

In [None]:
# model = XGBClassifier(learning_rate=0.01).fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"Accuracy Score for xgboost is : ",model.score(X_test,y_test)*100,"%")