In [5]:
import pandas as pd

In [6]:
# Read the thyroid dataset from the Kaggle input directory into a DataFrame.
csv_path = "/kaggle/input/thyroid-disease-data/Thyroid_Diff.csv"
df = pd.read_csv(csv_path)

In [7]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [8]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [10]:
# Replace the terse TNM staging abbreviations with descriptive column names.
# Reassigning the result (instead of inplace=True) keeps the step chainable;
# re-running the cell is harmless because rename() ignores keys that are
# no longer present.
df = df.rename(columns={
    'T': 'Tumor Size',
    'N': 'Lymph Node Involvement',
    'M': 'Metastasis',
    'Stage': 'Cancer Stage',
})

In [11]:
# Inspect the dtype of every column; the renamed columns appear under their new names.
df.dtypes

Age                        int64
Gender                    object
Smoking                   object
Hx Smoking                object
Hx Radiothreapy           object
Thyroid Function          object
Physical Examination      object
Adenopathy                object
Pathology                 object
Focality                  object
Risk                      object
Tumor Size                object
Lymph Node Involvement    object
Metastasis                object
Cancer Stage              object
Response                  object
Recurred                  object
dtype: object

In [12]:
# List the distinct values found in every column, one column per printed entry
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name}: {distinct_values}")


Age: [27 34 30 62 52 41 46 51 40 75 59 49 50 76 42 44 43 36 70 60 33 26 37 55
 31 45 20 38 29 25 21 23 24 35 54 22 69 28 17 73 18 39 57 66 32 47 56 63
 19 67 72 61 68 48 81 53 58 80 79 65 15 82 71 64 78]
Gender: ['F' 'M']
Smoking: ['No' 'Yes']
Hx Smoking: ['No' 'Yes']
Hx Radiothreapy: ['No' 'Yes']
Thyroid Function: ['Euthyroid' 'Clinical Hyperthyroidism' 'Clinical Hypothyroidism'
 'Subclinical Hyperthyroidism' 'Subclinical Hypothyroidism']
Physical Examination: ['Single nodular goiter-left' 'Multinodular goiter'
 'Single nodular goiter-right' 'Normal' 'Diffuse goiter']
Adenopathy: ['No' 'Right' 'Extensive' 'Left' 'Bilateral' 'Posterior']
Pathology: ['Micropapillary' 'Papillary' 'Follicular' 'Hurthel cell']
Focality: ['Uni-Focal' 'Multi-Focal']
Risk: ['Low' 'Intermediate' 'High']
Tumor Size: ['T1a' 'T1b' 'T2' 'T3a' 'T3b' 'T4a' 'T4b']
Lymph Node Involvement: ['N0' 'N1b' 'N1a']
Metastasis: ['M0' 'M1']
Cancer Stage: ['I' 'II' 'IVB' 'III' 'IVA']
Response: ['Indeterminate' 'Excellent' 'Struc

In [13]:
from sklearn.preprocessing import LabelEncoder


In [14]:
# Keep only the object-dtype (string) columns; numeric ones such as 'Age' are left out
object_frame = df.select_dtypes(include=['object'])
categorical_columns = object_frame.columns

In [15]:
# Display the object-dtype column names selected from df.
categorical_columns

Index(['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'Tumor Size', 'Lymph Node Involvement',
       'Metastasis', 'Cancer Stage', 'Response', 'Recurred'],
      dtype='object')

In [17]:
# Partition the categorical columns by cardinality:
#   exactly two distinct values   -> label encoding
#   more than two distinct values -> one-hot encoding
label_encoding_columns = []
one_hot_encoding_columns = []
for column_name in categorical_columns:
    level_count = df[column_name].nunique()
    if level_count == 2:
        label_encoding_columns.append(column_name)
    elif level_count > 2:
        one_hot_encoding_columns.append(column_name)

In [18]:
# Columns with exactly two distinct values; these get label-encoded.
label_encoding_columns

['Gender',
 'Smoking',
 'Hx Smoking',
 'Hx Radiothreapy',
 'Focality',
 'Metastasis',
 'Recurred']

In [19]:
# Columns with more than two distinct values; these get one-hot encoded.
one_hot_encoding_columns

['Thyroid Function',
 'Physical Examination',
 'Adenopathy',
 'Pathology',
 'Risk',
 'Tumor Size',
 'Lymph Node Involvement',
 'Cancer Stage',
 'Response']

In [20]:
# Apply Label Encoding
# Fit a separate encoder for each binary column and keep it in `label_encoders`,
# so every integer code can later be mapped back to its original label
# (e.g. label_encoders['Gender'].classes_ or .inverse_transform(...)).
# Reusing a single encoder would overwrite the fitted mapping on each iteration.
label_encoders = {}
for col in label_encoding_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [21]:
# Preview the first five rows after label encoding the binary columns.
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,Tumor Size,Lymph Node Involvement,Metastasis,Cancer Stage,Response,Recurred
0,27,0,0,0,0,Euthyroid,Single nodular goiter-left,No,Micropapillary,1,Low,T1a,N0,0,I,Indeterminate,0
1,34,0,0,1,0,Euthyroid,Multinodular goiter,No,Micropapillary,1,Low,T1a,N0,0,I,Excellent,0
2,30,0,0,0,0,Euthyroid,Single nodular goiter-right,No,Micropapillary,1,Low,T1a,N0,0,I,Excellent,0
3,62,0,0,0,0,Euthyroid,Single nodular goiter-right,No,Micropapillary,1,Low,T1a,N0,0,I,Excellent,0
4,62,0,0,0,0,Euthyroid,Multinodular goiter,No,Micropapillary,0,Low,T1a,N0,0,I,Excellent,0


In [23]:
# Apply One-Hot Encoding to the multi-level columns.
# drop_first=True omits the first level of each column, which then acts as the baseline.
df = pd.get_dummies(
    data=df,
    columns=one_hot_encoding_columns,
    drop_first=True,
)

In [24]:
# Preview the first five rows of the fully encoded frame.
df.head(n=5)

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Focality,Metastasis,Recurred,Thyroid Function_Clinical Hypothyroidism,Thyroid Function_Euthyroid,...,Tumor Size_T4b,Lymph Node Involvement_N1a,Lymph Node Involvement_N1b,Cancer Stage_II,Cancer Stage_III,Cancer Stage_IVA,Cancer Stage_IVB,Response_Excellent,Response_Indeterminate,Response_Structural Incomplete
0,27,0,0,0,0,1,0,0,False,True,...,False,False,False,False,False,False,False,False,True,False
1,34,0,0,1,0,1,0,0,False,True,...,False,False,False,False,False,False,False,True,False,False
2,30,0,0,0,0,1,0,0,False,True,...,False,False,False,False,False,False,False,True,False,False
3,62,0,0,0,0,1,0,0,False,True,...,False,False,False,False,False,False,False,True,False,False
4,62,0,0,0,0,0,0,0,False,True,...,False,False,False,False,False,False,False,True,False,False


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.metrics import accuracy_score

In [26]:
# Step 1: Pull out the prediction target and keep every other column as a feature
target_name = 'Recurred'
y = df[target_name]               # Target column
X = df.drop(target_name, axis=1)  # Feature matrix without the target


In [29]:
# Step 2: Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [30]:
# Name the numeric feature columns; every other feature column counts as categorical
numerical_columns = ['Age']  # Extend this list if more numeric columns are added
numeric_lookup = set(numerical_columns)
categorical_columns = [name for name in X.columns if name not in numeric_lookup]

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [33]:
# Standardise only the numeric columns; all remaining (already encoded)
# columns are passed through unchanged.
numeric_scaling_step = ('scaler', StandardScaler(), numerical_columns)
ct = ColumnTransformer(
    transformers=[numeric_scaling_step],
    remainder='passthrough',
)

In [34]:

# Fit the transformer on the training split only, then apply that already-fitted
# transformer to the test split.
X_train_scaled = ct.fit_transform(X_train)
X_test_scaled = ct.transform(X_test)


In [35]:
# Fit a Random Forest classifier (seeded for repeatable results) on the scaled training data
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train_scaled, y=y_train)


In [36]:
# Predict on the held-out test split and report plain accuracy
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 98.70%


In [37]:
from xgboost import XGBClassifier

In [39]:

# Train an XGBoost classifier on the scaled training data.
# `use_label_encoder` is deliberately not passed: the option is deprecated and
# recent XGBoost releases no longer accept it (it only triggers a
# "parameters are not used" warning). The target is already encoded as 0/1.
model = XGBClassifier(random_state=42, eval_metric='logloss')
model.fit(X_train_scaled, y_train)

In [40]:
# Predict on the held-out test split with the XGBoost model and report plain accuracy
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"XGBoost Model Accuracy: {accuracy * 100:.2f}%")

XGBoost Model Accuracy: 97.40%
