In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
df = pd.read_csv("Thyroid_Diff.csv")

In [4]:
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [6]:
file_path = 'Thyroid_Diff.csv'
data = pd.read_csv(file_path)

# Display the columns of the dataset to find the target column name
print(data.columns)



Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')


In [7]:
# Step 1: Handle missing values
# Fill missing values with the median for numerical features and the most frequent value for categorical features
for column in data.columns:
    if pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].fillna(data[column].median())
    else:
        data[column] = data[column].fillna(data[column].mode()[0])


In [8]:
# Step 2: Encode categorical variables
# Convert categorical variables to numeric using Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in data.columns:
    if data[column].dtype == 'object':
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

In [9]:
# Step 3: Scale numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_columns = data.select_dtypes(include=[np.number]).columns
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Display the first few rows of the preprocessed dataset
print(data.head())

        Age    Gender   Smoking  Hx Smoking  Hx Radiothreapy  \
0 -0.917439 -0.477037 -0.383023   -0.280844        -0.136444   
1 -0.454315 -0.477037 -0.383023    3.560698        -0.136444   
2 -0.718957 -0.477037 -0.383023   -0.280844        -0.136444   
3  1.398184 -0.477037 -0.383023   -0.280844        -0.136444   
4  1.398184 -0.477037 -0.383023   -0.280844        -0.136444   

   Thyroid Function  Physical Examination  Adenopathy  Pathology  Focality  \
0          0.078732              0.325319    0.064684  -0.619635  0.742029   
1          0.078732             -1.157980    0.064684  -0.619635  0.742029   
2          0.078732              1.066968    0.064684  -0.619635  0.742029   
3          0.078732              1.066968    0.064684  -0.619635  0.742029   
4          0.078732             -1.157980    0.064684  -0.619635 -1.347656   

       Risk       T         N        M     Stage  Response  Recurred  
0  0.674696 -1.6429 -0.633987 -0.22207 -0.314426  0.464420  -0.62668  
1  0

In [10]:
# Step 4: Identify and set the target column
target_column = 'Recurred'  # Change this if your target column has a different name
if target_column not in data.columns:
    raise ValueError(f"The specified target column '{target_column}' is not found in the dataset. Please update the target column name.")

X = data.drop(target_column, axis=1)
y_continuous = data[target_column]


In [11]:
# Step 5: Convert continuous target to categorical
bins = [-float('inf'), 0.5, 1.5, float('inf')]  # Define bins as per your data
labels = ['low', 'medium', 'high']  # Define labels for the bins
y_categorical = pd.cut(y_continuous, bins=bins, labels=labels)

In [12]:
# Step 6: Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

In [13]:
# Step 7: Train a classifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [14]:
# Step 8: Make predictions on the test set
y_pred = model.predict(X_test)

In [17]:
# Step 9: Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the preprocessed data and predictions to disk (optional)
#X_train.to_csv('/kaggle/working/X_train.csv', index=False)
#X_test.to_csv('/kaggle/working/X_test.csv', index=False)
#y_train.to_csv('/kaggle/working/y_train.csv', index=False)
#y_test.to_csv('/kaggle/working/y_test.csv', index=False)
#pd.DataFrame(y_pred, columns=['predictions'])

print("Preprocessing, training, and prediction complete. Data and predictions have been saved.")


Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

        high       0.95      0.95      0.95        19
         low       0.98      0.98      0.98        58

    accuracy                           0.97        77
   macro avg       0.97      0.97      0.97        77
weighted avg       0.97      0.97      0.97        77

Preprocessing, training, and prediction complete. Data and predictions have been saved.
