<a href="https://colab.research.google.com/github/NikhilMamilla/NikhilMamilla/blob/main/ThyroidDisease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***THYROID DISEASE DATA***

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from 'Thyroid_Diff.csv'; the relative path is resolved
# against the current working directory of the kernel.
df = pd.read_csv(filepath_or_buffer='Thyroid_Diff.csv')

In [None]:
# Preview the top of the frame (DataFrame.head() defaults to five rows)
print("First few rows of the DataFrame:", df.head(), sep="\n")

First few rows of the DataFrame:
   Age Gender Smoking Hx Smoking Hx Radiothreapy Thyroid Function  \
0   27      F      No         No              No        Euthyroid   
1   34      F      No        Yes              No        Euthyroid   
2   30      F      No         No              No        Euthyroid   
3   62      F      No         No              No        Euthyroid   
4   62      F      No         No              No        Euthyroid   

          Physical Examination Adenopathy       Pathology     Focality Risk  \
0   Single nodular goiter-left         No  Micropapillary    Uni-Focal  Low   
1          Multinodular goiter         No  Micropapillary    Uni-Focal  Low   
2  Single nodular goiter-right         No  Micropapillary    Uni-Focal  Low   
3  Single nodular goiter-right         No  Micropapillary    Uni-Focal  Low   
4          Multinodular goiter         No  Micropapillary  Multi-Focal  Low   

     T   N   M Stage       Response Recurred  
0  T1a  N0  M0     I  Indeterm

In [None]:
# Preview the bottom of the frame (DataFrame.tail() defaults to five rows)
print("\nLast few rows of the DataFrame:", df.tail(), sep="\n")


Last few rows of the DataFrame:
     Age Gender Smoking Hx Smoking Hx Radiothreapy          Thyroid Function  \
378   72      M     Yes        Yes             Yes                 Euthyroid   
379   81      M     Yes         No             Yes                 Euthyroid   
380   72      M     Yes        Yes              No                 Euthyroid   
381   61      M     Yes        Yes             Yes  Clinical Hyperthyroidism   
382   67      M     Yes         No              No                 Euthyroid   

            Physical Examination Adenopathy     Pathology     Focality  Risk  \
378  Single nodular goiter-right      Right     Papillary    Uni-Focal  High   
379          Multinodular goiter  Extensive     Papillary  Multi-Focal  High   
380          Multinodular goiter  Bilateral     Papillary  Multi-Focal  High   
381          Multinodular goiter  Extensive  Hurthel cell  Multi-Focal  High   
382          Multinodular goiter  Bilateral     Papillary  Multi-Focal  High   

     

In [None]:
# Report the frame dimensions as a (rows, columns) tuple
print(f"\nShape of the DataFrame:\n{df.shape}")


Shape of the DataFrame:
(383, 17)


In [None]:
# List the column labels of the frame
print("\nColumns of the DataFrame:", df.columns, sep="\n")


Columns of the DataFrame:
Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')


In [None]:
# Count rows that are exact repeats of an earlier row (all columns compared)
print("\nNumber of duplicated rows in the DataFrame:", df.duplicated().sum(), sep="\n")


Number of duplicated rows in the DataFrame:
19


In [None]:
# Keep the first occurrence of each fully identical row and discard the rest.
# The original index labels are retained (no re-numbering).
# NOTE(review): the columns contain no patient identifier, so two different
# patients with identical values in every column would also be collapsed into
# one row here - confirm this is intended.
df = df.drop_duplicates(keep='first', ignore_index=False)

In [None]:
# Missing-value audit.
# This cell only reports the number of nulls per column; no imputation or
# row-dropping step is included because the data is not expected to contain
# missing values (the counts printed below are the check for that).
print("\nMissing values in the DataFrame:", df.isna().sum(), sep="\n")


Missing values in the DataFrame:
Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64


In [None]:
# Information about the DataFrame (index range, column dtypes, non-null counts)
print("\nInformation about the DataFrame:")
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must not be wrapped in print() - doing so appends a stray "None" line.
df.info()


Information about the DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 364 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   364 non-null    int64 
 1   Gender                364 non-null    object
 2   Smoking               364 non-null    object
 3   Hx Smoking            364 non-null    object
 4   Hx Radiothreapy       364 non-null    object
 5   Thyroid Function      364 non-null    object
 6   Physical Examination  364 non-null    object
 7   Adenopathy            364 non-null    object
 8   Pathology             364 non-null    object
 9   Focality              364 non-null    object
 10  Risk                  364 non-null    object
 11  T                     364 non-null    object
 12  N                     364 non-null    object
 13  M                     364 non-null    object
 14  Stage                 364 non-null    object
 15  Response   

In [None]:
# Descriptive statistics; describe() with default arguments summarises the
# numeric columns only.
print("\nSummary statistics of the DataFrame:", df.describe(), sep="\n")


Summary statistics of the DataFrame:
             Age
count  364.00000
mean    41.25000
std     15.31436
min     15.00000
25%     30.00000
50%     38.00000
75%     52.00000
max     82.00000


In [None]:
# One-hot encode the categorical predictors. Each listed column is replaced by
# one indicator column per category, named '<column>_<category>'.
# 'Age' and the target 'Recurred' are not in the list and pass through as-is.
object_columns = [
    'Gender',
    'Smoking',
    'Hx Smoking',
    'Hx Radiothreapy',
    'Thyroid Function',
    'Physical Examination',
    'Adenopathy',
    'Pathology',
    'Focality',
    'Risk',
    'T',
    'N',
    'M',
    'Stage',
    'Response',
]
df = pd.get_dummies(data=df, columns=object_columns)

In [None]:
# Replace the 'Recurred' target labels with integer class codes.
# The fitted encoder is kept so the codes can be mapped back to the original
# labels later (see label_encoder.classes_ for the code -> label order).
label_encoder = LabelEncoder()
df = df.assign(Recurred=label_encoder.fit_transform(df['Recurred']))

In [None]:
# Show the first rows of the encoded frame to verify the transformation
print("\nDataFrame after LabelEncoder:", df.head(), sep="\n")


DataFrame after LabelEncoder:
   Age  Recurred  Gender_F  Gender_M  Smoking_No  Smoking_Yes  Hx Smoking_No  \
0   27         0      True     False        True        False           True   
1   34         0      True     False        True        False          False   
2   30         0      True     False        True        False           True   
3   62         0      True     False        True        False           True   
4   62         0      True     False        True        False           True   

   Hx Smoking_Yes  Hx Radiothreapy_No  Hx Radiothreapy_Yes  ...   M_M1  \
0           False                True                False  ...  False   
1            True                True                False  ...  False   
2           False                True                False  ...  False   
3           False                True                False  ...  False   
4           False                True                False  ...  False   

   Stage_I  Stage_II  Stage_III  Stage_IVA 

In [None]:
# Show the last rows of the encoded frame to verify the transformation
print("\nDataFrame after LabelEncoder:", df.tail(), sep="\n")


DataFrame after LabelEncoder:
     Age  Recurred  Gender_F  Gender_M  Smoking_No  Smoking_Yes  \
378   72         1     False      True       False         True   
379   81         1     False      True       False         True   
380   72         1     False      True       False         True   
381   61         1     False      True       False         True   
382   67         1     False      True       False         True   

     Hx Smoking_No  Hx Smoking_Yes  Hx Radiothreapy_No  Hx Radiothreapy_Yes  \
378          False            True               False                 True   
379           True           False               False                 True   
380          False            True                True                False   
381          False            True               False                 True   
382           True           False                True                False   

     ...   M_M1  Stage_I  Stage_II  Stage_III  Stage_IVA  Stage_IVB  \
378  ...   True    F

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows as a
# test set. The split is stratified on the target so both partitions keep the
# same class proportions, and it is seeded so the partition is repeatable.
y = df['Recurred']
X = df.drop(columns='Recurred')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit a logistic regression with library-default settings on the training
# split, then predict labels for the held-out test split.
# NOTE(review): warnings are silenced globally earlier in this notebook, so a
# solver convergence warning raised here would go unseen - consider checking
# log_reg_model.n_iter_ against the iteration cap.
log_reg_model = LogisticRegression().fit(X_train, y_train)
y_pred_log_reg = log_reg_model.predict(X_test)

In [None]:
# Training a Decision Tree Classifier and predicting on the held-out test set.
# random_state is pinned (same seed as the train/test split) because the tree
# shuffles the candidate features at every split; without a seed, ties between
# equally good splits can be broken differently on each run, so the fitted
# tree and the metrics reported from it would not be reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Training a Support Vector Classifier and predicting on the held-out test set.
# SVC's default RBF kernel is distance-based, so the raw 'Age' column (values
# in the tens) would dominate the 0/1 indicator columns and the model would
# effectively ignore the categorical predictors. Standardising the features
# inside a pipeline puts them on a comparable scale; the scaler is fitted on
# the training split only, so no test-set statistics leak into the model.
# sv_model still exposes fit()/predict(); the underlying SVC is sv_model[-1].
sv_model = make_pipeline(StandardScaler(), SVC())
sv_model.fit(X_train, y_train)
y_pred_sv = sv_model.predict(X_test)

In [None]:
# Score the logistic regression predictions against the held-out labels:
# overall accuracy plus the per-class precision / recall / F1 text report.
accuracy_log_reg = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)
report_log_reg = classification_report(y_true=y_test, y_pred=y_pred_log_reg)
print(
    "\nLogistic Regression Classifier Results:",
    f"Accuracy: {accuracy_log_reg}",
    f"Classification Report:\n{report_log_reg}",
    sep="\n",
)


Logistic Regression Classifier Results:
Accuracy: 0.9452054794520548
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        51
           1       0.88      0.95      0.91        22

    accuracy                           0.95        73
   macro avg       0.93      0.95      0.94        73
weighted avg       0.95      0.95      0.95        73



In [None]:
# Score the decision tree predictions against the held-out labels:
# overall accuracy plus the per-class precision / recall / F1 text report.
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
report_dt = classification_report(y_true=y_test, y_pred=y_pred_dt)
print(
    "\nDecision Tree Classifier Results:",
    f"Accuracy: {accuracy_dt}",
    f"Classification Report:\n{report_dt}",
    sep="\n",
)


Decision Tree Classifier Results:
Accuracy: 0.9452054794520548
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        51
           1       0.88      0.95      0.91        22

    accuracy                           0.95        73
   macro avg       0.93      0.95      0.94        73
weighted avg       0.95      0.95      0.95        73



In [None]:
# Score the support vector classifier predictions against the held-out labels:
# overall accuracy plus the per-class precision / recall / F1 text report.
accuracy_sv = accuracy_score(y_true=y_test, y_pred=y_pred_sv)
report_sv = classification_report(y_true=y_test, y_pred=y_pred_sv)
print(
    "\nSupport Vector Classifier Results:",
    f"Accuracy: {accuracy_sv}",
    f"Classification Report:\n{report_sv}",
    sep="\n",
)


Support Vector Classifier Results:
Accuracy: 0.7397260273972602
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.98      0.84        51
           1       0.80      0.18      0.30        22

    accuracy                           0.74        73
   macro avg       0.77      0.58      0.57        73
weighted avg       0.75      0.74      0.68        73

