# 1. Import necessary Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV


# 2. Import Data from Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV stored there can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the thyroid dataset CSV inside the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/thyroiddataset/dataset.csv'

# pandas is already imported as `pd` in the imports cell at the top of the
# notebook, so it is not re-imported here.
data = pd.read_csv(DATA_PATH)

# 3. Data Preprocessing

### Encode Categorical data:

In [None]:
# Encode categorical variables
# Each listed column is replaced in `data` by integer codes.
categorical_cols = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response']

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# overwrite its `classes_` on every iteration, so only the last column's
# category -> code mapping would survive and the other columns could not be
# decoded (e.g. with label_encoders[col].inverse_transform(...)).
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column ('Response').

# 4. Split into training and test data

In [None]:
# Separate the label column from the predictor columns.
target_column = 'Recurred'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Split into training and testing sets
# 80/20 split with a fixed seed for reproducibility. stratify=y keeps the class
# proportions of the target the same in both splits, so the held-out metrics
# are not distorted by a test set whose class balance drifted by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Feature scaling

In [None]:
# Feature scaling
# Create the (still unfitted) StandardScaler; it is fitted on the training
# split in the next cell.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits so no test-set information is used
# when fitting the scaler.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 5. Train the model & Hyperparameter Tuning

In [None]:
# Create a Random Forest classifier

# Base estimator handed to the hyperparameter search below; the seed is fixed at 42.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Hyperparameter tuning using GridSearchCV
# Candidate values per hyperparameter; the search evaluates every combination
# (3 * 3 * 3 * 3 = 81 settings).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search over the grid. n_jobs=-1 runs the individual
# fits in parallel on all available cores; because rf_model is seeded, the
# selected parameters do not depend on the degree of parallelism.
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# Build the final model from the winning parameters AND the same seed as
# rf_model. best_params only contains the keys of param_grid, so without
# passing random_state explicitly the final forest would be unseeded: it would
# differ from the configuration the search scored and give different metrics
# on every run.
best_rf_model = RandomForestClassifier(random_state=42, **best_params)

In [None]:
# Train the best model on the entire training set
# (the estimator built from the grid search's best parameters, fitted on the
# scaled training features)
best_rf_model.fit(X_train, y_train)

# 6. Make predictions

In [None]:
# Make predictions on the testing set
# These class predictions feed the accuracy / precision / recall / F1 metrics
# in the evaluation section.
y_pred = best_rf_model.predict(X_test)

# 7. Model Evaluation

In [None]:
# Per-class probabilities for the test set; column 1 is used as the
# positive-class score for the ROC computation.
class_probabilities = best_rf_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Label-based metrics: 'Yes' is treated as the positive class.
positive = {'pos_label': 'Yes'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **positive)
recall = recall_score(y_test, y_pred, **positive)
f1 = f1_score(y_test, y_pred, **positive)

# AUC is computed from the probabilities rather than the hard predictions.
auc = roc_auc_score(y_test, y_pred_proba)

In [None]:
# Report every metric on its own line as "<name>: <value>".
metric_report = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("AUC-ROC:", auc),
)
for label, value in metric_report:
    print(label, value)

Accuracy: 0.987012987012987
Precision: 1.0
Recall: 0.9473684210526315
F1-score: 0.972972972972973
AUC-ROC: 0.9927404718693285


In [None]:
# You can customize this code further by experimenting with different algorithms,
# feature engineering techniques, and evaluation metrics to suit your specific project requirements.