<a href="https://colab.research.google.com/github/SithijaDeshan/Machine-Learning/blob/main/project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

# Suppress warnings
warnings.filterwarnings("ignore")


In [None]:
# Colab only: expose Google Drive under /content/drive so the CSV is reachable.
from google.colab import drive

drive.mount('/content/drive')

# Location of the raw disease/symptom table on the mounted drive.
dataset_path = '/content/drive/MyDrive/Doctor_specialty_prediction_system/Original_Dataset.csv'

# Load the raw table; later cells derive new frames from `dis_sym_data`.
dis_sym_data = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five records as the cell output.
dis_sym_data.head(5)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


Data Preprocessing

In [None]:
# Build the modelling frame: the target plus one 0/1 indicator column per symptom.
#
# The raw frame `dis_sym_data` is never modified here, so this cell can be
# re-run any number of times and always produces the same result.

# Every non-target column of the raw frame (kept under its original name).
columns_to_check = [col for col in dis_sym_data.columns if col != 'Disease']

# Pick the raw symptom columns by name instead of by position, so neither the
# target nor an extra leading column (e.g. a saved index) can be mistaken for
# symptom data.
symptom_columns = [col for col in columns_to_check if str(col).startswith('Symptom_')]
assert symptom_columns, "no 'Symptom_*' columns found in dis_sym_data"

# Long format: one entry per (row position, symptom). The index is reset so
# the index of `long_symptoms` is the positional row number used further down.
long_symptoms = (
    dis_sym_data[symptom_columns]
    .reset_index(drop=True)
    .melt(ignore_index=False, value_name='symptom')['symptom']
    .dropna()        # unused symptom slots are NaN in the raw table
    .astype(str)
    .str.strip()     # normalise padding so ' x' and 'x' become ONE feature
)
long_symptoms = long_symptoms[long_symptoms != '']

# Sorted vocabulary: the feature order is then identical on every kernel
# (iterating a set of strings is not), which matters for the persisted model.
symptoms = sorted(long_symptoms.unique())

# Fill the indicator matrix in one vectorised assignment:
# flags[row, column] = 1 wherever that row lists that symptom.
row_ids = long_symptoms.index.to_numpy()
col_ids = pd.Categorical(long_symptoms, categories=symptoms).codes
flags = np.zeros((len(dis_sym_data), len(symptoms)), dtype=int)
flags[row_ids, col_ids] = 1
symptom_flags = pd.DataFrame(flags, index=dis_sym_data.index, columns=symptoms)

# Target first, then the symptom indicators.
dis_sym_data_v1 = pd.concat([dis_sym_data[['Disease']], symptom_flags], axis=1)
assert dis_sym_data_v1.columns.is_unique, "duplicate feature columns after cleaning"

# Label encode the target variable 'Disease'; `le` is kept so predictions can
# be mapped back to disease names later.
le = LabelEncoder()
dis_sym_data_v1['Disease'] = le.fit_transform(dis_sym_data_v1['Disease'])


Split Data into Train and Test Sets

In [None]:
# Feature matrix (every column except the target) and target vector
X = dis_sym_data_v1.drop(columns="Disease")
y = dis_sym_data_v1['Disease']

# Hold out 20% of the rows for testing (80% train / 20% test).
# `stratify=y` keeps each disease in the same proportion in both splits, so
# per-class results in the confusion matrix are not skewed by the shuffle.
# `random_state` pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


 Train the Random Forest Model

In [None]:
# Hyper-parameters for the forest: shallow trees and a fixed seed so repeated
# runs build the same model.
rf_params = {"max_depth": 5, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training split; the fitted estimator is the cell's output.
rf_model.fit(X=X_train, y=y_train)


Evaluate the Model

In [None]:
# Predict the held-out rows once and derive both metrics from those predictions.
y_pred = rf_model.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report overall accuracy first, then the confusion matrix.
print(f'Random Forest - Test Accuracy: {accuracy:.3%}')
print('Confusion Matrix:\n', conf_matrix)


Random Forest - Test Accuracy: 94.309%
Confusion Matrix:
 [[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [None]:
# Persist the fitted artefacts to Google Drive so they can be reloaded later.
save_dir = '/content/drive/My Drive/Project2_saved work'

# Create the target folder on first use; joblib.dump does not create missing
# directories and would otherwise fail with FileNotFoundError.
os.makedirs(save_dir, exist_ok=True)

# Save the trained Random Forest model
joblib.dump(rf_model, os.path.join(save_dir, 'random_forest_model.pkl'))

# Save the label encoder (needed to turn predicted class ids back into names)
joblib.dump(le, os.path.join(save_dir, 'label_encoder.pkl'))


['/content/drive/My Drive/Project2_saved work/label_encoder.pkl']