In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("noorsaeed/medicine-recommendation-system-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/whiskey/.cache/kagglehub/datasets/noorsaeed/medicine-recommendation-system-dataset/versions/1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('Training.csv')
sym_des = pd.read_csv('symtoms_df.csv')
precautions = pd.read_csv('precautions_df.csv')
workout = pd.read_csv('workout_df.csv')
description = pd.read_csv('description.csv')
medications = pd.read_csv('medications.csv')
diets = pd.read_csv('diets.csv')

In [5]:
df.head


<bound method NDFrame.head of       itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  \
0           1          1                     1                    0   
1           0          1                     1                    0   
2           1          0                     1                    0   
3           1          1                     0                    0   
4           1          1                     1                    0   
...       ...        ...                   ...                  ...   
4915        0          0                     0                    0   
4916        0          1                     0                    0   
4917        0          0                     0                    0   
4918        0          1                     0                    0   
4919        0          1                     0                    0   

      shivering  chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  \
0             0       0           0  

In [6]:
df.tail


<bound method NDFrame.tail of       itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  \
0           1          1                     1                    0   
1           0          1                     1                    0   
2           1          0                     1                    0   
3           1          1                     0                    0   
4           1          1                     1                    0   
...       ...        ...                   ...                  ...   
4915        0          0                     0                    0   
4916        0          1                     0                    0   
4917        0          0                     0                    0   
4918        0          1                     0                    0   
4919        0          1                     0                    0   

      shivering  chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  \
0             0       0           0  

In [7]:
df.shape

(4920, 133)

In [8]:
df.prognosis.unique()

array(['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis',
       'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ',
       'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine',
       'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice',
       'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A',
       'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E',
       'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia',
       'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo) Paroymsal  Positional Vertigo', 'Acne',
       'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object)

In [9]:
len(df.prognosis.unique())

41

In [10]:
df.prognosis.isnull().sum()

np.int64(0)

In [11]:
X = df.drop("prognosis" , axis = 1)
y = df.prognosis


In [12]:
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)
y

array([15, 15, 15, ..., 38, 35, 27])

In [13]:
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB


In [14]:
# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=5)
kmeans_labels = kmeans.fit_predict(X)

# Add the cluster label as a new feature
X_with_clusters = X.copy()
X_with_clusters = np.column_stack((X_with_clusters, kmeans_labels))  # New feature for cluster labels


In [15]:
X_train , X_valid , y_train , y_valid = train_test_split(X , y , random_state = 5 , test_size = 0.2 , train_size = 0.8)

In [16]:
models = {
      'SVC' : SVC(kernel='linear'),
      'Random Forest' :RandomForestClassifier(random_state=5,n_estimators=100),
      'KNeighbors' : KNeighborsClassifier(n_neighbors=5),
      'Gradient Boosting': GradientBoostingClassifier(random_state=5,n_estimators=100),
      'MultinomialNB' :MultinomialNB() }

In [17]:
from sklearn.metrics import accuracy_score , confusion_matrix , mean_absolute_error

In [18]:
for model in models.values() :
    model.fit(X_train , y_train)
    preds = model.predict(X_valid)
    accuracy = accuracy_score(y_valid , preds)
    cm = confusion_matrix(y_valid , preds)
    print(accuracy)
    print(np.array2string(cm , separator = ","))
    print("------------------------------")
    

1.0
[[17, 0, 0,..., 0, 0, 0],
 [ 0,23, 0,..., 0, 0, 0],
 [ 0, 0,23,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,26, 0, 0],
 [ 0, 0, 0,..., 0,28, 0],
 [ 0, 0, 0,..., 0, 0,29]]
------------------------------
1.0
[[17, 0, 0,..., 0, 0, 0],
 [ 0,23, 0,..., 0, 0, 0],
 [ 0, 0,23,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,26, 0, 0],
 [ 0, 0, 0,..., 0,28, 0],
 [ 0, 0, 0,..., 0, 0,29]]
------------------------------
1.0
[[17, 0, 0,..., 0, 0, 0],
 [ 0,23, 0,..., 0, 0, 0],
 [ 0, 0,23,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,26, 0, 0],
 [ 0, 0, 0,..., 0,28, 0],
 [ 0, 0, 0,..., 0, 0,29]]
------------------------------
1.0
[[17, 0, 0,..., 0, 0, 0],
 [ 0,23, 0,..., 0, 0, 0],
 [ 0, 0,23,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,26, 0, 0],
 [ 0, 0, 0,..., 0,28, 0],
 [ 0, 0, 0,..., 0, 0,29]]
------------------------------
1.0
[[17, 0, 0,..., 0, 0, 0],
 [ 0,23, 0,..., 0, 0, 0],
 [ 0, 0,23,..., 0, 0, 0],
 ...,
 [ 0, 0, 0,...,26, 0, 0],
 [ 0, 0, 0,..., 0,28, 0],
 [ 0, 0, 0,..., 0, 0,29]]
------------------------------


In [19]:
rf_model = RandomForestClassifier(n_estimators = 100)
rf_model.fit(X_train , y_train)
pred = rf_model.predict(X_valid)
acc = accuracy_score(pred , y_valid)
acc

1.0

In [20]:
import pickle
file_path = "forest_model.pkl"
model_file = open(file_path , "wb")
pickle.dump(rf_model , model_file)
model_file.close()

In [21]:
import pickle

# Load the saved SVC model
with open('forest_model.pkl', 'rb') as file:
    loaded_svc_model = pickle.load(file)

print("SVC model loaded successfully.")


SVC model loaded successfully.
