In [84]:
#Impoting modules 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


In [81]:
data = pd.read_excel("ayurvedic_data_with_outliers_missing.xlsx")

In [82]:
data.head()

Unnamed: 0,Symptom1,Symptom2,Symptom3,Product,Age,Gender,Duration of Symptoms (days),Severity,Dosage (mg),Frequency (times per day)
0,Stomach Pain,Sore Throat,Stomach Pain,Ginger,42,Male,5,Moderate,183,1.0
1,Hair Loss,Nausea,Anxiety,Triphala,50,Male,25,Moderate,229,3.0
2,Depression,Back Pain,Hair Loss,Tulsi,64,Male,16,Mild,842,3.0
3,Cold,Hair Loss,Weight Gain,Shatavari,57,Male,1,Mild,937,2.0
4,Back Pain,Cough,Depression,Bhringraj,60,Female,21,Severe,244,2.0


In [83]:
data.info() # Summary of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Symptom1                     250 non-null    object 
 1   Symptom2                     250 non-null    object 
 2   Symptom3                     250 non-null    object 
 3   Product                      250 non-null    object 
 4   Age                          250 non-null    int64  
 5   Gender                       250 non-null    object 
 6   Duration of Symptoms (days)  250 non-null    int64  
 7   Severity                     245 non-null    object 
 8   Dosage (mg)                  250 non-null    int64  
 9   Frequency (times per day)    245 non-null    float64
dtypes: float64(1), int64(3), object(6)
memory usage: 19.7+ KB


## Data Preprocessing

In [57]:
# Handling Missing Values
data['Severity'].fillna(data['Severity'].mode()[0], inplace=True)
data['Frequency (times per day)'].fillna(data['Frequency (times per day)'].mode()[0], inplace=True)


In [59]:
# Handling Outliers
Q1_age = data['Age'].quantile(0.25)
Q3_age = data['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
upper_limit_age = Q3_age + 1.5 * IQR_age
data['Age'] = np.where(data['Age'] > upper_limit_age, upper_limit_age, data['Age'])

In [60]:
Q1_dosage = data['Dosage (mg)'].quantile(0.25)
Q3_dosage = data['Dosage (mg)'].quantile(0.75)
IQR_dosage = Q3_dosage - Q1_dosage
upper_limit_dosage = Q3_dosage + 1.5 * IQR_dosage
data['Dosage (mg)'] = np.where(data['Dosage (mg)'] > upper_limit_dosage, upper_limit_dosage, data['Dosage (mg)'])


In [62]:
# Encoding Categorical Variables
data = pd.get_dummies(data, columns=['Symptom1', 'Symptom2', 'Symptom3', 'Gender', 'Severity'], drop_first=True)
le = LabelEncoder()
data['Product'] = le.fit_transform(data['Product'])


In [63]:
# Feature Selection
X = data.drop('Product', axis=1)
y = data['Product']
feature_names = X.columns

In [64]:
len(feature_names)

67

In [65]:
# Convert to numpy arrays
X = X.to_numpy()
y = y.to_numpy()

## Model Building and Evaluation

In [66]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

In [68]:
# Model Selection
best_model = None
best_accuracy = 0
best_model_name = ''

In [69]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {model_name}: {accuracy}')
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = model_name
        
        

Accuracy for RandomForest: 0.04


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for LogisticRegression: 0.06
Accuracy for KNN: 0.06
Accuracy for GradientBoosting: 0.02


In [70]:
print(f'Best Model: {best_model_name} with Accuracy: {best_accuracy}')

Best Model: LogisticRegression with Accuracy: 0.06


In [71]:
print(classification_report(y_test, best_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         2
           2       0.17      0.33      0.22         3
           3       0.25      1.00      0.40         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         4
          10       0.50      0.50      0.50         2
          11       0.00      0.00      0.00         7
          12       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         1
          18       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Example prediction

In [72]:
# Example prediction
example_symptoms = {
    'Age': [30],
    'Frequency (times per day)': [2],
    'Dosage (mg)': [100],
    'Symptom1_Headache': [1],
    'Symptom1_Fever': [0],
    'Symptom1_Cough': [0],
    'Symptom2_Nausea': [0],
    'Symptom2_Vomiting': [0],
    'Symptom2_None': [1],
    'Symptom3_Dizziness': [0],
    'Symptom3_Fatigue': [1],
    'Symptom3_None': [0],
    'Gender_Male': [0],
    'Gender_Female': [1],
    'Severity_Mild': [1],
    'Severity_Severe': [0]
}


In [76]:
example_df = pd.DataFrame(example_symptoms)

In [77]:
for col in feature_names:
    if col not in example_df.columns:
        example_df[col] = 0


example_df = example_df[feature_names]

In [78]:
example_df 

Unnamed: 0,Age,Duration of Symptoms (days),Dosage (mg),Frequency (times per day),Symptom1_Back Pain,Symptom1_Cold,Symptom1_Constipation,Symptom1_Cough,Symptom1_Depression,Symptom1_Diarrhea,...,Symptom3_Joint Pain,Symptom3_Nausea,Symptom3_Skin Rash,Symptom3_Sore Throat,Symptom3_Stomach Pain,Symptom3_Weight Gain,Symptom3_Weight Loss,Gender_Male,Severity_Moderate,Severity_Severe
0,30,0,100,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# Convert example to numpy array
example_df = example_df.to_numpy()
example_prediction = best_model.predict(example_df)
example_prediction = le.inverse_transform(example_prediction)
print(f'Predicted Product: {example_prediction[0]}')

Predicted Product: Brahmi
