# Drug Recommendation System

## Steps Taken For Project:
1. Imported all the libraries required.
2. Importing the datasets
3. Did required cleaning of datasets
4. Performed munging process (Data which having rating more than 7 was filtered)
5. ML models - Naive Bayes Model, Random Forest Model and SVC model (to check which was better fitting)
6. Ensembling model by HardVoting (All three accuracies were great, hence to get a combined result)
7. Recommending The Medicines based on condition

## Preparation Work

### Importing Required Libraries

In [1]:
# munging dataset
import numpy as np
import pandas as pd

# preparing for ml
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# making the model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# checking accuracy
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# for ensembling the models
from sklearn.ensemble import VotingClassifier

### Preparing Dataset

In [None]:
D = pd.read_csv('workspaces/Secure-Pharmacist/Python_Model/Datasets/drugsComTrain_raw.csv')
D.append(pd.read_csv('workspaces/Secure-Pharmacist/Python_Model/Datasets/drugsComTest_raw.csv'))
D = D.drop(['uniqueID', 'review', 'date', 'usefulCount'], axis=1)
D.head()

Unnamed: 0,drugName,condition,rating
0,Valsartan,Left Ventricular Dysfunction,9
1,Guanfacine,ADHD,8
2,Lybrel,Birth Control,5
3,Ortho Evra,Birth Control,8
4,Buprenorphine / naloxone,Opiate Dependence,9


### Cleaning The Dataset

In [3]:
D.isna().sum()
D = D.dropna()

In [4]:
# print("Before removing unwanted data:")
print("Unique Drugs:",len(D.drugName.unique()))
print("Unique Conditions:",len(D.condition.unique()))

D = D[~D.condition.str.contains("span", na=False)]

print("After removing unwanted data:")
print("Unique Drugs:",len(D.drugName.unique()))
print("Unique Conditions:",len(D.condition.unique()))

D.condition.unique()

Unique Drugs: 3431
Unique Conditions: 884
After removing unwanted data:
Unique Drugs: 3412
Unique Conditions: 811


array(['Left Ventricular Dysfunction', 'ADHD', 'Birth Control',
       'Opiate Dependence', 'Benign Prostatic Hyperplasia',
       'Emergency Contraception', 'Bipolar Disorde', 'Epilepsy',
       'Migraine Prevention', 'Depression', "Crohn's Disease", 'Cough',
       'Obesity', 'Urinary Tract Infection', 'ibromyalgia',
       'Chronic Myelogenous Leukemia', 'HIV Infection', 'Insomnia',
       'Rheumatoid Arthritis', 'Vaginal Yeast Infection',
       'Chlamydia Infection', 'Hirsutism', 'Panic Disorde', 'Migraine',
       'Pain', 'Irritable Bowel Syndrome', 'Osteoarthritis',
       'Constipation', 'Bowel Preparation', 'Psychosis', 'Muscle Spasm',
       'Hepatitis C', 'Overactive Bladde', 'Diabetes, Type 2',
       'Asthma, Maintenance', 'Non-Small Cell Lung Cance',
       'Schizophrenia', 'Dysuria', 'Smoking Cessation', 'Anxiety', 'Acne',
       'emale Infertility', 'Constipation, Acute',
       'Constipation, Drug Induced', 'Erectile Dysfunction',
       'Trigeminal Neuralgia', 'Undera

### Munging Dataset

#### Segregation Of Required Data

In [5]:
D = D[D['rating'] >= 7]
D.head()

Unnamed: 0,drugName,condition,rating
0,Valsartan,Left Ventricular Dysfunction,9
1,Guanfacine,ADHD,8
3,Ortho Evra,Birth Control,8
4,Buprenorphine / naloxone,Opiate Dependence,9
7,Aripiprazole,Bipolar Disorde,10


#### Calculated Field (Rating)

In [6]:
AvgRating = D.groupby('drugName')['rating'].mean()
AvgRating = pd.DataFrame(AvgRating)
AvgRating.head()

Unnamed: 0_level_0,rating
drugName,Unnamed: 1_level_1
A + D Cracked Skin Relief,10.0
A / B Otic,10.0
Abacavir / dolutegravir / lamivudine,9.45
Abacavir / lamivudine / zidovudine,9.0
Abatacept,9.0


In [7]:
MungedData = pd.merge(AvgRating, D[['drugName', 'condition']], on='drugName')
MungedData = MungedData.drop_duplicates(subset="drugName")
MungedData = MungedData.sort_values(["condition", "rating"], ascending = [True, False])

Rating = MungedData['rating']
Class = []
for i in Rating:
    if i>9:
        Class.append("A")
    elif i>8:
        Class.append("B")
    else:
        Class.append("C")
Class
MungedData['Class'] = Class
MungedData.head()

Unnamed: 0,drugName,rating,condition,Class
80827,ProCentra,10.0,ADHD,A
24993,Desoxyn,9.588235,ADHD,A
103783,Zenzedi,9.571429,ADHD,A
62682,Methamphetamine,9.52381,ADHD,A
23091,Cylert,9.5,ADHD,A


In [None]:
MungedData.to_csv("workspaces/Secure-Pharmacist/Python_Model/Datasets/DrugsMungedData.csv", index = False)

## Recommendation Model

### Basic Models

In [None]:
# preparing dataset for ML models
Data = pd.read_csv("workspaces/Secure-Pharmacist/Python_Model/Datasets/DrugsMungedData.csv")
Data["drugName"] = Data["drugName"].astype('category')
Data["condition"] = Data["condition"].astype('category')
Data["Class"] = Data["Class"].astype('category')

ord_enc = OrdinalEncoder()
Data["condition"] = ord_enc.fit_transform(Data[["condition"]])
Data["drugName"] = ord_enc.fit_transform(Data[["drugName"]]) 

Data.head()

Unnamed: 0,drugName,rating,condition,Class
0,2318.0,10.0,0.0,A
1,832.0,9.588235,0.0,A
2,3096.0,9.571429,0.0,A
3,1793.0,9.52381,0.0,A
4,756.0,9.5,0.0,A


In [10]:
# splitting dataset into train and test data
X = Data.drop(['Class'], axis=1)
Y = Data['Class']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

#### Naive Bayes Model

In [11]:
GNB = GaussianNB()
GNB.fit(X_train, Y_train)
Y_predNB = GNB.predict(X_test)

Y_predNB

array(['A', 'A', 'B', ..., 'A', 'B', 'A'], dtype='<U1')

In [12]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predNB)) # accuracy testing
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predNB))
print('ROC Accuracy Score for Random Forest Model: ', 
      roc_auc_score(Y_test, GNB.predict_proba(X_test), multi_class='ovr'))

Accuracy: 0.9689677010766308
Confusion Matrix:
 [[914  44   0]
 [  0 485   5]
 [  0   0 131]]
ROC Accuracy Score for Random Forest Model:  0.999288441531775


#### Random Forest Model

In [13]:
randomForest = RandomForestClassifier(max_depth=2, random_state=0)
randomForest.fit(X_train, Y_train)
Y_predRF = randomForest.predict(X_test)

Y_predRF

array(['A', 'A', 'B', ..., 'A', 'B', 'A'], dtype=object)

In [14]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predRF))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predRF))
print('ROC Accuracy Score for Random Forest Model: ', 
      roc_auc_score(Y_test, randomForest.predict_proba(X_test), multi_class='ovr'))

Accuracy: 0.9170360987967068
Confusion Matrix:
 [[958   0   0]
 [  0 490   0]
 [  0 131   0]]
ROC Accuracy Score for Random Forest Model:  0.9999968766202532


#### SVC Model

In [15]:
SVC = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
SVC.fit(X_train, Y_train)
Y_predSVC = SVC.predict(X_test)

Y_predSVC

array(['A', 'A', 'B', ..., 'A', 'B', 'A'], dtype=object)

In [16]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predSVC))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predSVC))
print('ROC Accuracy Score for SVC Model: ', 
      roc_auc_score(Y_test, SVC.predict_proba(X_test), multi_class='ovr'))

Accuracy: 0.9702343255224826
Confusion Matrix:
 [[913  45   0]
 [  0 488   2]
 [  0   0 131]]
ROC Accuracy Score for SVC Model:  0.9996339326359402


### Models Being Ensembled

In [17]:
estimator = []
estimator.append(('GNB', GNB))
estimator.append(('RF', randomForest))
estimator.append(('SVC', SVC))

In [18]:
EnsembledVH = VotingClassifier(estimators = estimator, voting ='hard')
EnsembledVH.fit(X_train, Y_train)
Y_predEM = EnsembledVH.predict(X_test)

Y_predEM

array(['A', 'A', 'B', ..., 'A', 'B', 'A'], dtype=object)

In [19]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predEM))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predEM))

# One Hot Code Encoding
Y_changedDF = pd.DataFrame()
Y_changedDF['Y_predEM'] = Y_predEM
Y_OHENCdata = pd.get_dummies(Y_changedDF, columns = ['Y_predEM'])

print('ROC Accuracy Score for Ensembled Model: ', 
      roc_auc_score(Y_test, Y_OHENCdata, multi_class='ovr'))

Accuracy: 0.9721342621912603
Confusion Matrix:
 [[916  42   0]
 [  0 488   2]
 [  0   0 131]]
ROC Accuracy Score for Ensembled Model:  0.9853547204412446


In [20]:
Y = EnsembledVH.predict(X)

condition = input("Enter the condition: ").upper()
conditionsUpper = MungedData['condition'].str.upper()
match = conditionsUpper == condition
print("The Drugs for " + condition + " are: ")

print("\nMost Recommended:")
print(MungedData[match & (Y == 'A')]['drugName'])

print("\nSecond Most Recommended:")
print(MungedData[match & (Y == 'B' )]['drugName'])

print("\nLeast Recommended:")
print(MungedData[match & (Y == 'C')]['drugName'])

Enter the condition: acne
The Drugs for ACNE are: 

Most Recommended:
429                                Absorica
3544                                  Adoxa
9105                                   Avar
9348                                  Avita
11475     Benzoyl peroxide / hydrocortisone
11477             Benzoyl peroxide / sulfur
19536                             Clindagel
32349                              Ery Pads
42284                      Fostex Medicated
44378                       Hexachlorophene
70523                          Norinyl 1+35
70759                                  NuOx
73444                                Oxy-10
74399         PanOxyl 10% Acne Foaming Wash
75952                                Pernox
83930                   Resorcinol / sulfur
85086                        Salicylic acid
90536                                Sulfur
74394                               PanOxyl
103776                             Zenatane
72232                               Onexton
11080 