# Drug Recommendation System

## Steps Taken For Project:
1. Imported all the libraries required.
2. Importing the datasets
3. Did required cleaning of datasets
4. Performed munging process (Data which having rating more than 7 was filtered)
5. ML models - Naive Bayes Model, Random Forest Model and SVC model (to check which was better fitting)
6. Ensembling model by HardVoting (All three accuracies were great, hence to get a combined result)
7. Recommending The Medicines based on condition

## Preparation Work

### Importing Required Libraries

In [None]:
# munging dataset
import numpy as np
import pandas as pd

# preparing for ml
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# making the model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# checking accuracy
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# for ensembling the models
from sklearn.ensemble import VotingClassifier

### Preparing Dataset

In [2]:
D = pd.read_csv('/workspaces/Secure-Pharmacist/Python_Model/Datasets/drugsComTrain_raw.csv')
D = pd.concat([D, pd.read_csv('/workspaces/Secure-Pharmacist/Python_Model/Datasets/drugsComTest_raw.csv')], axis=0)
D = D.drop(['uniqueID', 'review', 'date', 'usefulCount'], axis=1)
D.head()

Unnamed: 0,drugName,condition,rating
0,Omeprazole,Zollinger-Ellison Syndrome,10
1,Mirena,Birth Control,10
2,Levonorgestrel,Birth Control,10
3,Zoloft,Depression,9
4,Phentermine,Weight Loss,10


### Cleaning The Dataset

In [3]:
D.isna().sum()
D = D.dropna()

In [4]:
# print("Before removing unwanted data:")
print("Unique Drugs:",len(D.drugName.unique()))
print("Unique Conditions:",len(D.condition.unique()))

D = D[~D.condition.str.contains("span", na=False)]

print("After removing unwanted data:")
print("Unique Drugs:",len(D.drugName.unique()))
print("Unique Conditions:",len(D.condition.unique()))

D.condition.unique()

Unique Drugs: 3646
Unique Conditions: 855
After removing unwanted data:
Unique Drugs: 3646
Unique Conditions: 855


array(['Zollinger-Ellison Syndrome', 'Birth Control', 'Depression',
       'Weight Loss', 'Anxiety and Stress', 'Anxiety',
       'emale Infertility', 'Pain', 'Obesity', 'Osteoporosis',
       'Panic Disorde', 'Urinary Tract Infection',
       'Polycystic Ovary Syndrome', 'Muscle Spasm', 'Opiate Withdrawal',
       'Hypogonadism Male', 'Ovulation Induction', 'Hashimotos disease',
       'Bipolar Disorde', 'Opiate Dependence', 'Insomnia',
       'Underactive Thyroid', 'Breast Cance', 'High Blood Pressure',
       'ADHD', 'Benign Prostatic Hyperplasia', 'Alopecia',
       'Erectile Dysfunction', 'Generalized Anxiety Disorde',
       'Atrophic Vaginitis', 'Sexual Dysfunction SSRI Induced',
       'Obsessive Compulsive Disorde', 'High Cholesterol',
       'Hypothyroidism After Thyroid Removal',
       'Abnormal Uterine Bleeding', 'Renal Cell Carcinoma',
       'Non-Small Cell Lung Cance', 'ibromyalgia',
       'Photoaging of the Skin', 'Atrial Fibrillation',
       'Alzheimers Disease', 'S

### Munging Dataset

#### Segregation Of Required Data

In [5]:
D = D[D['rating'] >= 7]
D.head()

Unnamed: 0,drugName,condition,rating
0,Omeprazole,Zollinger-Ellison Syndrome,10
1,Mirena,Birth Control,10
2,Levonorgestrel,Birth Control,10
3,Zoloft,Depression,9
4,Phentermine,Weight Loss,10


#### Calculated Field (Rating)

In [6]:
AvgRating = D.groupby('drugName')['rating'].mean()
AvgRating = pd.DataFrame(AvgRating)
AvgRating.head()

Unnamed: 0_level_0,rating
drugName,Unnamed: 1_level_1
A + D Cracked Skin Relief,10.0
A / B Otic,10.0
Abacavir / dolutegravir / lamivudine,9.45614
Abacavir / lamivudine,10.0
Abacavir / lamivudine / zidovudine,9.0


In [7]:
MungedData = pd.merge(AvgRating, D[['drugName', 'condition']], on='drugName')
MungedData = MungedData.drop_duplicates(subset="drugName")
MungedData = MungedData.sort_values(["condition", "rating"], ascending = [True, False])

Rating = MungedData['rating']
Class = []
for i in Rating:
    if i>9:
        Class.append("A")
    elif i>8:
        Class.append("B")
    else:
        Class.append("C")
Class
MungedData['Class'] = Class
MungedData.head()

Unnamed: 0,drugName,rating,condition,Class
103948,ProCentra,10.0,ADHD,A
29720,Cylert,9.8,ADHD,A
97351,Pemoline,9.8,ADHD,A
32176,Desoxyn,9.666667,ADHD,A
133505,Zenzedi,9.571429,ADHD,A


In [8]:
MungedData.to_csv("/workspaces/Secure-Pharmacist/Python_Model/Datasets/DrugsMungedData.csv", index = False)

## Recommendation Model

### Basic Models

In [9]:
# preparing dataset for ML models
Data = pd.read_csv("/workspaces/Secure-Pharmacist/Python_Model/Datasets/DrugsMungedData.csv")
Data["drugName"] = Data["drugName"].astype('category')
Data["condition"] = Data["condition"].astype('category')
Data["Class"] = Data["Class"].astype('category')

ord_enc = OrdinalEncoder()
Data["condition"] = ord_enc.fit_transform(Data[["condition"]])
Data["drugName"] = ord_enc.fit_transform(Data[["drugName"]]) 

Data.head()

Unnamed: 0,drugName,rating,condition,Class
0,2497.0,10.0,0.0,A
1,807.0,9.8,0.0,A
2,2349.0,9.8,0.0,A
3,886.0,9.666667,0.0,A
4,3333.0,9.571429,0.0,A


In [10]:
# splitting dataset into train and test data
X = Data.drop(['Class'], axis=1)
Y = Data['Class']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

#### Naive Bayes Model

In [11]:
GNB = GaussianNB()
GNB.fit(X_train, Y_train)
Y_predNB = GNB.predict(X_test)

Y_predNB

array(['B', 'A', 'A', ..., 'B', 'A', 'A'], shape=(1699,), dtype='<U1')

In [12]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predNB)) # accuracy testing
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predNB))
print('ROC Accuracy Score for Random Forest Model: ', 
      roc_auc_score(Y_test, GNB.predict_proba(X_test), multi_class='ovr'))

Accuracy: 0.9535020600353149
Confusion Matrix:
 [[980  70   0]
 [  0 522   9]
 [  0   0 118]]
ROC Accuracy Score for Random Forest Model:  0.9981445146792257


#### Random Forest Model

In [13]:
randomForest = RandomForestClassifier(max_depth=2, random_state=0)
randomForest.fit(X_train, Y_train)
Y_predRF = randomForest.predict(X_test)

Y_predRF

array(['B', 'A', 'A', ..., 'B', 'A', 'A'], shape=(1699,), dtype=object)

In [14]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predRF))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predRF))
print('ROC Accuracy Score for Random Forest Model: ', 
      roc_auc_score(Y_test, randomForest.predict_proba(X_test), multi_class='ovr'))

Accuracy: 0.9305473808122425
Confusion Matrix:
 [[1050    0    0]
 [   0  531    0]
 [   0  118    0]]
ROC Accuracy Score for Random Forest Model:  1.0


#### SVC Model

In [15]:
SVC = make_pipeline(StandardScaler(), SVC(gamma='auto', probability=True))
SVC.fit(X_train, Y_train)
Y_predSVC = SVC.predict(X_test)

Y_predSVC

array(['B', 'A', 'A', ..., 'B', 'A', 'A'], shape=(1699,), dtype=object)

In [16]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predSVC))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predSVC))
print('ROC Accuracy Score for SVC Model: ', 
      roc_auc_score(Y_test, SVC.predict_proba(X_test), multi_class='ovr'))

Accuracy: 0.9552678045909359
Confusion Matrix:
 [[978  72   0]
 [  0 527   4]
 [  0   0 118]]
ROC Accuracy Score for SVC Model:  0.9996808410483823


### Models Being Ensembled

In [17]:
estimator = []
estimator.append(('GNB', GNB))
estimator.append(('RF', randomForest))
estimator.append(('SVC', SVC))

In [18]:
EnsembledVH = VotingClassifier(estimators = estimator, voting ='hard')
EnsembledVH.fit(X_train, Y_train)
Y_predEM = EnsembledVH.predict(X_test)

Y_predEM

array(['B', 'A', 'A', ..., 'B', 'A', 'A'], shape=(1699,), dtype=object)

In [19]:
print("Accuracy:",metrics.accuracy_score(Y_test, Y_predEM))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_predEM))

# One Hot Code Encoding
Y_changedDF = pd.DataFrame()
Y_changedDF['Y_predEM'] = Y_predEM
Y_OHENCdata = pd.get_dummies(Y_changedDF, columns = ['Y_predEM'])

print('ROC Accuracy Score for Ensembled Model: ', 
      roc_auc_score(Y_test, Y_OHENCdata, multi_class='ovr'))

Accuracy: 0.9582107121836374
Confusion Matrix:
 [[983  67   0]
 [  0 527   4]
 [  0   0 118]]
ROC Accuracy Score for Ensembled Model:  0.9781274102550953


In [20]:
Y = EnsembledVH.predict(X)

condition = input("Enter the condition: ").upper()
conditionsUpper = MungedData['condition'].str.upper()
match = conditionsUpper == condition
print("The Drugs for " + condition + " are: ")

print("\nMost Recommended:")
print(MungedData[match & (Y == 'A')]['drugName'])

print("\nSecond Most Recommended:")
print(MungedData[match & (Y == 'B' )]['drugName'])

print("\nLeast Recommended:")
print(MungedData[match & (Y == 'C')]['drugName'])

The Drugs for ADHD are: 

Most Recommended:
103948            ProCentra
29720                Cylert
97351              Pemoline
32176               Desoxyn
133505              Zenzedi
80531       Methamphetamine
30914              Daytrana
85364             Modafinil
4751         Adzenys XR-ODT
80833              Methylin
33209     Dextroamphetamine
39213           Dyanavel XR
108501           Ritalin LA
32889             Dexedrine
Name: drugName, dtype: object

Second Most Recommended:
8499      Amphetamine / dextroamphetamine
3938                             Adderall
72669                    Lisdexamfetamine
8467                          Amphetamine
4150                          Adderall XR
130689                            Vyvanse
33394                          Dextrostat
106935                      Quillivant XR
62342                              Kapvay
27606                            Concerta
60913                             Intuniv
54225                          Focalin XR
7973