In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
dataset = pd.read_csv('dis_sym_dataset_comb.csv')
dataset.shape

(8835, 490)

In [3]:
dataset.head()

Unnamed: 0,label_dis,abdominal cramp,abdominal distention,abnormal behavior,abnormal bleeding,abnormal sensation,abnormally frequent,abscess,aching,acne,...,wet,wheezing,white patch vaginal discharge,widespread pain,withdrawal occurring stopping,worrying,yellow skin,yellowish coloration skin white eye,yellowish skin,yellowish skin crust
0,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
dataset.isnull().sum()

label_dis                              0
abdominal cramp                        0
abdominal distention                   0
abnormal behavior                      0
abnormal bleeding                      0
                                      ..
worrying                               0
yellow skin                            0
yellowish coloration skin white eye    0
yellowish skin                         0
yellowish skin crust                   0
Length: 490, dtype: int64

In [5]:
# Drop every row that contains a missing value and rebind `dataset` to the result.
# The shape reported after this step, (8835, 490), equals the shape right after
# loading, so no rows were actually removed for this file.
dataset = dataset.dropna()
dataset.shape

(8835, 490)

In [6]:
dataset.label_dis.value_counts()

Myocardial Infarction (Heart Attack)    2047
Polycystic ovary syndrome (PCOS)         511
Anthrax                                  511
Porphyria                                255
Rabies                                   255
                                        ... 
Neoplasm                                   1
Burns                                      1
Fibroids                                   1
Taeniasis/cysticercosis                    1
Hypotonia                                  1
Name: label_dis, Length: 261, dtype: int64

In [7]:
# Keep only the rows belonging to the four diseases modelled below.
# Per the value_counts() output these are among the largest classes; note that
# 'Rabies' has the same count as 'Porphyria' but is not included.
top_diseases = [
    'Myocardial Infarction (Heart Attack)',
    'Polycystic ovary syndrome (PCOS)',
    'Anthrax',
    'Porphyria',
]
df_train = dataset[dataset['label_dis'].isin(top_diseases)]

In [8]:
df_train.shape

(3324, 490)

In [9]:
# Separate the symptom indicator columns (features) from the disease label (target).
X = df_train.drop(columns=['label_dis'])
y = df_train['label_dis']

In [10]:
X.shape

(3324, 489)

In [11]:
X.dtypes

abdominal cramp                        int64
abdominal distention                   int64
abnormal behavior                      int64
abnormal bleeding                      int64
abnormal sensation                     int64
                                       ...  
worrying                               int64
yellow skin                            int64
yellowish coloration skin white eye    int64
yellowish skin                         int64
yellowish skin crust                   int64
Length: 489, dtype: object

In [12]:
X.isna().sum()

abdominal cramp                        0
abdominal distention                   0
abnormal behavior                      0
abnormal bleeding                      0
abnormal sensation                     0
                                      ..
worrying                               0
yellow skin                            0
yellowish coloration skin white eye    0
yellowish skin                         0
yellowish skin crust                   0
Length: 489, dtype: int64

In [13]:
y.value_counts()


Myocardial Infarction (Heart Attack)    2047
Anthrax                                  511
Polycystic ovary syndrome (PCOS)         511
Porphyria                                255
Name: label_dis, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. Stratifying on y keeps the class proportions of the
# imbalanced label column the same in both partitions; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [15]:
len(x_train)

2659

In [16]:
len(x_test)

665

In [17]:
# Fit a Multinomial Naive Bayes classifier on the training symptoms and
# predict a disease label for every held-out row.
multinb_model = MultinomialNB().fit(x_train, y_train)
y_pred_multinb = multinb_model.predict(x_test)

In [25]:
# Macro-averaged scores for Multinomial Naive Bayes: each disease class
# contributes equally to the average, regardless of how many rows it has.
precision = precision_score(y_test, y_pred_multinb, average='macro')
recall = recall_score(y_test, y_pred_multinb, average='macro')
f1score = f1_score(y_test, y_pred_multinb, average='macro')

# Label every number so the three values cannot be confused in the output.
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1 score: ' + str(f1score))

Precision: 0.9939320388349515
Recall: 0.9852941176470589
0.9895016638149554


In [None]:
print(classification_report(y_test,y_pred_multinb))

In [None]:
df=pd.read_csv('drugsComTrain_raw.tsv', sep='\t')

In [None]:
df.to_csv('drugsComTrain_raw.csv',index=False)

In [None]:
df

In [None]:
def top_drugs_extractor(condition, n=3, min_rating=9, min_useful_count=100, reviews=None):
    """Return the drug names of the best-rated reviews for a condition.

    Reviews are kept when their ``rating`` and ``usefulCount`` meet the given
    minimums and their ``condition`` equals ``condition``. The survivors are
    ordered by ``rating`` and then ``usefulCount``, both descending, and the
    ``drugName`` of the first ``n`` rows is returned.

    Parameters
    ----------
    condition : value compared for equality against the ``condition`` column.
    n : int, default 3
        Maximum number of drug names to return.
    min_rating : default 9
        Inclusive lower bound on ``rating``.
    min_useful_count : default 100
        Inclusive lower bound on ``usefulCount``.
    reviews : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df`` is used,
        which matches the original behaviour.

    Returns
    -------
    list
        Up to ``n`` drug names, best first. The same drug can appear more than
        once when several of its reviews qualify. Empty when nothing matches.
    """
    source = df if reviews is None else reviews
    # Filter on the condition before sorting so only the relevant rows are sorted.
    keep = (
        (source['rating'] >= min_rating)
        & (source['usefulCount'] >= min_useful_count)
        & (source['condition'] == condition)
    )
    ranked = source.loc[keep].sort_values(by=['rating', 'usefulCount'], ascending=[False, False])
    return ranked['drugName'].head(n).tolist()

In [None]:
def predict_text(lst_text):
    """Predict a label for each input sentence and return them side by side.

    Builds a one-column frame (``test_sent``) from ``lst_text``, replaces that
    column with the output of ``review_to_words``, and adds a ``prediction``
    column holding ``pass_tf.predict`` applied to the TF-IDF features.

    NOTE(review): ``review_to_words``, ``tfidf_vectorizer3`` and ``pass_tf`` are
    not defined in the visible part of this notebook; they must exist in the
    kernel before this function is called.
    """
    result = pd.DataFrame(lst_text, columns = ['test_sent'])
    result["test_sent"] = result["test_sent"].apply(review_to_words)
    # NOTE(review): the vectorizer is fed the raw `lst_text`, not the cleaned
    # `test_sent` column computed just above - confirm this is intended.
    result['prediction'] = pass_tf.predict(tfidf_vectorizer3.transform(lst_text))
    return result

In [None]:
sentences = [
  "abnormal sensation,worrying and yellow skin"
  ]

In [None]:
# Plot one-vs-rest ROC curves for Multinomial Naive Bayes.
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# roc_curve only accepts a binary target and needs continuous scores, and
# multiclass roc_auc_score needs class probabilities rather than predicted
# labels. So the disease labels are binarised (one column per class) and the
# classifier's predicted probabilities are used as scores.
multinb_classes = multinb_model.classes_
y_test_onehot_multinb = label_binarize(y_test, classes=multinb_classes)
y_score_multinb = multinb_model.predict_proba(x_test)
roc_auc = roc_auc_score(
    y_test,
    y_score_multinb,
    multi_class='ovr',
    average='macro',
    labels=multinb_classes,
)

plt.figure(figsize=(6, 5))
for class_idx, class_name in enumerate(multinb_classes):
    # One curve per disease: this class versus all the others.
    fpr, tpr, _ = roc_curve(y_test_onehot_multinb[:, class_idx], y_score_multinb[:, class_idx])
    plt.plot(fpr, tpr, marker='.', label="%s, auc = %.2f" % (class_name, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("Multinomial Naive Bayes, macro one-vs-rest auc = %.2f" % roc_auc)
plt.legend()
plt.show()

In [None]:
import seaborn as sns

# Compute the matrix once, with an explicit class order, so the printed matrix
# and the heatmap agree and the heatmap ticks can show the disease names.
multinb_labels = multinb_model.classes_
cm_multinb = confusion_matrix(y_test, y_pred_multinb, labels=multinb_labels)
print(cm_multinb)

sns.heatmap(
    cm_multinb,
    annot=True,
    fmt='d',
    cmap="inferno",
    xticklabels=multinb_labels,
    yticklabels=multinb_labels,
)
plt.title('Confusion matrix of Multinomial Naive Bayes')
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.show()

In [None]:
# Train a linear-kernel support vector classifier on the same split and
# predict the held-out rows.
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
t = clf  # fit() returns the estimator itself, so `t` is the same object as `clf`
y_pred_svm = clf.predict(x_test)

In [None]:
# Macro-averaged scores for the SVM. The target has four disease classes, so
# average='binary' is not applicable; 'macro' matches the averaging used for
# the Multinomial Naive Bayes scores, which keeps the two models comparable.
precision_svm = precision_score(y_test, y_pred_svm, average='macro')
recall_svm = recall_score(y_test, y_pred_svm, average='macro')
f1score_svm = f1_score(y_test, y_pred_svm, average='macro')

# Print the SVM values (not the Naive Bayes ones), each with its label.
print('Precision: ' + str(precision_svm))
print('Recall: ' + str(recall_svm))
print('F1 score: ' + str(f1score_svm))

In [None]:
print(classification_report(y_test,y_pred_svm))

In [None]:
# Plot one-vs-rest ROC curves for the linear SVM.
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# roc_curve only accepts a binary target and needs continuous scores, so the
# disease labels are binarised (one column per class). The SVC was created
# without probability=True, so decision_function values are used as scores;
# with the default one-vs-rest shape there is one score column per class.
svm_classes = clf.classes_
y_test_onehot_svm = label_binarize(y_test, classes=svm_classes)
y_score_svm = clf.decision_function(x_test)
# With an indicator-matrix target the AUC is computed per column and averaged,
# which does not require the scores to be probabilities.
roc_auc = roc_auc_score(y_test_onehot_svm, y_score_svm, average='macro')

plt.figure(figsize=(6, 5))
for class_idx, class_name in enumerate(svm_classes):
    # One curve per disease: this class versus all the others.
    fpr, tpr, _ = roc_curve(y_test_onehot_svm[:, class_idx], y_score_svm[:, class_idx])
    plt.plot(fpr, tpr, marker='.', label="%s, auc = %.2f" % (class_name, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("SVM, macro one-vs-rest auc = %.2f" % roc_auc)
plt.legend()
plt.show()

In [None]:
import seaborn as sns

# Compute the matrix once, with an explicit class order, so the printed matrix
# and the heatmap agree and the heatmap ticks can show the disease names.
svm_labels = clf.classes_
cm_svm = confusion_matrix(y_test, y_pred_svm, labels=svm_labels)
print(cm_svm)

sns.heatmap(
    cm_svm,
    annot=True,
    fmt='d',
    cmap="inferno",
    xticklabels=svm_labels,
    yticklabels=svm_labels,
)
plt.title('Confusion matrix of SVM')
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB
model= GaussianNB()

In [None]:
model.fit(x_train,y_train)

In [None]:
model.score(x_test,y_test)

In [None]:
x_test[:10]

In [None]:
y_test[:10]

In [None]:
model.predict(x_test[:10])

In [None]:
model.predict_proba(x_test[:10])

In [None]:
# Per-class report for the Gaussian Naive Bayes model fitted above.
# (The SVM report is already printed right after the SVM metrics.)
y_pred_gaussnb = model.predict(x_test)
print(classification_report(y_test, y_pred_gaussnb))