In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import os
from sklearn.preprocessing import LabelEncoder
from scipy.special import inv_boxcox
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# for HD visualizations
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set_style('whitegrid')
plt.style.use('ggplot')

In [2]:
# df = pd.read_csv(r".\raw_data\dataset.csv")
# Alternative Way: 
# always use dynamic path (using OS) as we have to run our applications on server 
data_dir = 'raw_data'
filename = 'dataset.csv'
file_path = os.path.join(data_dir, filename)
df = pd.read_csv(file_path)


In [3]:
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:
df.shape

(4920, 18)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB


In [6]:
df.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Fungal infection,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,muscle_pain,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


In [None]:
# df["Disease"] = df["Disease"].map({'Drug Reaction':0, 'Malaria':1, 'Allergy':2, 'Hypothyroidism':3,
#                   'Psoriasis':4, 'GERD':5, 'Chronic cholestasis':6, 'hepatitis A':7,
#                   'Osteoarthristis':8, '(vertigo) Paroymsal  Positional Vertigo':9,
#                   'Hypoglycemia':10, 'Acne':11, 'Diabetes':12, 'Impetigo':13, 'Hypertension':14,
#                   'Peptic ulcer diseae':15, 'Dimorphic hemorrhoids(piles)':16,
#                   'Common Cold':17, 'Chicken pox':18, 'Cervical spondylosis':19,
#                   'Hyperthyroidism':20, 'Urinary tract infection':21, 'Varicose veins':22,
#                   'AIDS':23, 'Paralysis (brain hemorrhage)':24, 'Typhoid':25, 'Hepatitis B':26,
#                   'Fungal infection':27, 'Hepatitis C':28, 'Migraine':29, 'Bronchial Asthma':30,
#                   'Alcoholic hepatitis':31, 'Jaundice':32, 'Hepatitis E':33, 'Dengue':34,
#                   'Hepatitis D':35, 'Heart attack':36, 'Pneumonia':37, 'Arthritis':38,
#                   'Gastroenteritis':39, 'Tuberculosis':40})

In [7]:
df.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [8]:
# if disease contain null value drop them
df1 = df[df['Disease'].isnull()]
print(df1.shape)
df.drop(axis=0,index=df1.index,inplace=True)

(0, 18)


In [9]:
# df["Disease"] = df["Disease"].astype(int)

In [10]:
# why this line??
df.reset_index(drop = "first", inplace = True )

In [11]:
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [12]:
# treating null values
df.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [13]:
df.replace(np.nan,'',regex=True, inplace = True)  

In [14]:
df.isna().sum()

Disease       0
Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64

In [None]:
# feature engineering combining all Symptoms
# df["Symptoms"] = df[["Symptom_1", "Symptom_2","Symptom_3", "Symptom_4","Symptom_5", "Symptom_6", "Symptom_7","Symptom_8",
#                    "Symptom_9", "Symptom_10", "Symptom_11", "Symptom_12", "Symptom_13", "Symptom_14",
#                    "Symptom_15","Symptom_16", "Symptom_17"]].apply(",".join, axis=1)

In [15]:
# better way of combine all
df["Symptoms"] = df.iloc[:, 1:].astype(str).apply(lambda x: ",".join(x), axis=1)

In [16]:
df

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Symptoms
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,"itching, skin_rash, nodal_skin_eruptions, disc..."
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,"skin_rash, nodal_skin_eruptions, dischromic _..."
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,"itching, nodal_skin_eruptions, dischromic _pat..."
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,"itching, skin_rash, dischromic _patches,,,,,,,..."
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,"itching, skin_rash, nodal_skin_eruptions,,,,,,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning_movements,loss_of_balance,unsteadiness,,,,,,,,,,,,"vomiting, headache, nausea, spinning_movement..."
4916,Acne,skin_rash,pus_filled_pimples,blackheads,scurring,,,,,,,,,,,,,,"skin_rash, pus_filled_pimples, blackheads, sc..."
4917,Urinary tract infection,burning_micturition,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,,,,,,,,,,,,,,"burning_micturition, bladder_discomfort, foul..."
4918,Psoriasis,skin_rash,joint_pain,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,,"skin_rash, joint_pain, skin_peeling, silver_l..."


In [None]:
#to get all cols list 
# column_names = df.columns.tolist()
# column_names

In [None]:
# drop extra columns
# df.drop(["Symptom_1","Symptom_2","Symptom_3", "Symptom_4","Symptom_5", "Symptom_6", "Symptom_7","Symptom_8",
#                    "Symptom_9", "Symptom_10", "Symptom_11", "Symptom_12", "Symptom_13", "Symptom_14",
#                    "Symptom_15","Symptom_16", "Symptom_17"], axis = 1, inplace = True)

In [17]:
#better way to drop columns
df = df.drop(df.columns[1:18], axis=1)


In [19]:
df.head()

Unnamed: 0,Disease,Symptoms
0,Fungal infection,"itching, skin_rash, nodal_skin_eruptions, disc..."
1,Fungal infection,"skin_rash, nodal_skin_eruptions, dischromic _..."
2,Fungal infection,"itching, nodal_skin_eruptions, dischromic _pat..."
3,Fungal infection,"itching, skin_rash, dischromic _patches,,,,,,,..."
4,Fungal infection,"itching, skin_rash, nodal_skin_eruptions,,,,,,..."


In [20]:
df.shape

(4920, 2)

In [21]:
df.isna().sum()

Disease     0
Symptoms    0
dtype: int64

### --------------------------------------------------------------------------------------------------------------------------

# symptoms_description

In [22]:
#loading symptom_Description csv file
# symptoms_description = pd.read_csv(r".\raw_data\symptom_Description.csv")

#alternative way
# data_dir = 'raw_data'
filename = 'symptom_Description.csv'
file_path = os.path.join(data_dir, filename)
symptoms_description = pd.read_csv(file_path)


In [23]:
symptoms_description.head()

Unnamed: 0,Disease,Description
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...
1,Malaria,An infectious disease caused by protozoan para...
2,Allergy,An allergy is an immune system response to a f...
3,Hypothyroidism,"Hypothyroidism, also called underactive thyroi..."
4,Psoriasis,Psoriasis is a common skin disorder that forms...


In [24]:
symptoms_description.shape

(41, 2)

## Symptom_Severity

In [25]:
# loading Symptom-severity csv file
Symptom_severity = pd.read_csv(r".\raw_data\Symptom_Severity.csv")

#alternative way
# data_dir = 'raw_data'
filename = 'Symptom_Severity.csv'
file_path = os.path.join(data_dir, filename)
Symptom_severity = pd.read_csv(file_path)

In [26]:
Symptom_severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [27]:
Symptom_severity.shape

(133, 2)

In [28]:
Symptom_severity["Symptom"].nunique()

132

In [29]:
Symptom_severity["weight"].unique()

array([1, 3, 4, 5, 6, 2, 7], dtype=int64)

In [30]:
Symptom_severity["Symptoms"] = Symptom_severity["Symptom"]

In [31]:
Symptom_severity

Unnamed: 0,Symptom,weight,Symptoms
0,itching,1,itching
1,skin_rash,3,skin_rash
2,nodal_skin_eruptions,4,nodal_skin_eruptions
3,continuous_sneezing,4,continuous_sneezing
4,shivering,5,shivering
...,...,...,...
128,inflammatory_nails,2,inflammatory_nails
129,blister,4,blister
130,red_sore_around_nose,2,red_sore_around_nose
131,yellow_crust_ooze,3,yellow_crust_ooze


## Symptoms_Precautions

In [32]:
# loading symtoms_precautions csv file
# symptoms_precautions = pd.read_csv(r".\raw_data\Symptom_Precaution.csv")

#alternative way
data_dir = 'raw_data'
filename = 'Symptom_Precaution.csv'
file_path = os.path.join(data_dir, filename)
symtoms_precautions = pd.read_csv(file_path)

In [33]:
symtoms_precautions.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [34]:
symtoms_precautions.shape

(41, 5)

In [35]:
symtoms_precautions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Disease       41 non-null     object
 1   Precaution_1  41 non-null     object
 2   Precaution_2  41 non-null     object
 3   Precaution_3  40 non-null     object
 4   Precaution_4  40 non-null     object
dtypes: object(5)
memory usage: 1.7+ KB


In [36]:
# treating null
df.replace(np.nan,'',regex=True, inplace = True) 

## ------------------------------------------------
## Merging symptoms_description And dataset

In [37]:
data = pd.merge(symptoms_description,df, how = "inner", on = "Disease")

In [None]:
# data = pd.merge(Symptom_severity,df, how = "inner", on = "Symptoms")

In [38]:
data.head()

Unnamed: 0,Disease,Description,Symptoms
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."
1,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, stomach_pain, burning_micturition, sp..."
2,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, burning_micturition, spott..."
3,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, spotting_ ur..."
4,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."


In [39]:
data.shape

(4560, 3)

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4560 entries, 0 to 4559
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      4560 non-null   object
 1   Description  4560 non-null   object
 2   Symptoms     4560 non-null   object
dtypes: object(3)
memory usage: 142.5+ KB


In [41]:
# we dont requred all this 3 step as there is no duplicates
data.columns[data.columns.duplicated()]

Index([], dtype='object')

In [42]:
data = data.loc[:, ~data.columns.duplicated()]

In [43]:
data.shape

(4560, 3)

In [44]:
data.head()

Unnamed: 0,Disease,Description,Symptoms
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."
1,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, stomach_pain, burning_micturition, sp..."
2,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, burning_micturition, spott..."
3,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, spotting_ ur..."
4,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."


In [45]:
data["Disease"].unique()

array(['Drug Reaction', 'Malaria', 'Allergy', 'Hypothyroidism',
       'Psoriasis', 'GERD', 'Chronic cholestasis', 'hepatitis A',
       'Osteoarthristis', '(vertigo) Paroymsal  Positional Vertigo',
       'Hypoglycemia', 'Acne', 'Impetigo', 'Peptic ulcer diseae',
       'Common Cold', 'Chicken pox', 'Cervical spondylosis',
       'Hyperthyroidism', 'Urinary tract infection', 'Varicose veins',
       'AIDS', 'Paralysis (brain hemorrhage)', 'Typhoid', 'Hepatitis B',
       'Fungal infection', 'Hepatitis C', 'Migraine', 'Bronchial Asthma',
       'Alcoholic hepatitis', 'Jaundice', 'Hepatitis E', 'Dengue',
       'Hepatitis D', 'Heart attack', 'Pneumonia', 'Arthritis',
       'Gastroenteritis', 'Tuberculosis'], dtype=object)

In [None]:
# data["Disease"] = data["Disease"].map({'Drug Reaction':0, 'Malaria':1, 'Allergy':2, 'Hypothyroidism':3,
#                   'Psoriasis':4, 'GERD':5, 'Chronic cholestasis':6, 'hepatitis A':7,
#                   'Osteoarthristis':8, '(vertigo) Paroymsal  Positional Vertigo':9,
#                   'Hypoglycemia':10, 'Acne':11, 'Diabetes':12, 'Impetigo':13, 'Hypertension':14,
#                   'Peptic ulcer diseae':15, 'Dimorphic hemorrhoids(piles)':16,
#                   'Common Cold':17, 'Chicken pox':18, 'Cervical spondylosis':19,
#                   'Hyperthyroidism':20, 'Urinary tract infection':21, 'Varicose veins':22,
#                   'AIDS':23, 'Paralysis (brain hemorrhage)':24, 'Typhoid':25, 'Hepatitis B':26,
#                   'Fungal infection':27, 'Hepatitis C':28, 'Migraine':29, 'Bronchial Asthma':30,
#                   'Alcoholic hepatitis':31, 'Jaundice':32, 'Hepatitis E':33, 'Dengue':34,
#                   'Hepatitis D':35, 'Heart attack':36, 'Pneumonia':37, 'Arthritis':38,
#                   'Gastroenteritis':39, 'Tuberculosis':40})

In [46]:
data.head()

Unnamed: 0,Disease,Description,Symptoms
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."
1,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, stomach_pain, burning_micturition, sp..."
2,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, burning_micturition, spott..."
3,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, spotting_ ur..."
4,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."


## --------------------------------------------------
## saving data file

In [47]:
# data.to_csv(r".\processed_data\dataset_final.csv")
#alternative way

output_dir = "processed_data"
processed_dataset_name="processed_dataset.csv"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

processed_file_path = os.path.join(output_dir, processed_dataset_name)
data.to_csv(processed_file_path)

## -------------------------------------------------------------------------
## Model Training 

In [48]:
# df = pd.read_csv(r".\processed_data\processed_dataset.csv",index_col=[0])
#alternative way
data_dir = 'processed_data'
filename = 'processed_dataset.csv'
file_path = os.path.join(data_dir, filename)
df = pd.read_csv(file_path ,index_col=[0])

In [49]:
df.head()

Unnamed: 0,Disease,Description,Symptoms
0,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."
1,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, stomach_pain, burning_micturition, sp..."
2,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, burning_micturition, spott..."
3,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, spotting_ ur..."
4,Drug Reaction,An adverse drug reaction (ADR) is an injury ca...,"itching, skin_rash, stomach_pain, burning_mict..."


In [50]:
df.isna().sum()

Disease        0
Description    0
Symptoms       0
dtype: int64

In [51]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
#NLTK provides various resources, such as word lists and corpora, which need to be downloaded separately.
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')


In [52]:
lemmatizer = WordNetLemmatizer()

In [53]:
def preprocess(ReviewText, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", ReviewText)

    # Remove white space
    pattern = re.compile(r'\s+') 
    sentence = re.sub(pattern, ' ', ReviewText)
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if t not in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens)])

In [54]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [55]:
df['Symptoms'] = df['Symptoms'].progress_apply(lambda x: preprocess(x, 'lemma'))

100%|██████████| 4560/4560 [01:26<00:00, 52.85it/s] 


In [56]:
df['Description'] = df['Description'].progress_apply(lambda x: preprocess(x, 'lemma'))

100%|██████████| 4560/4560 [04:19<00:00, 17.59it/s]


In [57]:
df["Symptoms+Description"] = df.Symptoms.str.cat(df.Description)  

In [58]:
# df.to_csv(r'.\chatbot_dataset\temp_df.csv')
#alternative way
output_dir = "chatbot_dataset"
temp_dataset_name="temp_df.csv"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

temp_file_path = os.path.join(output_dir, temp_dataset_name)
df.to_csv(temp_file_path)

In [59]:
# df1 = pd.read_csv(r".\chatbot_dataset\temp_df.csv",index_col = [0] )
data_dir = 'chatbot_dataset'
filename = 'temp_df.csv'
file_path = os.path.join(data_dir, filename)
df1 = pd.read_csv(file_path,index_col = [0])

In [60]:
df1

Unnamed: 0,Disease,Description,Symptoms,Symptoms+Description
0,Drug Reaction,adverse drug reaction (adr) injury caused taki...,"itching, skin_rash, stomach_pain, burning_mict...","itching, skin_rash, stomach_pain, burning_mict..."
1,Drug Reaction,adverse drug reaction (adr) injury caused taki...,"itching, stomach_pain, burning_micturition, sp...","itching, stomach_pain, burning_micturition, sp..."
2,Drug Reaction,adverse drug reaction (adr) injury caused taki...,"itching, skin_rash, burning_micturition, spott...","itching, skin_rash, burning_micturition, spott..."
3,Drug Reaction,adverse drug reaction (adr) injury caused taki...,"itching, skin_rash, stomach_pain, spotting_ ur...","itching, skin_rash, stomach_pain, spotting_ ur..."
4,Drug Reaction,adverse drug reaction (adr) injury caused taki...,"itching, skin_rash, stomach_pain, burning_mict...","itching, skin_rash, stomach_pain, burning_mict..."
...,...,...,...,...
4555,Tuberculosis,tuberculosis (tb) infectious disease usually c...,"chills, vomiting, fatigue, weight_loss, cough,...","chills, vomiting, fatigue, weight_loss, cough,..."
4556,Tuberculosis,tuberculosis (tb) infectious disease usually c...,"chills, vomiting, fatigue, weight_loss, cough,...","chills, vomiting, fatigue, weight_loss, cough,..."
4557,Tuberculosis,tuberculosis (tb) infectious disease usually c...,"chills, vomiting, fatigue, weight_loss, cough,...","chills, vomiting, fatigue, weight_loss, cough,..."
4558,Tuberculosis,tuberculosis (tb) infectious disease usually c...,"chills, vomiting, fatigue, weight_loss, cough,...","chills, vomiting, fatigue, weight_loss, cough,..."


In [61]:
# i think we should use df1 from here onwards

X = df[["Symptoms+Description"]]
y = df["Disease"]

In [62]:
# split into train and test

from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y, train_size = 0.80, random_state= 42)

In [63]:
print("train dataset: ",X_train.shape, y_train.shape)
print("test dataset: ",X_test.shape, y_test.shape)

train dataset:  (3648, 1) (3648,)
test dataset:  (912, 1) (912,)


In [64]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
vocab = TfidfVectorizer()
X_train_trans = vocab.fit_transform(X_train["Symptoms+Description"])
X_test_trans = vocab.transform(X_test["Symptoms+Description"])

In [66]:
print(len(vocab.vocabulary_))

print(type(X_train_trans))

print( X_train_trans.shape)

658
<class 'scipy.sparse._csr.csr_matrix'>
(3648, 658)


In [67]:
print(X_train_trans.shape)
print(X_test_trans.shape)

(3648, 658)
(912, 658)


### LogisticRegression

In [74]:
from sklearn.linear_model import LogisticRegression
LRclassifier = LogisticRegression()
LRclassifier.fit(X_train_trans, y_train)
y_train_pred = LRclassifier.predict(X_train_trans)
y_test_pred = LRclassifier.predict(X_test_trans)
from sklearn import metrics
accuracy_log_train = metrics.accuracy_score(y_train,y_train_pred)
accuracy_log_test = metrics.accuracy_score(y_test,y_test_pred)
print('Accuracy_train :',accuracy_log_train)
print('Accuracy_test :',accuracy_log_test)

Accuracy_train : 1.0
Accuracy_test : 1.0


### DecisionTreeClassifier

In [69]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth= 35)
classifier.fit(X_train_trans, y_train)
y_train_pred = classifier.predict(X_train_trans)
y_test_pred = classifier.predict(X_test_trans)
from sklearn import metrics
accuracy_DT_train = metrics.accuracy_score(y_train,y_train_pred)
accuracy_DT_test = metrics.accuracy_score(y_test,y_test_pred)
print('Accuracy_train :',accuracy_DT_train)
print('Accuracy_test :',accuracy_DT_test)

Accuracy_train : 0.9528508771929824
Accuracy_test : 0.9254385964912281


### RandomForestClassifier


In [76]:
from sklearn.ensemble import RandomForestClassifier
RFclassifier = RandomForestClassifier(max_depth=3)
RFclassifier.fit(X_train_trans, y_train)
y_train_pred = RFclassifier.predict(X_train_trans)
y_test_pred = RFclassifier.predict(X_test_trans)
from sklearn import metrics
accuracy_RFC_train = metrics.accuracy_score(y_train,y_train_pred)
accuracy_RFC_test = metrics.accuracy_score(y_test,y_test_pred)
print('Accuracy_train :',accuracy_RFC_train)
print('Accuracy_test :',accuracy_RFC_test)

Accuracy_train : 1.0
Accuracy_test : 1.0


In [71]:
input_symptoms = ['fever', 'cough', 'headache']
input_vector = vocab.transform(input_symptoms)

In [73]:
print(type(input_vector))

<class 'scipy.sparse._csr.csr_matrix'>


In [80]:
pred = LRclassifier.predict(input_vector)
print(pred)

['Typhoid' 'GERD' 'Paralysis (brain hemorrhage)']


In [81]:
pred = RFclassifier.predict(input_vector)
print(pred)

['Typhoid' 'Hepatitis C' 'Hepatitis C']
