In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import os
from sklearn.preprocessing import LabelEncoder
from scipy.special import inv_boxcox
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. deprecations or solver
# convergence messages) that could matter for the modelling cells — confirm it is intended.
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# for HD visualizations
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Apply the seaborn 'whitegrid' style, then the matplotlib 'ggplot' style sheet.
# NOTE(review): the second call presumably overrides overlapping settings of the
# first — confirm which look is actually wanted.
sns.set_style('whitegrid')
plt.style.use('ggplot')

In [None]:
# Load the raw disease/symptom table from <data_dir>/<filename>.
# The path is assembled with os.path.join rather than a hard-coded
# r".\raw_data\dataset.csv" string so it does not depend on the Windows path
# separator (the notebook is also meant to run on a server).
# `data_dir` is read again by later loading cells.
data_dir = 'raw_data'
filename = 'dataset.csv'
file_path = os.path.join(data_dir, filename)
df = pd.read_csv(file_path)


In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics per column.
df.describe()

In [None]:
# df["Disease"] = df["Disease"].map({'Drug Reaction':0, 'Malaria':1, 'Allergy':2, 'Hypothyroidism':3,
#                   'Psoriasis':4, 'GERD':5, 'Chronic cholestasis':6, 'hepatitis A':7,
#                   'Osteoarthristis':8, '(vertigo) Paroymsal  Positional Vertigo':9,
#                   'Hypoglycemia':10, 'Acne':11, 'Diabetes':12, 'Impetigo':13, 'Hypertension':14,
#                   'Peptic ulcer diseae':15, 'Dimorphic hemorrhoids(piles)':16,
#                   'Common Cold':17, 'Chicken pox':18, 'Cervical spondylosis':19,
#                   'Hyperthyroidism':20, 'Urinary tract infection':21, 'Varicose veins':22,
#                   'AIDS':23, 'Paralysis (brain hemorrhage)':24, 'Typhoid':25, 'Hepatitis B':26,
#                   'Fungal infection':27, 'Hepatitis C':28, 'Migraine':29, 'Bronchial Asthma':30,
#                   'Alcoholic hepatitis':31, 'Jaundice':32, 'Hepatitis E':33, 'Dengue':34,
#                   'Hepatitis D':35, 'Heart attack':36, 'Pneumonia':37, 'Arthritis':38,
#                   'Gastroenteritis':39, 'Tuberculosis':40})

In [None]:
# Count missing values per column before any cleaning.
df.isna().sum()

In [None]:
# Remove every row that has no Disease label; print the shape of the removed slice first.
missing_label = df['Disease'].isnull()
df1 = df.loc[missing_label]
print(df1.shape)
df.drop(index=df1.index, axis=0, inplace=True)

In [None]:
# df["Disease"] = df["Disease"].astype(int)

In [None]:
# Dropping rows may leave gaps in the index labels; rebuild a contiguous
# 0..n-1 index and discard the old labels instead of keeping them as a column.
# `drop` is a boolean flag, so pass True rather than a truthy string.
df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the frame after the index was rebuilt.
df.head()

In [None]:
# Count the missing values that remain in each column before they are filled.
df.isna().sum()

In [None]:
# Replace every missing value with an empty string so the symptom columns can be
# joined as text. fillna is the dedicated API for this; the previous
# replace(..., regex=True) call used a regex flag that has no meaning for NaN.
df = df.fillna('')

In [None]:
# Re-count missing values per column after the fill.
df.isna().sum()

In [None]:
# feature engineering combining all Symptoms
# df["Symptoms"] = df[["Symptom_1", "Symptom_2","Symptom_3", "Symptom_4","Symptom_5", "Symptom_6", "Symptom_7","Symptom_8",
#                    "Symptom_9", "Symptom_10", "Symptom_11", "Symptom_12", "Symptom_13", "Symptom_14",
#                    "Symptom_15","Symptom_16", "Symptom_17"]].apply(",".join, axis=1)

In [None]:
# Build one comma-separated string per row from every column after the first.
symptom_values = df.iloc[:, 1:].astype(str)
df["Symptoms"] = symptom_values.apply(",".join, axis=1)

In [None]:
# Display the frame with the new Symptoms column.
df

In [None]:
#to get all cols list 
# column_names = df.columns.tolist()
# column_names

In [None]:
# drop extra columns
# df.drop(["Symptom_1","Symptom_2","Symptom_3", "Symptom_4","Symptom_5", "Symptom_6", "Symptom_7","Symptom_8",
#                    "Symptom_9", "Symptom_10", "Symptom_11", "Symptom_12", "Symptom_13", "Symptom_14",
#                    "Symptom_15","Symptom_16", "Symptom_17"], axis = 1, inplace = True)

In [None]:
# Remove the 17 columns at positions 1..17 (presumably Symptom_1..Symptom_17),
# keeping the first column and the combined Symptoms column.
df = df.drop(columns=df.columns[1:18])


In [None]:
# Preview the reduced frame.
df.head()

In [None]:
# (rows, columns) after the positional column drop.
df.shape

In [None]:
# Count missing values per column in the reduced frame.
df.isna().sum()

### --------------------------------------------------------------------------------------------------------------------------

# symptoms_description

In [None]:
# Load the disease-description table (symptom_Description.csv).
# Hard-coded Windows-style alternative, kept for reference:
# symptoms_description = pd.read_csv(r".\raw_data\symptom_Description.csv")

# NOTE: `data_dir` is not set in this cell; it is the value assigned by an
# earlier loading cell.
# data_dir = 'raw_data'
filename = 'symptom_Description.csv'
file_path = os.path.join(data_dir, filename)
symptoms_description = pd.read_csv(file_path)


In [None]:
# Preview the description table.
symptoms_description.head()

In [None]:
# (rows, columns) of the description table.
symptoms_description.shape

## Symptom_Severity

In [None]:
# Load the symptom-severity table (Symptom_Severity.csv).
# The earlier hard-coded r".\raw_data\Symptom_Severity.csv" read was removed: it
# only resolves where "\" is the path separator and it duplicated the portable
# read below.
# Set the folder explicitly so this cell does not depend on which loading cell
# last assigned `data_dir`.
data_dir = 'raw_data'
filename = 'Symptom_Severity.csv'
file_path = os.path.join(data_dir, filename)
Symptom_severity = pd.read_csv(file_path)

In [None]:
# Preview the severity table.
Symptom_severity.head()

In [None]:
# (rows, columns) of the severity table.
Symptom_severity.shape

In [None]:
# Number of distinct symptom names.
Symptom_severity["Symptom"].nunique()

In [None]:
# Distinct values found in the weight column.
Symptom_severity["weight"].unique()

In [None]:
# Copy the Symptom column under the name "Symptoms".
# NOTE(review): presumably added so the table could be merged on "Symptoms"
# (see the commented-out merge further down) — confirm it is still needed.
Symptom_severity["Symptoms"] = Symptom_severity["Symptom"]

In [None]:
# Display the severity table with the extra column.
Symptom_severity

## Symptoms_Precautions

In [None]:
# Load the precautions table (Symptom_Precaution.csv).
# Hard-coded Windows-style alternative, kept for reference:
# symptoms_precautions = pd.read_csv(r".\raw_data\Symptom_Precaution.csv")

# Portable path built with os.path.join.
# NOTE: the variable is spelled `symtoms_precautions` (no "p"); later cells use the same spelling.
data_dir = 'raw_data'
filename = 'Symptom_Precaution.csv'
file_path = os.path.join(data_dir, filename)
symtoms_precautions = pd.read_csv(file_path)

In [None]:
# Preview the precautions table.
symtoms_precautions.head()

In [None]:
# (rows, columns) of the precautions table.
symtoms_precautions.shape

In [None]:
# Column dtypes and non-null counts of the precautions table.
symtoms_precautions.info()

In [None]:
# Replace missing precaution entries with empty strings in the table that was
# just loaded and inspected. The previous call targeted `df`, whose missing
# values had already been filled, so it had no effect on the precautions table.
symtoms_precautions = symtoms_precautions.fillna('')

## ------------------------------------------------
## Merging symptoms_description and dataset

In [None]:
# Inner-join the description table with the symptom dataset on the shared Disease column.
data = symptoms_description.merge(df, how="inner", on="Disease")

In [None]:
# data = pd.merge(Symptom_severity,df, how = "inner", on = "Symptoms")

In [None]:
# Preview the merged frame.
data.head()

In [None]:
# (rows, columns) of the merged frame.
data.shape

In [None]:
# Column dtypes and non-null counts of the merged frame.
data.info()

In [None]:
# List any column labels that occur more than once after the merge.
# Author's note: these three steps are not required because there are no duplicates.
data.columns[data.columns.duplicated()]

In [None]:
# Keep only the first occurrence of each column label.
data = data.loc[:, ~data.columns.duplicated()]

In [None]:
# (rows, columns) after the duplicate-label filter.
data.shape

In [None]:
# Preview the frame after the duplicate-label filter.
data.head()

In [None]:
# Distinct disease labels present in the merged frame.
data["Disease"].unique()

In [None]:
# data["Disease"] = data["Disease"].map({'Drug Reaction':0, 'Malaria':1, 'Allergy':2, 'Hypothyroidism':3,
#                   'Psoriasis':4, 'GERD':5, 'Chronic cholestasis':6, 'hepatitis A':7,
#                   'Osteoarthristis':8, '(vertigo) Paroymsal  Positional Vertigo':9,
#                   'Hypoglycemia':10, 'Acne':11, 'Diabetes':12, 'Impetigo':13, 'Hypertension':14,
#                   'Peptic ulcer diseae':15, 'Dimorphic hemorrhoids(piles)':16,
#                   'Common Cold':17, 'Chicken pox':18, 'Cervical spondylosis':19,
#                   'Hyperthyroidism':20, 'Urinary tract infection':21, 'Varicose veins':22,
#                   'AIDS':23, 'Paralysis (brain hemorrhage)':24, 'Typhoid':25, 'Hepatitis B':26,
#                   'Fungal infection':27, 'Hepatitis C':28, 'Migraine':29, 'Bronchial Asthma':30,
#                   'Alcoholic hepatitis':31, 'Jaundice':32, 'Hepatitis E':33, 'Dengue':34,
#                   'Hepatitis D':35, 'Heart attack':36, 'Pneumonia':37, 'Arthritis':38,
#                   'Gastroenteritis':39, 'Tuberculosis':40})

In [None]:
# Final preview of the merged frame before it is written to disk.
data.head()

## --------------------------------------------------
## Saving the processed data file

In [None]:
# Write the merged frame to processed_data/processed_dataset.csv using a
# portable path (replaces the hard-coded r".\processed_data\dataset_final.csv").

output_dir = "processed_data"
processed_dataset_name="processed_dataset.csv"
# exist_ok=True creates the folder when it is missing and does nothing otherwise,
# which avoids the separate existence check.
os.makedirs(output_dir, exist_ok=True)

processed_file_path = os.path.join(output_dir, processed_dataset_name)
# The index is written as the first CSV column.
data.to_csv(processed_file_path)

## -------------------------------------------------------------------------
## Model Training 

In [None]:
# Reload the processed CSV; the first CSV column is used as the index.
# NOTE: this rebinds both `df` and `data_dir`, so the raw-data frame and the
# raw-data folder name are no longer available under those names afterwards.
# Hard-coded Windows-style alternative, kept for reference:
# df = pd.read_csv(r".\processed_data\processed_dataset.csv",index_col=[0])
data_dir = 'processed_data'
filename = 'processed_dataset.csv'
file_path = os.path.join(data_dir, filename)
df = pd.read_csv(file_path ,index_col=[0])

In [None]:
# Preview the reloaded frame.
df.head()

In [None]:
# Count missing values per column after the CSV round trip.
df.isna().sum()

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# NLTK ships its word lists and corpora separately from the package. On a fresh environment, uncomment the lines below and run them once to download the resources this notebook uses.
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')


In [None]:
# Shared WordNet lemmatizer instance; preprocess() reads it as a global.
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess(ReviewText, flag):
    """Clean one piece of text and return it wrapped in a one-element Series.

    Steps, in order: replace every non-letter character with a space, collapse
    whitespace runs, lower-case, split on whitespace, drop NLTK English stop
    words, then stem or lemmatize each remaining token.

    Parameters
    ----------
    ReviewText : str
        Raw text to clean.
    flag : str
        ``'stem'`` applies a Porter stemmer; any other value applies the
        module-level ``lemmatizer``.

    Returns
    -------
    pandas.Series
        Single-element Series holding the cleaned tokens joined by one space.
    """
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", ReviewText)

    # Collapse whitespace runs. This must read `sentence`, not the raw input,
    # otherwise the substitution above is thrown away.
    sentence = re.sub(r"\s+", " ", sentence)

    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()

    # remove stop words; build the set once per call instead of re-reading the
    # stop-word list for every token.
    stop_words = set(stopwords.words("english"))
    clean_tokens = [t for t in tokens if t not in stop_words]

    # Stemming/Lemmatization
    if flag == 'stem':
        # No `stemmer` object is defined in the visible cells, so create one
        # here from the already-imported PorterStemmer class.
        stemmer = PorterStemmer()
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]

    return pd.Series([" ".join(clean_tokens)])

In [None]:
# Register tqdm's pandas integration; the cells below call `progress_apply`.
# NOTE(review): `tqdm_notebook` is imported but not used in the visible cells.
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [None]:
# Clean the Symptoms text with the lemmatizing branch of preprocess, with a progress bar.
# NOTE(review): preprocess returns a pd.Series for each value, so the apply
# result is presumably a one-column DataFrame being assigned to this column — confirm.
df['Symptoms'] = df['Symptoms'].progress_apply(lambda x: preprocess(x, 'lemma'))

In [None]:
# Same cleaning for the Description text.
df['Description'] = df['Description'].progress_apply(lambda x: preprocess(x, 'lemma'))

In [None]:
# Concatenate the cleaned symptom and description text into one feature column.
# A space separator keeps the last symptom token and the first description
# token from being fused into a single word.
df["Symptoms+Description"] = df.Symptoms.str.cat(df.Description, sep=" ")

In [None]:
# Write the text-processed frame to chatbot_dataset/temp_df.csv using a
# portable path (replaces the hard-coded r'.\chatbot_dataset\temp_df.csv').
output_dir = "chatbot_dataset"
temp_dataset_name="temp_df.csv"
# exist_ok=True creates the folder when it is missing and does nothing otherwise,
# which avoids the separate existence check.
os.makedirs(output_dir, exist_ok=True)

temp_file_path = os.path.join(output_dir, temp_dataset_name)
# The index is written as the first CSV column.
df.to_csv(temp_file_path)

In [None]:
# Read the saved CSV back into `df1`; the first CSV column is used as the index.
# Hard-coded Windows-style alternative, kept for reference:
# df1 = pd.read_csv(r".\chatbot_dataset\temp_df.csv",index_col = [0] )
data_dir = 'chatbot_dataset'
filename = 'temp_df.csv'
file_path = os.path.join(data_dir, filename)
df1 = pd.read_csv(file_path,index_col = [0])

In [None]:
# Display the reloaded copy.
df1

In [None]:
# Author's note: "i think we should use df1 from here onwards".
# NOTE(review): the code below still selects from the in-memory `df`, not the
# reloaded `df1` — decide which one is intended.

# Feature frame (single text column) and target labels.
X = df[["Symptoms+Description"]]
y = df["Disease"]

In [None]:
# Split into train and test: 80% of the rows go to train; random_state=42 fixes the shuffle.

from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y, train_size = 0.80, random_state= 42)

In [None]:
# Report the shapes of both splits.
print("train dataset: ",X_train.shape, y_train.shape)
print("test dataset: ",X_test.shape, y_test.shape)

In [None]:
# Turn the combined text into TF-IDF features.
# The vectorizer (named `vocab`) is fitted on the training text only and then
# reused to transform the test text.
from sklearn.feature_extraction.text import TfidfVectorizer
vocab = TfidfVectorizer()
X_train_trans = vocab.fit_transform(X_train["Symptoms+Description"])
X_test_trans = vocab.transform(X_test["Symptoms+Description"])

In [None]:
# Number of distinct terms learned from the training text.
print(len(vocab.vocabulary_))

# Container type of the transformed training features.
print(type(X_train_trans))

print( X_train_trans.shape)

In [None]:
# Shapes of the transformed train and test matrices.
print(X_train_trans.shape)
print(X_test_trans.shape)

### LogisticRegression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression model on the TF-IDF training features.
classifier = LogisticRegression()
classifier.fit(X_train_trans, y_train)

# Predict on both splits, train first, then test.
y_train_pred, y_test_pred = (
    classifier.predict(features) for features in (X_train_trans, X_test_trans)
)

# Accuracy on each split.
accuracy_log_train = metrics.accuracy_score(y_train, y_train_pred)
accuracy_log_test = metrics.accuracy_score(y_test, y_test_pred)
print('Accuracy_train :',accuracy_log_train)
print('Accuracy_test :',accuracy_log_test)

### DecisionTreeClassifier

In [None]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree on the TF-IDF training features.
# random_state is fixed (same seed as the train/test split) so repeated runs
# build the same tree and report the same accuracy.
classifier = DecisionTreeClassifier(max_depth=35, random_state=42)
classifier.fit(X_train_trans, y_train)

# Predict on both splits.
y_train_pred = classifier.predict(X_train_trans)
y_test_pred = classifier.predict(X_test_trans)

# Accuracy on each split.
accuracy_DT_train = metrics.accuracy_score(y_train,y_train_pred)
accuracy_DT_test = metrics.accuracy_score(y_test,y_test_pred)
print('Accuracy_train :',accuracy_DT_train)
print('Accuracy_test :',accuracy_DT_test)

### RandomForestClassifier


In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Fit a shallow random forest on the TF-IDF training features.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature draws, and therefore the reported accuracy, are repeatable.
classifier = RandomForestClassifier(max_depth=3, random_state=42)
classifier.fit(X_train_trans, y_train)

# Predict on both splits.
y_train_pred = classifier.predict(X_train_trans)
y_test_pred = classifier.predict(X_test_trans)

# Accuracy on each split.
accuracy_RFC_train = metrics.accuracy_score(y_train,y_train_pred)
accuracy_RFC_test = metrics.accuracy_score(y_test,y_test_pred)
print('Accuracy_train :',accuracy_RFC_train)
print('Accuracy_test :',accuracy_RFC_test)