In [1]:
import pandas as pd
import sys
import random
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import random

In [2]:
#nltk.download()

In [3]:
data = pd.read_csv('csv_data/dataset.csv')

Get some example sentences from the data

In [None]:
data.head()

In [4]:
diseases = []
for i in range(len(data['Disease'])):
    diseases.append(data['Disease'][i])

disease = set(diseases)

In [5]:
def get_symptoms_for1(symptom_num,disease):
    symptoms = []
    strval = "Symptom_" + str(symptom_num)
    for i in range(len(data[strval])):
        if data["Disease"][i] == disease:

            if str(data[strval][i]) != "nan":
                symptoms.append((str(data[strval][i])).replace("_"," "))

    return symptoms


def get_symptoms_per_disease(diseaseSet=disease):
    symptomDisease = {}

    out = []
    for h in diseaseSet:
        for k in range(1,18):
            out.extend(get_symptoms_for1(k,h))

        symptomDisease[h] = list(set(out))
        out=[] 

    return symptomDisease   

In [6]:
values = get_symptoms_per_disease()

In [7]:
len(list(values.keys()))

41

In [8]:
values2 = {}
for condition in list(values.keys()):
    words = values.get(condition)
    newarr = []
    for word in words:
        #print(word)
        word2 = word[1:]
        newarr.append(word2)
    values2[condition] = newarr

Now that we have all the symptom words per disease, we can hard code a couple hundred example sentences with symptom words to train the Markov Chain Model

In [9]:
def randWords(inpList):
    randomWords = []

    randomVals = random.sample(range(0,len(inpList)-1), 3)
    for randomIdx in randomVals:
        randomWords.append(inpList[randomIdx])

    return randomWords

def makeSentence(randomWords):
    #list description
    listDesc = (randomWords[0] + ", " + 
    randomWords[1] + ", " + 
    randomWords[2])

    #vague description
    vagueDesc = ("I feel " + randomWords[0] + 
    " and I also feel like " + 
    randomWords[1] + " but I also feel like " + 
    randomWords[2] + ".")

    return listDesc, vagueDesc

def makeSentences(inpList):
    generated_sents = []

    for i in range(41):
        listDesc, vagueDesc = makeSentence(randWords(inpList))
        generated_sents.append(listDesc)
        generated_sents.append(vagueDesc)
    
    return generated_sents

In [10]:
diseaseSents = {}

valueKeys = list(values2.keys())
for disease in valueKeys:
    conditions = values2.get(disease)
    diseaseSents[disease] = makeSentences(conditions)

In [11]:
allSymptoms = []
for setsymps in list(values2.values()):
    for symptom in setsymps:
        allSymptoms.append(symptom)
allSymptoms = set(allSymptoms)

Get the Markov chain set up for dataset generation

In [12]:
#define a few helper funcs
def clean_data(data):
    cleaned = []
 
    for line in data:
        line = line.lower() #makes it lowercase
 
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line) #takes out any symbols
 
        tokens = word_tokenize(line)
 
        words = [word for word in tokens if word.isalpha()] #check if only letters (no special chars/symbols)
 
        cleaned += words
 
    return cleaned

Clean the data per line and get it into a dictionary with condition: sentences

In [21]:
diseaseSents

{'Gastroenteritis': ['dehydration, diarrhoea, vomiting',
  'I feel dehydration and I also feel like diarrhoea but I also feel like vomiting.',
  'vomiting, diarrhoea, dehydration',
  'I feel vomiting and I also feel like diarrhoea but I also feel like dehydration.',
  'diarrhoea, dehydration, vomiting',
  'I feel diarrhoea and I also feel like dehydration but I also feel like vomiting.',
  'diarrhoea, dehydration, vomiting',
  'I feel diarrhoea and I also feel like dehydration but I also feel like vomiting.',
  'diarrhoea, dehydration, vomiting',
  'I feel diarrhoea and I also feel like dehydration but I also feel like vomiting.',
  'vomiting, diarrhoea, dehydration',
  'I feel vomiting and I also feel like diarrhoea but I also feel like dehydration.',
  'dehydration, vomiting, diarrhoea',
  'I feel dehydration and I also feel like vomiting but I also feel like diarrhoea.',
  'diarrhoea, vomiting, dehydration',
  'I feel diarrhoea and I also feel like vomiting but I also feel like dehy

In [22]:
conditions = diseaseSents.keys()

conditionSentences = {}

for condition in conditions:
    sentences = diseaseSents.get(condition)
    conditionSentences[condition] = []
    
    cleaned_sent = clean_data(sentences)
    #print(cleaned_sent)
        
    conditionSentences[condition] += cleaned_sent

Now that the data is cleaned, we can train a markov chain model on the data, which we can then use to generate synthetic data

In [23]:
def MCM(cleaned_sent, n_gram=1):
 
    markov_chain = {} #dictionary with keys for eventually calculating probability of the next move
 
    for i in range(len(cleaned_sent)-n_gram-1):
 
        curr_state, next_state = "", ""
 
        for j in range(n_gram):
 
            curr_state += cleaned_sent[i+j] + " "
            next_state += cleaned_sent[i+j+n_gram] + " "
 
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
 
        if curr_state not in markov_chain:
 
            markov_chain[curr_state] = {}
            markov_chain[curr_state][next_state] = 1
 
        else:
 
            if next_state in markov_chain[curr_state]:
 
                markov_chain[curr_state][next_state] += 1
 
            else:
                markov_chain[curr_state][next_state] = 1
   
    # calculating transition probabilities
    for curr_state, transition in markov_chain.items():
 
        total = sum(transition.values())
 
        for state, count in transition.items():
           
            markov_chain[curr_state][state] = count/total
       
    return markov_chain

In [24]:
conditionSentences

{'Gastroenteritis': ['dehydration',
  'diarrhoea',
  'vomiting',
  'i',
  'feel',
  'dehydration',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'diarrhoea',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'vomiting',
  'vomiting',
  'diarrhoea',
  'dehydration',
  'i',
  'feel',
  'vomiting',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'diarrhoea',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'diarrhoea',
  'dehydration',
  'vomiting',
  'i',
  'feel',
  'diarrhoea',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'vomiting',
  'diarrhoea',
  'dehydration',
  'vomiting',
  'i',
  'feel',
  'diarrhoea',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'vomiting',
  'diarrhoea',
  'dehydration',
  'vomiting',
  'i',
  'feel',
  'diarrhoea',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'but',
  'i',
  'also',
  'feel',


In [32]:
def train_MCM(dOC=conditionSentences):
    mcmConditions = {}
    
    conditions = list(dOC.keys())

    for condition in conditions:
        sents = (dOC.get(condition))

        model = MCM(sents)
        #print(model.keys())
        states = model.keys()
        modelVals = {}

        for state in list(states):
            modelVals[state] = model.get(state)

        mcmConditions[condition] = modelVals

    return mcmConditions

In [33]:
mcms = train_MCM()

In [34]:
conditionSentences

{'Gastroenteritis': ['dehydration',
  'diarrhoea',
  'vomiting',
  'i',
  'feel',
  'dehydration',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'diarrhoea',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'vomiting',
  'vomiting',
  'diarrhoea',
  'dehydration',
  'i',
  'feel',
  'vomiting',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'diarrhoea',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'diarrhoea',
  'dehydration',
  'vomiting',
  'i',
  'feel',
  'diarrhoea',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'vomiting',
  'diarrhoea',
  'dehydration',
  'vomiting',
  'i',
  'feel',
  'diarrhoea',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'but',
  'i',
  'also',
  'feel',
  'like',
  'vomiting',
  'diarrhoea',
  'dehydration',
  'vomiting',
  'i',
  'feel',
  'diarrhoea',
  'and',
  'i',
  'also',
  'feel',
  'like',
  'dehydration',
  'but',
  'i',
  'also',
  'feel',


Now that we have a trained Markov Chain, we can quickly set up a sentence generation function to create all the sentences per each condition, we we can then add to a json file and export to the database

In [35]:
def gen_sent(model,max_words,symp):
 
    n = 0
    curr_state = symp
    next_state = None
 
    sent = ""
    sent+=curr_state+" "
 
    while n < max_words:
        next_state = random.choices(list(model[curr_state].keys()),
                                    list(model[curr_state].values()))
       
        curr_state = next_state[0]
        sent+=curr_state+" "
        n+=1
 
    return sent

def implement_sent_gen(model,arr):
    out = []
    for i in range(len(arr)):
        if arr[i] in model.keys():
            for j in range(30):
                    out.append(gen_sent(model,15,symp=arr[i]))
    print(len(out))      
    return out

In [41]:
#get the conditions and sentences
generated_sents = {}
for condition in list(values.keys()):
    model = mcms.get(condition)
    symps = values2.get(condition)

    separatedSymps = []
    for symp in symps:
        separatedSymps += symp.split(" ")
    

    generated_sents[condition] = []

    for i in range(10):
        sentence = implement_sent_gen(model,separatedSymps)
        generated_sents[condition].extend(sentence)

90
90
90
90
90
90
90
90
90
90
270
270
270
270
270
270
270
270
270
270
210
210
210
210
210
210
210
210
210
210
300
300
300
300
300
300
300
300
300
300
570
570
570
570
570
570
570
570
570
570
690
690
690
690
690
690
690
690
690
690
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
240
180
180
180
180
180
180
180
180
180
180
270
270
270
270
270
270
270
270
270
270
450
450
450
450
450
450
450
450
450
450
210
210
210
210
210
210
210
210
210
210
240
240
240
240
240
240
240
240
240
240
330
330
330
330
330
330
330
330
330
330
240
240
240
240
240
240
240
240
240
240
690
690
690
690
690
690
690
690
690
690
300
300
300
300
300
300
300
300
300
300
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
360
120
120
120
120
120
120
120
120
120
120
330
330
330
330
330
330
330
330
330
330
810
810
810
810
810
810
810
810
810
810
330
330
330
330
330
330
330
330
330
330
180
180
180
180
180
180
180
180
180
180
750
750
750
750
750
750
750
750
750
750
390
390
39

In [43]:
#generated_sents.get("AIDS")

In [45]:
len(list(generated_sents.keys()))

41