Date: 02-09-2021

Description: Creates the data record file used for constructing the Bayesian network

In [None]:
import pickle
import copy
import pandas as pd

In [None]:
# Root folder of the synthetic dataset; every pickle file below is read relative to it.
filepath_for_all = "/content/drive/MyDrive/synthetic_dataset"

In [None]:
# Identifying the names of the underlying disease and symptoms in our dataset.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open(filepath_for_all + '/disease_symptom.p', 'rb') as disease_symptom_file:
  disease_symptom = pickle.load(disease_symptom_file)

# The keys of the mapping are the disease names.
all_diseases = list(disease_symptom)

# Each entry's 'symptom' mapping is keyed by symptom name. dict.fromkeys removes
# duplicates while keeping first-seen order, so the symptom order (and therefore
# the column order built from it) is the same on every run. A set() of strings
# is ordered by randomised hashes and changes between kernel restarts.
all_symptoms = list(dict.fromkeys(
    symptom
    for value in disease_symptom.values()
    for symptom in value['symptom']
))

In [None]:
# Report how many diseases were collected, then show the first three as a sanity check.
print("The total number of disease are {}".format(len(all_diseases)))
print("Sample diseases: {}, {}, {}".format(*all_diseases[:3]))

The total number of disease are 90
Sample diseases: Cat scratch disease, Dengue fever, Gas gangrene


In [None]:
# Report how many distinct symptoms were collected, then show the first three as a sanity check.
print("The total number of symptoms are {}".format(len(all_symptoms)))
print("Sample symptoms: {}, {}, {}".format(*all_symptoms[:3]))

The total number of symptoms are 266
Sample symptoms: Back pain, Joint stiffness or tightness, Neck stiffness or tightness


In [None]:
# Constructing all the goals present within our training dataset
groups = ['1', '4', '5', '6', '7', '12', '13', '14', '19']
training_goal_set = []

# One goal_set.p per group folder ('label<group>'), followed by the file
# belonging to the general category at the dataset root.
goal_set_files = [filepath_for_all + '/label' + i + '/goal_set.p' for i in groups]
goal_set_files.append(filepath_for_all + '/goal_set.p')

# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
for filename in goal_set_files:
  # 'with' closes each file as soon as it has been read instead of leaking the handle.
  with open(filename, 'rb') as goal_set_file:
    goal_set = pickle.load(goal_set_file)
  # Only the 'train' split of each file is collected.
  training_goal_set.extend(goal_set['train'])

print("Completed loading all the training data !!")
print("The number of user goals in the training data is {}".format(len(training_goal_set)))

Completed loading all the training data !!
The number of user goals in the training data is 60000


In [None]:
# Display the first user goal to show the structure of a record.
training_goal_set[0]

{'consult_id': 3833,
 'disease_tag': 'Chagas disease',
 'goal': {'explicit_inform_slots': {'Fatigue': True},
  'implicit_inform_slots': {'Facial pain': True,
   'Joint stiffness or tightness': True,
   'Wrist pain': True},
  'request_slots': {'disease': 'UNK'}},
 'group_id': 0}

In [None]:
# Developing data records for the construction of a dataframe.
# Every record is a list of booleans laid out as
#   [one flag per disease] + [one flag per symptom]
# in the order given by column_values.
row_values = []
column_values = all_diseases + all_symptoms

# Column position of each name *within its own section*. A name that is both a
# disease and a symptom (as 'Diaper rash' appears to be in this dataset) owns
# two columns; matching on the bare name across all columns would switch on
# both of them whenever either one applies.
disease_column = {name: position for position, name in enumerate(all_diseases)}
symptom_column = {name: len(all_diseases) + position for position, name in enumerate(all_symptoms)}

for g in training_goal_set:
  temp_row = [False] * len(column_values)

  disease = g['disease_tag']
  if disease in disease_column:
    temp_row[disease_column[disease]] = True

  # NOTE(review): only the slot names are used; the True/False value stored for
  # each slot is ignored, so a symptom recorded as False would still be marked
  # present - confirm this is intended.
  symptoms = list(g['goal']['explicit_inform_slots']) + list(g['goal']['implicit_inform_slots'])
  for symptom in symptoms:
    # Names that are not known columns are skipped.
    if symptom in symptom_column:
      temp_row[symptom_column[symptom]] = True

  # temp_row is a fresh list on every iteration, so no copy is needed.
  row_values.append(temp_row)

In [None]:
# Report the number of records, then the length and content of the first one.
print("The total number of data records are {}".format(len(row_values)))
sample_record = row_values[0]
print("Sample data record(length: {}): {} ".format(len(sample_record), sample_record))

The total number of data records are 60000
Sample data record(length: 356): [False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

In [None]:
# Turn the raw names into column labels: ', ' and ' ' become '_', and a
# '_disease' / '_symptom' suffix records which section the column belongs to.
#
# The section is decided by position - column_values lists every disease first,
# followed by every symptom - rather than by name. Any name that occurs in both
# lists (not only 'Diaper rash') therefore gets one '_disease' label and one
# '_symptom' label instead of two identical labels.
modified_column_values = []
n_disease_columns = len(all_diseases)
for position, c in enumerate(column_values):
  column_name = c.replace(', ', '_').replace(' ', '_')
  column_name += '_disease' if position < n_disease_columns else '_symptom'
  modified_column_values.append(column_name)
print(modified_column_values)

['Cat_scratch_disease_disease', 'Dengue_fever_disease', 'Gas_gangrene_disease', 'Chickenpox_disease', 'Granuloma_inguinale_disease', 'Chagas_disease_disease', 'Chancroid_disease', 'Chlamydia_disease', 'Acariasis_disease', 'Gonorrhea_disease', 'Fluid_overload_disease', 'Diabetic_ketoacidosis_disease', 'Amyloidosis_disease', 'Diabetes_insipidus_disease', 'Diabetic_retinopathy_disease', 'Diabetic_peripheral_neuropathy_disease', 'Carcinoid_syndrome_disease', 'Graves_disease_disease', 'Cushing_syndrome_disease', 'Cystic_Fibrosis_disease', 'Conversion_disorder_disease', 'Chronic_pain_disorder_disease', 'Acute_stress_reaction_disease', 'Factitious_disorder_disease', 'Alcohol_intoxication_disease', 'Eating_disorder_disease', 'Anxiety_disease', 'Dissociative_disorder_disease', 'Drug_abuse_cocaine_disease', 'Adjustment_reaction_disease', 'Cerebral_edema_disease', 'Degenerative_disc_disease_disease', 'Guillain_Barre_syndrome_disease', 'Complex_regional_pain_syndrome_disease', 'Amyotrophic_lateral

In [None]:
# Assemble the boolean records into a frame: one row per record, one column per label.
ds_dataframe = pd.DataFrame(row_values, columns=modified_column_values)

In [None]:
# Display the disease-side 'Diaper rash' column to check that it can be selected by its suffixed label.
ds_dataframe['Diaper_rash_disease']

0        False
1        False
2        False
3        False
4        False
         ...  
59995    False
59996    False
59997    False
59998    False
59999    False
Name: Diaper_rash_disease, Length: 60000, dtype: bool

In [None]:
# Write the records to CSV with the column labels as the header row and without the index column.
ds_dataframe.to_csv(
    '/content/ds_dataframe.csv',
    index=False,
    header=True,
)