### NaturDoc - TL BL WT 22-23

# Data clustering:

## Preparing Activity and Symptom Datasets for Word Embeddings:

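This notebook loads the Duke activity names and a second list of symptom names, combines them into one table, and flags each name as an activity, a symptom, or both: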

In [96]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

## Data from the dataset:

#### Import Duke remedies and activities:

In [105]:
# Load the two JSON exports from ./data into DataFrames.
duke_enriched_df = pd.read_json("./data/remedies.json")
# NOTE(review): the "activities" frame is read from symptoms.json — confirm this is the intended file.
activities_df = pd.read_json("./data/symptoms.json")

In [59]:
# Preview the first rows of the remedies frame to see which columns were loaded.
duke_enriched_df.head()

Unnamed: 0,remedyName,medicinalUses,commonNames,family,genus,species,vernacularNames,treatmentClinical,treatmentTraditional,treatmentFolk,contraindication,warnings,adverseEffects,posology,ratingAverage,totalNumberofRatings,popularityScore
0,Abelmoschus esculentus,"Abortifacient,Antidote,Boil,Burn,Catarrh,Coffe...","Huang Shu K'Uei,Bamia,Molondron,Quiabeiro,Gomb...",Malvaceae,Abelmoschus,esculentus,,,,,,,,,,,
1,Abelmoschus manihot,"Antitussive,Boil,Cancer,Cancer(Stomach),Catarr...","Tororo-Aoi,Kastuli",Malvaceae,Abelmoschus,manihot,,,,,,,,,,,
2,Abelmoschus moschatus,"Aphrodisiac,Asthma,Bite(Snake),Boil,Cancer,Car...","Musk Mallow,Moskus,Muskus,Mushk Dana,Ambercice...",Malvaceae,Abelmoschus,moschatus,,,,,,,,,,,
3,Abies alba,"Bronchitis,Bruise,Calculus,Catarrh,Cough,Diure...","Abeto,Edeltanne,Beyaz Koknar,Silver Fir,Abeto ...",Pinaceae,Abies,alba,,,,,,,,,,,
4,Abies balsamea,"Burn,Cancer,Cold,Cough,Heart,Masticatory,Sore,...","Balsam,Canada,Balsam Fir,Fir",Pinaceae,Abies,balsamea,,,,,,,,,,,


In [106]:
# (rows, columns) of the activities frame as loaded, before any flag columns are added.
activities_df.shape

(2109, 1)

In [107]:
# Preview the first activity names.
activities_df.head()

Unnamed: 0,symptomName
0,Abcess
1,Abdomen
2,Abortifacient
3,Abortive
4,Abrasion


#### Import a second list of symptoms (GB weekly symptoms dataset):

In [65]:
# Load the GB weekly symptoms CSV; in this section only its column names are used.
gb_symptoms_df = pd.read_csv("./data/2022_country_weekly_2022_GB_weekly_symptoms_dataset.csv")

In [68]:
# Symptom names are encoded in the CSV header as "symptom:<name>" columns.
df_columns = list(gb_symptoms_df.columns)
print(df_columns[:15])

# Keep only the symptom columns, strip the "symptom:" marker and normalise the
# casing (first letter upper-case, remainder lower-case).
symptoms = [
    header.replace("symptom:", "").capitalize()
    for header in df_columns
    if "symptom:" in header
]

['country_region_code', 'country_region', 'sub_region_1', 'sub_region_1_code', 'sub_region_2', 'sub_region_2_code', 'place_id', 'date', 'symptom:Abdominal obesity', 'symptom:Abdominal pain', 'symptom:Acne', 'symptom:Actinic keratosis', 'symptom:Acute bronchitis', 'symptom:Adrenal crisis', 'symptom:Ageusia']


In [77]:
# Show the extracted symptom names (the whole list is displayed, not a preview).
symptoms

['Abdominal obesity',
 'Abdominal pain',
 'Acne',
 'Actinic keratosis',
 'Acute bronchitis',
 'Adrenal crisis',
 'Ageusia',
 'Alcoholism',
 'Allergic conjunctivitis',
 'Allergy',
 'Amblyopia',
 'Amenorrhea',
 'Amnesia',
 'Anal fissure',
 'Anaphylaxis',
 'Anemia',
 'Angina pectoris',
 'Angioedema',
 'Angular cheilitis',
 'Anosmia',
 'Anxiety',
 'Aphasia',
 'Aphonia',
 'Apnea',
 'Arthralgia',
 'Arthritis',
 'Ascites',
 'Asperger syndrome',
 'Asphyxia',
 'Asthma',
 'Astigmatism',
 'Ataxia',
 'Atheroma',
 'Attention deficit hyperactivity disorder',
 'Auditory hallucination',
 'Autoimmune disease',
 'Avoidant personality disorder',
 'Back pain',
 'Bacterial vaginosis',
 'Balance disorder',
 "Beau's lines",
 "Bell's palsy",
 'Biliary colic',
 'Binge eating',
 'Bleeding',
 'Bleeding on probing',
 'Blepharospasm',
 'Bloating',
 'Blood in stool',
 'Blurred vision',
 'Blushing',
 'Boil',
 'Bone fracture',
 'Bone tumor',
 'Bowel obstruction',
 'Bradycardia',
 'Braxton hicks contractions',
 'Break

#### Concatenate duke activities and symptoms:

In [120]:
# Build a one-column frame of symptom names and tag every row as
# "symptom, not activity".
symptoms_df = (
    pd.DataFrame(symptoms)
    .rename(columns={0: "symptomName"})
    .assign(is_symptom=1, is_activity=0)
)
symptoms_df.head()

Unnamed: 0,symptomName,is_symptom,is_activity
0,Abdominal obesity,1,0
1,Abdominal pain,1,0
2,Acne,1,0
3,Actinic keratosis,1,0
4,Acute bronchitis,1,0


In [121]:
# Tag every row of the activities frame as "activity, not symptom".
# The two flag columns are added to the existing frame in place.
activities_df["is_symptom"] = 0
activities_df["is_activity"] = 1
activities_df.head()

Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1


Stack both frames into one table; the goal is to end up with exactly one row per unique symptomName:

In [198]:
# Stack the activity rows on top of the symptom rows. ignore_index=True gives the
# result a fresh 0..n-1 index directly, so there is no need to reset the index and
# then drop the leftover "index" column afterwards.
print(len(activities_df))
print(len(symptoms_df))
activities_symptoms_df = pd.concat([activities_df, symptoms_df], axis=0, ignore_index=True)
# Sanity check: concatenation must not lose or add rows.
print("Should be:", len(activities_df) + len(symptoms_df))
print("Is:", len(activities_symptoms_df))
activities_symptoms_df.head()

2109
422
Should be: 2531
Is: 2531


Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1


However, the names are not all unique: every (symptomName, is_symptom, is_activity) combination is unique, but some symptomNames appear in both the activities list and the symptoms list:

In [199]:
# Distinct values per column: fewer distinct symptomNames than rows means some names occur more than once.
activities_symptoms_df.nunique()

symptomName    2404
is_symptom        2
is_activity       2
dtype: int64

In [200]:
# Inspect the combined name column (activity rows first, then symptom rows).
activities_symptoms_df["symptomName"]

0              Abcess
1             Abdomen
2       Abortifacient
3            Abortive
4            Abrasion
            ...      
2526        Xeroderma
2527       Xerostomia
2528             Yawn
2529    Hyperhidrosis
2530     Pancreatitis
Name: symptomName, Length: 2531, dtype: object

In [202]:
# duplicated() defaults to keep="first": only the second and later occurrences of a name are flagged.
# The truncated repr shows just the head and tail, so all-False here does not rule out duplicates.
activities_symptoms_df["symptomName"].duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2526    False
2527    False
2528    False
2529    False
2530    False
Name: symptomName, Length: 2531, dtype: bool

In [203]:
# keep="last" flags every occurrence of a name except the final one. Activity rows
# precede symptom rows in the combined frame, so for a name present in both lists
# this selects the activity row.
duplicate_activities = activities_symptoms_df["symptomName"].duplicated(keep="last")
filt_df = activities_symptoms_df.loc[duplicate_activities]
filt_df

Unnamed: 0,symptomName,is_symptom,is_activity
23,Acne,0,1
38,Alcoholism,0,1
43,Allergy,0,1
47,Amblyopia,0,1
49,Amenorrhea,0,1
...,...,...,...
1837,Wheeze,0,1
1996,Hypertrophy,0,1
1997,Hypoglycemia,0,1
2035,Purpura,0,1


In [204]:
# keep="first" flags every occurrence of a name except the earliest one. Symptom rows
# follow activity rows in the combined frame, so for a name present in both lists
# this selects the symptom row.
duplicate_symptoms = activities_symptoms_df["symptomName"].duplicated(keep="first")
filt_df = activities_symptoms_df.loc[duplicate_symptoms]
filt_df

Unnamed: 0,symptomName,is_symptom,is_activity
2111,Acne,1,0
2116,Alcoholism,1,0
2118,Allergy,1,0
2119,Amblyopia,1,0
2120,Amenorrhea,1,0
...,...,...,...
2517,Vertigo,1,0
2520,Vomiting,1,0
2521,Wart,1,0
2523,Weakness,1,0


In [208]:
# Index labels of the rows flagged by the keep="first" mask (the later occurrence of
# each repeated name). They are taken from the boolean mask itself rather than from
# filt_df, because filt_df is a scratch name that several cells overwrite, which
# would make this result depend on which cell ran last.
duplicate_indices = duplicate_symptoms[duplicate_symptoms].index
duplicate_indices

Int64Index([2111, 2116, 2118, 2119, 2120, 2121, 2124, 2129, 2133, 2134,
            ...
            2500, 2502, 2503, 2505, 2512, 2517, 2520, 2521, 2523, 2525],
           dtype='int64', length=127)

#### Setting is_symptom to 1 for activities that are also symptoms:

In [209]:
# A row needs both flags when it is an activity whose name also occurs in the
# symptom list. Testing membership directly states that intent and stays correct
# even if one of the source lists contained a repeated name, which a
# duplicated(keep="last") mask would mis-flag.
also_symptom = (
    activities_symptoms_df["is_activity"].eq(1)
    & activities_symptoms_df["symptomName"].isin(symptoms_df["symptomName"])
)
activities_symptoms_df.loc[also_symptom, "is_symptom"] = 1
activities_symptoms_df[also_symptom].head()

Unnamed: 0,symptomName,is_symptom,is_activity
23,Acne,1,1
38,Alcoholism,1,1
43,Allergy,1,1
47,Amblyopia,1,1
49,Amenorrhea,1,1


In [210]:
# Spot-check one of the updated rows. iloc is positional; it coincides with label 23
# here because the combined frame carries a 0..n-1 index at this point.
activities_symptoms_df.iloc[23]

symptomName    Acne
is_symptom        1
is_activity       1
Name: 23, dtype: object

In [211]:
# Preview only: without inplace=True, drop_duplicates returns a new frame and leaves
# activities_symptoms_df unchanged. The default keep="first" retains the earliest
# row for each symptomName.
activities_symptoms_df.drop_duplicates(subset="symptomName")

Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1
...,...,...,...
2526,Xeroderma,1,0
2527,Xerostomia,1,0
2528,Yawn,1,0
2529,Hyperhidrosis,1,0


In [212]:
# Apply the de-duplication to the frame itself: one row per symptomName, keeping the
# earliest occurrence (default keep="first"). Row labels are not renumbered here.
activities_symptoms_df.drop_duplicates(subset="symptomName", inplace=True)

In [213]:
# Preview the first rows of the frame after de-duplication.
activities_symptoms_df.head()

Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1


Verifying results:

In [214]:
# Select the rows that carry both flags, i.e. names recorded as an activity and
# as a symptom, so the flag update can be checked by eye.
filt = activities_symptoms_df["is_symptom"].eq(1) & activities_symptoms_df["is_activity"].eq(1)
filt_df = activities_symptoms_df.loc[filt]
filt_df

Unnamed: 0,symptomName,is_symptom,is_activity
23,Acne,1,1
38,Alcoholism,1,1
43,Allergy,1,1
47,Amblyopia,1,1
49,Amenorrhea,1,1
...,...,...,...
1837,Wheeze,1,1
1996,Hypertrophy,1,1
1997,Hypoglycemia,1,1
2035,Purpura,1,1


Resetting index:

In [225]:
# Renumber the rows 0..n-1 after the de-duplication left gaps in the index.
# drop=True discards the old index outright instead of materialising it as an
# "index" column that then has to be dropped again.
activities_symptoms_df.reset_index(drop=True, inplace=True)
activities_symptoms_df


Unnamed: 0,symptomName,is_symptom,is_activity
0,Abcess,0,1
1,Abdomen,0,1
2,Abortifacient,0,1
3,Abortive,0,1
4,Abrasion,0,1
...,...,...,...
2399,Xeroderma,1,0
2400,Xerostomia,1,0
2401,Yawn,1,0
2402,Hyperhidrosis,1,0


Export to CSV:

In [226]:
# Write the flagged name table to the working directory. to_csv writes the row index
# by default, so the file starts with an unnamed index column.
activities_symptoms_df.to_csv("./activities_symptoms_bool.csv")

In [95]:
# activities_symptoms_series = pd.Series(activities_symptoms_series)
# activities_symptoms_series.to_csv("./activities_symptoms.csv")