### Preprocessing for Emtree


In [35]:
# Imports
import pandas as pd
import numpy as np


In [36]:
# %pip install pandas

In [37]:
# Read the Emtree CSV into a DataFrame.
emtree_csv_path = "/home/dpapadopoulos/dsls-papadopoulos-ambiguity-scoring-thesis/Data/Emtree_RMC.csv"
df_emtree = pd.read_csv(emtree_csv_path)

In [38]:
# For every concept, collect the distinct `superLabel`, `super` and `label`
# values into lists (one row per concept in each temporary frame).
emtree_by_concept = df_emtree.groupby('concept')
temp_df1 = (
    emtree_by_concept['superLabel']
    .apply(lambda values: list(values.unique()))
    .reset_index()
    .rename(columns={'superLabel': 'superLabel_list'})
)
temp_df2 = (
    emtree_by_concept['super']
    .apply(lambda values: list(values.unique()))
    .reset_index()
    .rename(columns={'super': 'super_list'})
)
temp_df3 = (
    emtree_by_concept['label']
    .apply(lambda values: list(values.unique()))
    .reset_index()
    .rename(columns={'label': 'label_list'})
)

# Attach the three list columns to every original row via left joins on `concept`.
df_emtree_proc = df_emtree
for concept_lists in (temp_df1, temp_df2, temp_df3):
    df_emtree_proc = pd.merge(df_emtree_proc, concept_lists, on='concept', how='left')

In [39]:
# Concepts whose rows carry both ambiguous=True and ambiguous=False.
# The resulting list is used further down, when the unambiguous rows of
# these concepts are dropped.
has_both_flags = df_emtree_proc.groupby('concept')['ambiguous'].apply(
    lambda flags: set(flags) == {True, False}
)
ambiguous_concepts = has_both_flags[has_both_flags].index.tolist()

# Word count column: number of whitespace-separated tokens in each label
# (the label is stringified first, so non-string values still get a count).
df_emtree_proc['word_count'] = df_emtree_proc['label'].map(
    lambda text: len(str(text).split())
)

# Outlier removal: labels with more words than the 99th percentile are dropped.
upper_bound = df_emtree_proc['word_count'].quantile(0.99)

# Rows above the bound, and the subset of them flagged as ambiguous
# (kept around for inspection only).
is_outlier = df_emtree_proc['word_count'] > upper_bound
outliers = df_emtree_proc[is_outlier]
ambiguous_outliers = outliers[outliers['ambiguous'].eq(True)]

# Keep only the rows at or below the bound.
within_bound = df_emtree_proc['word_count'] <= upper_bound
df_emtree_proc = df_emtree_proc[within_bound]


In [40]:
# ***For concepts having both ambiguous and unambiguous labels we keep only the ambiguous rows, as the data is skewed.***
# Rows to discard: the unambiguous rows of concepts listed in `ambiguous_concepts`.
temp_df = df_emtree_proc[(df_emtree_proc['concept'].isin(ambiguous_concepts)) & (df_emtree_proc['ambiguous'] == False)]

# Remove these rows from df_emtree_proc
df_emtree_proc = df_emtree_proc.drop(temp_df.index)

# %%
# Keep only one label for each concept. For now this is the first one, but later maybe the preferred label in the taxonomy.
# The sort has to be stable: pandas' default quicksort may reorder rows that share
# the same concept, in which case keep='first' would retain an arbitrary label
# instead of the first one in the current row order.
df_emtree_proc = df_emtree_proc.sort_values('concept', kind='mergesort')
df_emtree_proc = df_emtree_proc.drop_duplicates(subset='concept', keep='first').drop(['super','superLabel'],axis=1)

# %% [markdown]
# ***For ambiguous concepts we remove the qualifier - the explanation inside the parentheses - and keep it in a separate column***

# %%
# Move the qualifier of ambiguous terms into its own column.
# Mask for rows where 'ambiguous' is True
mask = df_emtree_proc['ambiguous'] == True

# Text inside the first pair of parentheses goes to 'qualifier' (NaN when the label has none)
df_emtree_proc.loc[mask, 'qualifier'] = df_emtree_proc.loc[mask, 'label'].str.extract(r'\((.*?)\)', expand=False)

# Strip every parenthesised part, together with the whitespace around it, from 'label'
# NOTE(review): a qualifier in the middle of a label is removed along with the
# spaces on both sides, which joins the neighbouring words - confirm that
# qualifiers only ever appear at the end of a label.
df_emtree_proc.loc[mask, 'label'] = df_emtree_proc.loc[mask, 'label'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)

In [41]:
# Select the output columns in their final order.
emtree_output_columns = [
    'concept',
    'label',
    'ambiguous',
    'qualifier',
    'label_list',
    'distance',
    'super_list',
    'superLabel_list',
    'word_count',
]
df_emtree_proc = df_emtree_proc[emtree_output_columns]

### Save dataframe

In [42]:
#Save dataframe to csv for pipeline
# df_emtree_proc.to_csv("/Users/papadopoulosd/Documents/Ambiguity_scoring_project/Emtree_RMC_processed.csv",header=True)

## Preprocessing for OmniScience

### Data Loading

In [43]:
# Read the OmniScience CSV into a DataFrame.
omniscience_csv_path = "/home/dpapadopoulos/dsls-papadopoulos-ambiguity-scoring-thesis/Data/OmniScience.csv"
df_omni = pd.read_csv(omniscience_csv_path)

In [44]:
# For every concept, collect the distinct `superLabel`, `super` and `label`
# values into lists (one row per concept in each temporary frame).
omni_by_concept = df_omni.groupby('concept')
temp_df1 = (
    omni_by_concept['superLabel']
    .apply(lambda values: list(values.unique()))
    .reset_index()
    .rename(columns={'superLabel': 'superLabel_list'})
)
temp_df2 = (
    omni_by_concept['super']
    .apply(lambda values: list(values.unique()))
    .reset_index()
    .rename(columns={'super': 'super_list'})
)
temp_df3 = (
    omni_by_concept['label']
    .apply(lambda values: list(values.unique()))
    .reset_index()
    .rename(columns={'label': 'label_list'})
)

# Attach the three list columns to every original row via left joins on `concept`.
df_omni_proc = df_omni
for concept_lists in (temp_df1, temp_df2, temp_df3):
    df_omni_proc = pd.merge(df_omni_proc, concept_lists, on='concept', how='left')

# Word count column: number of whitespace-separated tokens in each label
# (the label is stringified first, so non-string values still get a count).
df_omni_proc['word_count'] = df_omni_proc['label'].map(
    lambda text: len(str(text).split())
)

# Outlier removal: labels with more words than the 99th percentile are dropped.
upper_bound = df_omni_proc['word_count'].quantile(0.99)

# Rows above the bound.
is_outlier = df_omni_proc['word_count'] > upper_bound
outliers = df_omni_proc[is_outlier]
print("Number of outliers:", len(outliers))

# How many of the outliers are flagged as ambiguous.
ambiguous_outliers = outliers[outliers['ambiguous'].eq(True)]
print("Number of ambiguous outliers:", len(ambiguous_outliers))

# Keep only the rows at or below the bound.
within_bound = df_omni_proc['word_count'] <= upper_bound
df_omni_proc = df_omni_proc[within_bound]

print("Number of records after removing outliers:", len(df_omni_proc))


Number of outliers: 620
Number of ambiguous outliers: 16
Number of records after removing outliers: 97677


***For ambiguous concepts we remove the qualifier - the explanation inside the parentheses - and keep it in a separate column***

In [45]:
# Move the qualifier of ambiguous terms out of the label and into its own column.
# Only rows flagged ambiguous are touched.
mask = df_omni_proc['ambiguous'].eq(True)
ambiguous_labels = df_omni_proc.loc[mask, 'label']

# 'qualifier' receives the text inside the first pair of parentheses
# (NaN when the label has no parentheses).
df_omni_proc.loc[mask, 'qualifier'] = ambiguous_labels.str.extract(
    r'\((.*?)\)', expand=False
)

# 'label' loses every parenthesised part together with the whitespace around it.
df_omni_proc.loc[mask, 'label'] = ambiguous_labels.str.replace(
    r'\s*\(.*?\)\s*', '', regex=True
)


In [46]:
# For OmniScience we check whether a concept occurs multiple times once the
# qualifier has been stripped from the label.
# Rows flagged as ambiguous.
ambiguous_df = df_omni_proc[df_omni_proc['ambiguous'].eq(True)]

# Number of ambiguous rows per (concept, label) pair; keep the pairs seen more than once.
pair_counts = (
    ambiguous_df
    .groupby(['concept', 'label'])
    .size()
    .reset_index(name='count')
)
ambiguous_duplicates = pair_counts[pair_counts['count'] > 1]


In [47]:

# Unique (concept, label) pairs among the ambiguous rows
ambiguous_pairs = set(zip(ambiguous_df['concept'], ambiguous_df['label']))

# Unique (concept, label) pairs in the whole DataFrame
all_pairs = set(zip(df_omni_proc['concept'], df_omni_proc['label']))

# Index the pairs by label so that "does this label also occur under another
# concept?" is a single lookup instead of a scan over every unique concept
# for every ambiguous pair.
concepts_by_label = {}
for pair_concept, pair_label in all_pairs:
    concepts_by_label.setdefault(pair_label, set()).add(pair_concept)

# Ambiguous pairs whose label also appears under a different concept
ambiguous_duplicates = [
    (concept, label)
    for concept, label in ambiguous_pairs
    if any(other != concept for other in concepts_by_label.get(label, ()))
]

# Flag every row whose (concept, label) pair is one of those duplicates.
# Membership is tested against a set; the list above is kept for inspection.
ambiguous_duplicate_pairs = set(ambiguous_duplicates)
df_omni_proc['duplicate'] = [
    pair in ambiguous_duplicate_pairs
    for pair in zip(df_omni_proc['concept'], df_omni_proc['label'])
]


In [48]:
# Rows that are flagged both 'ambiguous' and 'duplicate'.
is_ambiguous_duplicate = df_omni_proc['ambiguous'].eq(True) & df_omni_proc['duplicate'].eq(True)
ambiguous_and_duplicate_df = df_omni_proc[is_ambiguous_duplicate]

# Report how many distinct concepts those rows cover.
unique_concepts_count = ambiguous_and_duplicate_df['concept'].nunique()
print(f"The number of unique concepts that are both ambiguous and duplicate is: {unique_concepts_count}")

The number of unique concepts that are both ambiguous and duplicate is: 645


In [49]:
# Reset 'ambiguous' to False for concepts whose label does not appear under a different concept.
# Concepts that have at least one row that is both ambiguous and duplicate.
ambiguous_and_duplicate_concepts = ambiguous_and_duplicate_df['concept'].unique()

# Every row of any other concept gets ambiguous=False.
keeps_ambiguous_flag = df_omni_proc['concept'].isin(ambiguous_and_duplicate_concepts)
df_omni_proc.loc[~keeps_ambiguous_flag, 'ambiguous'] = False



In [50]:
# For concepts having both ambiguous and unambiguous labels we keep only the ambiguous rows, as the data is skewed.
# The list of such concepts is computed here from the OmniScience frame itself.
# The `ambiguous_concepts` list that exists at this point was built from the
# Emtree frame, so using it here would filter on the wrong taxonomy's concepts.
omni_has_both_flags = df_omni_proc.groupby('concept')['ambiguous'].apply(
    lambda flags: set(flags) == {True, False}
)
# astype(bool) keeps the boolean indexing valid even when the grouped result is empty.
omni_ambiguous_concepts = omni_has_both_flags[omni_has_both_flags.astype(bool)].index.tolist()

# Unambiguous rows of those concepts
temp_df = df_omni_proc[(df_omni_proc['concept'].isin(omni_ambiguous_concepts)) & (df_omni_proc['ambiguous'] == False)]

# Remove these rows from df_omni_proc
df_omni_proc = df_omni_proc.drop(temp_df.index)

In [51]:
# Keep only one label for each concept. For now this is the first one, but later maybe the preferred label in the taxonomy.
# The sort has to be stable: pandas' default quicksort may reorder rows that share
# the same concept, in which case keep='first' would retain an arbitrary label
# instead of the first one in the current row order.
df_omni_proc = df_omni_proc.sort_values('concept', kind='mergesort')
df_omni_proc = df_omni_proc.drop_duplicates(subset='concept', keep='first').drop(['super','superLabel'],axis=1)

In [52]:
# Select the output columns in their final order.
omni_output_columns = [
    'concept',
    'label',
    'ambiguous',
    'qualifier',
    'label_list',
    'distance',
    'super_list',
    'superLabel_list',
    'word_count',
]
df_omni_proc = df_omni_proc[omni_output_columns]

### Save dataframe

In [53]:
#Save dataframe to csv for pipeline
# df_omni_proc.to_csv("/Users/papadopoulosd/Documents/Ambiguity_scoring_project/Omniscience_processed.csv",header=True)