In [25]:
import pandas as pd

In [26]:
df = pd.read_csv('/content/drive/MyDrive/Raajitha_NLP_CourseCapstone/extracted_umls_entities.csv')
df.head()

Unnamed: 0,Abstract_ID,CUI,Name,Definition,TUIs,Aliases
0,0,C0400966,Non-alcoholic Fatty Liver Disease,A term referring to fatty replacement of the h...,['T047'],"['Nonalcoholic fatty liver disease', 'Fatty Li..."
1,0,C5204628,Prevalent,"Widespread, especially in a particular area or...",['T080'],[]
2,0,C0348080,Condition,"A state of being, such as a state of health.",['T080'],"['Condition (attribute)', 'Condition']"
3,0,C1272688,status - In progress,,['T169'],"['progressed', 'progresses', 'in progress', 'I..."
4,0,C0016059,Fibrosis,Any pathological condition where fibrous conne...,['T046'],"['Desmoplasia', 'Fibrous replacement', 'Fibros..."


Cleaning Name, Aliases and Definition columns

In [27]:
import ast

def clean_aliases(aliases):
    try:
        alias_list = ast.literal_eval(aliases)  # Convert string to list
        if isinstance(alias_list, list):
            return "; ".join([alias.strip() for alias in alias_list])  # Join aliases with semicolon
        return aliases
    except:
        return ""

# Apply the cleaning function
df['Aliases'] = df['Aliases'].apply(clean_aliases)


In [28]:
df['Aliases'].head()

Unnamed: 0,Aliases
0,"Nonalcoholic fatty liver disease; Fatty Liver,..."
1,
2,Condition (attribute); Condition
3,progressed; progresses; in progress; In progre...
4,Desmoplasia; Fibrous replacement; Fibrosis (qu...


In [29]:
# Remove rows with missing or empty definitions
df = df[df['Definition'].notna() & df['Definition'].str.strip().ne("")]


In [30]:
df['Definition'].head()

Unnamed: 0,Definition
0,A term referring to fatty replacement of the h...
1,"Widespread, especially in a particular area or..."
2,"A state of being, such as a state of health."
4,Any pathological condition where fibrous conne...
5,The probability that an event will occur. It e...


In [31]:
df.shape

(7983, 6)

In [32]:
df.to_csv('/content/drive/MyDrive/Raajitha_NLP_CourseCapstone/cleaned_umls_entities.csv', index=False)

In [33]:
def generate_questions(row):
    questions = []
    entity = row['Name']
    aliases = row['Aliases'].split("; ")  # Split aliases by semicolon delimiter

    # Core templates for the entity
    core_questions = [
        f"What is {entity}?",
        f"Explain {entity}.",
        f"What does {entity} mean?",
        f"Can you explain {entity}?",
        f"Give a definition for {entity}.",
    ]
    questions.extend(core_questions)

    # Generate questions for aliases
    for alias in aliases:
        alias_questions = [
            f"What is {alias}?",
            f"Explain {alias}.",
            f"What does {alias} mean?",
            f"Can you explain {alias}?",
            f"Give a definition for {alias}.",
        ]
        questions.extend(alias_questions)

    # Return all questions for this row
    return questions


In [34]:
# Initialize a list to store the results
question_data = []

for _, row in df.iterrows():
    # Skip rows with missing aliases or definitions
    if pd.isna(row['Aliases']) or pd.isna(row['Definition']):
        continue

    # Generate questions
    questions = generate_questions(row)

    # Append each question with its corresponding answer (definition)
    for question in questions:
        question_data.append({
            "Entity": row['Name'],
            "Question": question,
            "Answer": row['Definition']
        })

# Create a new DataFrame with the generated questions
questions_df = pd.DataFrame(question_data)



In [35]:
questions_df.head()

Unnamed: 0,Entity,Question,Answer
0,Non-alcoholic Fatty Liver Disease,What is Non-alcoholic Fatty Liver Disease?,A term referring to fatty replacement of the h...
1,Non-alcoholic Fatty Liver Disease,Explain Non-alcoholic Fatty Liver Disease.,A term referring to fatty replacement of the h...
2,Non-alcoholic Fatty Liver Disease,What does Non-alcoholic Fatty Liver Disease mean?,A term referring to fatty replacement of the h...
3,Non-alcoholic Fatty Liver Disease,Can you explain Non-alcoholic Fatty Liver Dise...,A term referring to fatty replacement of the h...
4,Non-alcoholic Fatty Liver Disease,Give a definition for Non-alcoholic Fatty Live...,A term referring to fatty replacement of the h...


In [36]:
questions_df.shape

(215535, 3)

In [37]:
questions_df.to_csv('/content/drive/MyDrive/Raajitha_NLP_CourseCapstone/questionsdf.csv', index=False)