In [2]:
import requests
from bs4 import BeautifulSoup



In [70]:
def extract_ingredients_from_letter(letter):
    url = f"https://www.drugs.com/alpha/{letter}.html"
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful

    soup = BeautifulSoup(response.text, 'html.parser')
    ingredient_list = []
    drug_link = []

    # Find all elements with the class "ddc-list-column-2"
    columns = soup.find_all('ul', class_='ddc-list-column-2')

    for column in columns:
        # Extract the text from each list item in the column
        ingredients = column.find_all('li')
        for ingredient in ingredients:
            ingredient_list.append(ingredient.get_text(strip=True))
            link = ingredient.find('a')
            link_href = link.get('href', '')
            drug_link.append(link_href)

    return [ingredient_list,drug_link]

def extract_all_ingredients():
    all_ingredients = []
    all_drug_links = []
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        print(f"Extracting ingredients for letter: {letter}")
        ingredients = extract_ingredients_from_letter(letter)
        all_ingredients.extend(ingredients[0])
        all_drug_links.extend(ingredients[1])

    return [all_ingredients,all_drug_links]

# Extract all ingredients from A-Z
ingredients = extract_all_ingredients()


Extracting ingredients for letter: a
Extracting ingredients for letter: b
Extracting ingredients for letter: c
Extracting ingredients for letter: d
Extracting ingredients for letter: e
Extracting ingredients for letter: f
Extracting ingredients for letter: g
Extracting ingredients for letter: h
Extracting ingredients for letter: i
Extracting ingredients for letter: j
Extracting ingredients for letter: k
Extracting ingredients for letter: l
Extracting ingredients for letter: m
Extracting ingredients for letter: n
Extracting ingredients for letter: o
Extracting ingredients for letter: p
Extracting ingredients for letter: q
Extracting ingredients for letter: r
Extracting ingredients for letter: s
Extracting ingredients for letter: t
Extracting ingredients for letter: u
Extracting ingredients for letter: v
Extracting ingredients for letter: w
Extracting ingredients for letter: x
Extracting ingredients for letter: y
Extracting ingredients for letter: z


In [78]:
ingredients

[['Abilify',
  'Abilify Asimtufii',
  'Abilify Maintena',
  'Abiraterone',
  'Acetaminophen',
  'Acetylcysteine',
  'Actemra',
  'Actos',
  'Acyclovir',
  'Adderall',
  'Adderall XR',
  'Advair Diskus',
  'Advil',
  'Afinitor',
  'Agamree',
  'Aimovig',
  'Ajovy',
  'Albuterol',
  'Aldactone',
  'Alecensa',
  'Alendronate',
  'Aleve',
  'Alfuzosin',
  'Allegra',
  'Allopurinol',
  'Alprazolam',
  'Alprolix',
  'Alunbrig',
  'Amantadine',
  'Ambien',
  'Ambroxol',
  'Amiodarone',
  'Amitriptyline',
  'Amlodipine',
  'Amoxicillin',
  'Amoxicillin and Clavulanate',
  'Anastrozole',
  'AndroGel',
  'Annovera',
  'Apalutamide',
  'Apixaban',
  'Apokyn',
  'Apriso',
  'Aptiom',
  'Aricept',
  'Arikayce',
  'Arimidex',
  'Aripiprazole',
  'Aristada',
  'Aromasin',
  'Ascorbic acid',
  'Aspirin',
  'Atarax',
  'Atenolol',
  'Ativan',
  'Atogepant',
  'Atomoxetine',
  'Atorvastatin',
  'Aubagio',
  'Augmentin',
  'Austedo',
  'Auvelity',
  'Avonex',
  'Avsola',
  'Azathioprine',
  'Azelastine',

In [72]:
import pandas as pd
df = pd.DataFrame(ingredients)

In [76]:
df

Unnamed: 0,0,1
0,Abilify,/abilify.html
1,Abilify Asimtufii,/abilify-asimtufii.html
2,Abilify Maintena,/abilify-maintena.html
3,Abiraterone,/abiraterone.html
4,Acetaminophen,/acetaminophen.html
...,...,...
1211,Zyprexa,/zyprexa.html
1212,Zyrtec,/zyrtec.html
1213,Zytiga,/zytiga.html
1214,Zyvox,/zyvox.html


In [75]:
df = df.T


In [77]:
df.to_csv('drug_list.csv')

In [56]:
def extract_section_content(start_tag):
    """
    Extracts content until the next <h2> or <div>.
    
    :param start_tag: BeautifulSoup tag object from which to start.
    :return: String of concatenated text content.
    """
    content = []
    if not start_tag:  # Check if the start tag exists
        return 'Section not found.'
    current_tag = start_tag.find_next()

    # Collect paragraphs until the next <h2> or <div>
    while current_tag and current_tag.name not in ['h2', 'div']:
        if current_tag.name == 'p':
            content.append(current_tag.get_text(separator=' ', strip=True))
        current_tag = current_tag.find_next()
    
    return ' '.join(content)

In [79]:
def scrape_ingredient_data(ingredient,link):
    """
    Scrapes 'Uses' and 'Warnings' content for a given ingredient from drugs.com.
    
    :param ingredient: Ingredient name as a string.
    :return: Dictionary with 'Uses' and 'Warnings' content.
    """
    url = f'https://www.drugs.com/{link}'
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the "Uses" section
    uses_section = soup.find('h2', id='uses')
    uses_content = extract_section_content(uses_section) if uses_section else 'No uses section found.'

    # Extract the "Warnings" section
    warnings_section = soup.find('h2', id='warnings')
    warnings_content = extract_section_content(warnings_section) if warnings_section else 'No warnings section found.'
    
    before_taking_section = soup.find('h2', id='before-taking')  # Find the 'Warnings' header
    before_taking_content = extract_section_content(before_taking_section) if before_taking_section else 'No before section found.'
    
    side_effects_section = soup.find('h2',id="side-effects")
    side_effects_content = extract_section_content(side_effects_section) if side_effects_section else 'No side effects section found'

    return {
        'ingredient': ingredient,
        'uses': uses_content,
        'warnings': warnings_content,
        'before_taking':before_taking_content,
        'Side Effects':side_effects_content
    }

In [101]:
ingredients_data2 = []
for i in range(1187,1215):
    print(ingredients[0][i])
    data = scrape_ingredient_data(ingredients[0][i],ingredients[1][i])
    if data:
        ingredients_data2.append(data)

Zinc gluconate
Zinc oxide topical
Zinc sulfate
Ziprasidone
Zithromax
Zocor
Zofran
Zofran ODT
Zoladex
Zoledronic acid
Zolmitriptan
Zoloft
Zolpidem
Zometa
Zomig
Zonegran
Zonisamide
Zopiclone
Zoryve
Zosyn
Zovirax
Zydelig
Zykadia
Zyloprim
Zyprexa
Zyrtec
Zytiga
Zyvox


In [102]:
ingredients_data.extend(ingredients_data2)

In [104]:
len(ingredients_data)

1216

In [105]:
drug_content = pd.DataFrame(ingredients_data)

In [106]:
drug_content.to_csv('drug_contents.csv')

In [41]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = 'https://www.drugs.com/Allopurinol.html'  # Replace with the actual URL

# Fetch the content of the webpage
response = requests.get(url)
response.raise_for_status()  # Check for request errors

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

def extract_section_content(start_tag):
    """
    Extracts content until the next <h2> or <div>.
    
    :param start_tag: BeautifulSoup tag object from which to start.
    :return: String of concatenated text content.
    """
    content = []
    current_tag = start_tag.find_next()

    # Collect paragraphs until the next <h2> or <div>
    while current_tag and current_tag.name not in ['h2', 'div']:
        if current_tag.name == 'p':
            content.append(current_tag.get_text(separator=' ', strip=True))
        current_tag = current_tag.find_next()
    
    return ' '.join(content)

# Extract the "Uses" section
uses_section = soup.find('h2', id='uses')  # Find the 'Uses' header
uses_content = extract_section_content(uses_section)

# Extract the "Warnings" section
warnings_section = soup.find('h2', id='warnings')  # Find the 'Warnings' header
warnings_content = extract_section_content(warnings_section)

# Print the extracted content
print("Uses:")
print(uses_content)
print("\nWarnings:")
print(warnings_content)


Uses:
Allopurinol belongs to a class of medications called xanthine oxidase inhibitors. It works by reducing the production of uric acid in the body. High levels of uric acid may cause gout attacks or kidney stones. Allopurinol is used to treat gout (a type of arthritis in which uric acid, a naturally occurring substance in the body, builds up in the joints and causes sudden attacks of redness, swelling, pain, and heat in one or more joints). Allopurinol is used to prevent gout attacks, not to treat them once they occur. Allopurinol does not need to be stopped during an acute flare. Allopurinol is also used to treat high levels of uric acid that builds up in the blood as tumors break down in people with certain types of cancer who are being treated with chemotherapy medications. It is also used to treat kidney stones that have come back in people who have high levels of uric acid in their urine.

You should not use this medicine if you have ever had an allergic reaction to allopurinol.

In [11]:
import pandas as pd

In [12]:
medical_ingredients_qa = pd.read_csv('/Users/rujul/Documents/Projects/OCR/drug_contents.csv')


In [17]:
medical_ingredients_qa

Unnamed: 0.1,Unnamed: 0,ingredient,uses,warnings,before_taking,Side Effects
0,0,Abilify,Abilify is an antipsychotic medication. It wor...,Abilify is not approved for use in older adult...,You should not take Abilify if you are allergi...,Get emergency medical help if you have signs o...
1,1,Abilify Asimtufii,Abilify Asimtufii is an atypical antipsychotic...,Abilify Asimtufii is not approved for use in o...,You should not take this medicine if you are a...,Get emergency medical help if you have signs o...
2,2,Abilify Maintena,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,You should not be treated with Abilify Mainten...,Get emergency medical help if you have signs o...
3,3,Abiraterone,Abiraterone works by reducing androgen product...,Abiraterone tablets should not be handled by a...,Abiraterone is not for use by women or childre...,Get emergency medical help if you have signs o...
4,4,Acetaminophen,Acetaminophen is a pain reliever and a fever r...,You should not use this medication if you have...,You should not take acetaminophen if you are a...,Get emergency medical help if you have signs o...
...,...,...,...,...,...,...
1211,1211,Zyloprim,Allopurinol is used to prevent or lower high u...,It is very important that your doctor check yo...,"In deciding to use a medicine, the risks of ta...","Along with its needed effects, a medicine may ..."
1212,1212,Zyprexa,Zyprexa (olanzapine) is an atypical antipsycho...,"Zyprexa may cause serious side effects, includ...",Zyprexa may not be right for you. Before start...,Zyprexa may cause serious side effects such as...
1213,1213,Zyrtec,Zyrtec is used in adults and children to treat...,Before using Zyrtec tell your doctor about all...,You should not use Zyrtec if you are allergic ...,Get emergency medical help if you have signs o...
1214,1214,Zytiga,Zytiga works by reducing androgen production i...,You should not use Zytiga if you have severe l...,You should not use Zytiga if you are allergic ...,Get emergency medical help if you have signs o...


In [None]:
medical_ingredients_qa = medical_ingredients_qa.to_dict(orient='records')

In [19]:
# Initialize the dictionary
qa_dict = {
    "root": {
        "questions": []
    }
}

# Assuming medical_ingredients_qa is a list of dictionaries with keys: 'ingredient', 'uses', 'warnings', 'before_taking', 'Side Effects'
for i in range(len(medical_ingredients_qa)):
    # Creating the questions and answers for each ingredient
    new_text = [
        {
            "question": "What is " + medical_ingredients_qa[i]['ingredient'] + " used for?",
            "answer": medical_ingredients_qa[i]['uses']
        },
        {
            "question": "What are the warnings for " + medical_ingredients_qa[i]['ingredient'] + "?",
            "answer": medical_ingredients_qa[i]['warnings']
        },
        {
            "question": "What should be known before taking " + medical_ingredients_qa[i]['ingredient'] + "?",
            "answer": medical_ingredients_qa[i]['before_taking']
        },
        {
            "question": "What are the side effects of " + medical_ingredients_qa[i]['ingredient'] + "?",
            "answer": medical_ingredients_qa[i]['Side Effects']
        }
    ]
    
    # Extending the 'questions' list with the new questions
    qa_dict["root"]["questions"].extend(new_text)

# Printing the updated dictionary
print(qa_dict)




In [21]:
qa_dict["root"]["questions"][2000]

{'question': 'What is Jynarque used for?',
 'answer': 'Jynarque is used to slow the decline in kidney function in adults who have autosomal dominant polycystic kidney disease . Jynarque is available only under a special program. You must be registered in the program and understand the risks and benefits of  Jynarque. Samsca is used to treat adults with hyponatremia (low levels of sodium in your blood) in people with heart failure , and certain hormonal imbalances. Jynarque may also be used for purposes not listed in this medication guide.'}

In [8]:
print(type(medical_ingredients_qa))
print(medical_ingredients_qa[:5])  # Print the first 5 items to verify the structure


<class 'pandas.core.frame.DataFrame'>
   Unnamed: 0         ingredient  \
0           0            Abilify   
1           1  Abilify Asimtufii   
2           2   Abilify Maintena   
3           3        Abiraterone   
4           4      Acetaminophen   

                                                uses  \
0  Abilify is an antipsychotic medication. It wor...   
1  Abilify Asimtufii is an atypical antipsychotic...   
2  Abilify Maintena (aripiprazole) extended-relea...   
3  Abiraterone works by reducing androgen product...   
4  Acetaminophen is a pain reliever and a fever r...   

0  Abilify is not approved for use in older adult...   
1  Abilify Asimtufii is not approved for use in o...   
2  Abilify Maintena is not approved for use in ol...   
3  Abiraterone tablets should not be handled by a...   
4  You should not use this medication if you have...   

                                       before_taking  \
0  You should not take Abilify if you are allergi...   
1  You should n

In [18]:

medical_ingredients_qa = medical_ingredients_qa.to_dict(orient='records')

In [22]:
import csv

# File path for the CSV
csv_file_path = 'questions_answers.csv'

# Flatten the dictionary into rows
rows = []
for qa in qa_dict["root"]["questions"]:
    rows.append([qa["question"], qa["answer"]])

# Write to CSV
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["Question", "Answer"])
    # Write the data rows
    writer.writerows(rows)

print(f"Dictionary has been saved to {csv_file_path}")


Dictionary has been saved to questions_answers.csv
