In [1]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to scrape FAQs from a given URL
def scrape_faq(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            faq_section = soup.find(id='faq')
            if faq_section:
                faqs = {}
                faq_items = faq_section.find_all(class_='DrugPane__content___3-yrB')
                for faq_item in faq_items:
                    question = faq_item.find(class_='Faqs__ques___1iPB9').text.strip()
                    answer = faq_item.find(class_='Faqs__ans___1uuIW').text.strip()
                    faqs[question] = answer
                return faqs
            else:
                return None
        else:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to read URLs from CSV file and scrape FAQs
def scrape_from_csv(csv_file):
    faq_dict = {}
    with open(csv_file, 'r', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            url = row['URL']
            print(f"Scraping FAQs from {url}...")
            faqs = scrape_faq(url)
            if faqs:
                faq_dict[url] = faqs
    return faq_dict

# Example usage
if __name__ == "__main__":
    csv_file = 'D:\\ThinkByte_project\Medibuddy_project\data\med_urls.csv'  # Change this to your CSV file name
    faq_data = scrape_from_csv(csv_file)
    print("FAQs extracted:")
    for url, faqs in faq_data.items():
        print(f"\nURL: {url}")
        for question, answer in faqs.items():
            print(f"Question: {question}")
            print(f"Answer: {answer}\n")


KeyError: 'URL'

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to scrape webpage content from a given URL
def scrape_webpage(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        else:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to read URLs from CSV file and scrape webpages
def scrape_from_csv(csv_file):
    webpage_data = {}
    with open(csv_file, 'r', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            url = row['URL']
            print(f"Scraping webpage content from {url}...")
            soup = scrape_webpage(url)
            if soup:
                webpage_data[url] = soup
    return webpage_data

# Example usage
if __name__ == "__main__":
    csv_file = 'D:\\ThinkByte_project\Medibuddy_project\data\med_urls.csv'  # Change this to your CSV file name
    webpage_data = scrape_from_csv(csv_file)
    print("Webpage content extracted:")
    for url, soup in webpage_data.items():
        print(f"\nURL: {url}")
        # Here you can process the BeautifulSoup object 'soup' as needed
        # For example, you can find specific elements, extract text, etc.


Scraping webpage content from https://www.1mg.com/drugs/nikoran-5-tablet-20263...
Scraping webpage content from https://www.1mg.com/drugs/benadryl-syrup-146352...
Scraping webpage content from https://www.1mg.com/drugs/benadryl-syrup-114690...
Webpage content extracted:

URL: https://www.1mg.com/drugs/nikoran-5-tablet-20263

URL: https://www.1mg.com/drugs/benadryl-syrup-146352

URL: https://www.1mg.com/drugs/benadryl-syrup-114690


In [8]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to scrape webpage content
def scrape_webpage(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            product_name = soup.find('h1').text.strip()
            product_intro = soup.find(class_='product-introduction').text.strip()
            usages = soup.find(class_='usages').text.strip()
            benefits = soup.find(class_='benefits').text.strip()
            side_effects = soup.find(class_='side-effects').text.strip()
            how_to_use = soup.find(class_='how-to-use').text.strip()
            how_it_works = soup.find(class_='how-it-works').text.strip()
            safety_advice = soup.find(class_='safety-advice').text.strip()
            forget_to_take = soup.find(class_='forget-to-take').text.strip()
            
            faqs = {}
            faq_section = soup.find(id='faq')
            if faq_section:
                faq_items = faq_section.find_all(class_='Faqs__item___2T2Fq')
                for faq_item in faq_items:
                    question = faq_item.find(class_='Faqs__ques___1iPB9').text.strip()
                    answer = faq_item.find(class_='Faqs__ans___1uuIW').text.strip()
                    faqs[question] = answer

            return {
                'PRODUCT NAME': product_name,
                'PRODUCT INTRODUCTION': product_intro,
                'USAGES': usages,
                'BENEFITS': benefits,
                'SIDE EFFECTS': side_effects,
                'HOW TO USE': how_to_use,
                'HOW IT WORKS': how_it_works,
                'SAFETY ADVICE': safety_advice,
                'WHAT IF YOU FORGET TO TAKE': forget_to_take,
                'FAQs': faqs
            }
            print(response).text()
        else:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to read URLs from CSV file and scrape webpages
def scrape_from_csv(csv_file, output_file):
    with open(csv_file, 'r', newline='') as file:
        with open(output_file, 'w', newline='', encoding='utf-8') as output_csv:
            fieldnames = ['PRODUCT NAME', 'PRODUCT INTRODUCTION', 'USAGES', 'BENEFITS', 'SIDE EFFECTS',
                          'HOW TO USE', 'HOW IT WORKS', 'SAFETY ADVICE', 'WHAT IF YOU FORGET TO TAKE', 'FAQs']
            writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
            writer.writeheader()
            
            reader = csv.DictReader(file)
            for row in reader:
                url = row['source1']
                print(f"Scraping {url}...")
                data = scrape_webpage(url)
                if data:
                    writer.writerow(data)
                else:
                    writer.writerow({key: '' for key in fieldnames})

# Example usage
if __name__ == "__main__":
    input_csv = 'D:\\ThinkByte_project\Medibuddy_project\data\med_urls.csv'  # CSV file containing URLs
    output_csv = 'extracted_data.csv'  # Output CSV file to save extracted data
    scrape_from_csv(input_csv, output_csv)
    print("Extraction completed. Data saved to extracted_data.csv")


Scraping https://www.1mg.com/drugs/nikoran-5-tablet-20263...
Error scraping https://www.1mg.com/drugs/nikoran-5-tablet-20263: 'NoneType' object has no attribute 'text'
Scraping https://www.1mg.com/drugs/benadryl-syrup-146352...
Error scraping https://www.1mg.com/drugs/benadryl-syrup-146352: 'NoneType' object has no attribute 'text'
Scraping https://www.1mg.com/drugs/benadryl-syrup-114690...
Error scraping https://www.1mg.com/drugs/benadryl-syrup-114690: 'NoneType' object has no attribute 'text'
Extraction completed. Data saved to extracted_data.csv


In [None]:
import csv
import requests
from bs4 import BeautifulSoup

# Function to scrape webpage content
def scrape_webpage(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            product_name = soup.find('h1').text.strip()
            
            drug_header = soup.find(id='drug_header')
            uses_and_benefits = drug_header.find_next_sibling('div', id='uses_and_benefits').text.strip()
            side_effects = drug_header.find_next_sibling('div', id='side_effects').text.strip()
            how_to_use = drug_header.find_next_sibling('div', id='how_to_use').text.strip()
            how_drug_works = drug_header.find_next_sibling('div', id='how_drug_works').text.strip()
            safety_advice = drug_header.find_next_sibling('div', id='safety_advice').text.strip()
            missed_dose = drug_header.find_next_sibling('div', id='missed_dose').text.strip()
            substitutes = drug_header.find_next_sibling('div', id='substitutes').text.strip()
            expert_advice = drug_header.find_next_sibling('div', id='expert_advice').text.strip()
            fact_box = drug_header.find_next_sibling('div', id='fact_box').text.strip()
            drug_interaction = drug_header.find_next_sibling('div', id='drug_interaction').text.strip()
            patient_concerns = drug_header.find_next_sibling('div', id='patient_concerns').text.strip()
            user_feedback = drug_header.find_next_sibling('div', id='user_feedback').text.strip()
            
            faqs = {}
            faq_section = soup.find(id='faq')
            if faq_section:
                faq_items = faq_section.find_all(class_='Faqs__item___2T2Fq')
                for faq_item in faq_items:
                    question = faq_item.find(class_='Faqs__ques___1iPB9').text.strip()
                    answer = faq_item.find(class_='Faqs__ans___1uuIW').text.strip()
                    faqs[question] = answer

            return {
                'PRODUCT NAME': product_name,
                'USES AND BENEFITS': uses_and_benefits,
                'SIDE EFFECTS': side_effects,
                'HOW TO USE': how_to_use,
                'HOW DRUG WORKS': how_drug_works,
                'SAFETY ADVICE': safety_advice,
                'MISSED DOSE': missed_dose,
                'SUBSTITUTES': substitutes,
                'EXPERT ADVICE': expert_advice,
                'FACT BOX': fact_box,
                'DRUG INTERACTION': drug_interaction,
                'PATIENT CONCERNS': patient_concerns,
                'USER FEEDBACK': user_feedback,
                'FAQs': faqs
            }
        else:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to read URLs from CSV file and scrape webpages
def scrape_from_csv(csv_file, output_file):
    with open(csv_file, 'r', newline='') as file:
        with open(output_file, 'w', newline='', encoding='utf-8') as output_csv:
            fieldnames = ['PRODUCT NAME', 'USES AND BENEFITS', 'SIDE EFFECTS', 'HOW TO USE',
                          'HOW DRUG WORKS', 'SAFETY ADVICE', 'MISSED DOSE', 'SUBSTITUTES',
                          'EXPERT ADVICE', 'FACT BOX', 'DRUG INTERACTION', 'PATIENT CONCERNS',
                          'USER FEEDBACK', 'FAQs']
            writer = csv.DictWriter(output_csv, fieldnames=fieldnames)
            writer.writeheader()
            
            reader = csv.DictReader(file)
            for row in reader:
                url = row['source2']
                print(f"Scraping {url}...")
                data = scrape_webpage(url)
                if data:
                    writer.writerow(data)
                else:
                    writer.writerow({key: '' for key in fieldnames})

# Example usage
if __name__ == "__main__":
    input_csv = 'D:\\ThinkByte_project\Medibuddy_project\data\med_urls.csv'  # CSV file containing URLs
    output_csv = 'extracted_data.csv'  # Output CSV file to save extracted data
    scrape_from_csv(input_csv, output_csv)
    print("Extraction completed. Data saved to extracted_data.csv")


Scraping https://www.netmeds.com/prescriptions/nikoran-5mg-tablet-20-s...
Error scraping https://www.netmeds.com/prescriptions/nikoran-5mg-tablet-20-s: 'NoneType' object has no attribute 'find_next_sibling'
Scraping https://www.netmeds.com/prescriptions/benadryl-cough-formula-syrup-450ml...
Error scraping https://www.netmeds.com/prescriptions/benadryl-cough-formula-syrup-450ml: 'NoneType' object has no attribute 'find_next_sibling'
Scraping https://www.netmeds.com/prescriptions/benadryl-cough-formula-syrup-150ml...
Error scraping https://www.netmeds.com/prescriptions/benadryl-cough-formula-syrup-150ml: 'NoneType' object has no attribute 'find_next_sibling'
Extraction completed. Data saved to extracted_data.csv


NameError: name 'soup' is not defined

In [None]:
import requests
url = "https://www.1mg.com/drugs/nikoran-5-tablet-20263"
r = requests.get(url)		# r variable has all the HTML code
htmlContent = r.content	# r returns response so if we want the code we write r.content
print(htmlContent)		# printing the code



In [None]:
htmlText = r.text
print(htmlText)

<!DOCTYPE html>
<html lang="en">
    <head>
        <meta charset="utf-8" />
        <meta
            name="viewport"
            content="initial-scale=1, maximum-scale=2, width=device-width height=device-height, viewport-fit=cover"
        />
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
        <meta http-equiv="x-dns-prefetch-control" content="on" />

        

        <meta name="theme-color" content="#FFF3E3" />
        <!-- Windows Phone -->
        <meta name="msapplication-navbutton-color" content="#FFF3E3" />
        <!-- iOS Safari -->
        <meta name="apple-mobile-web-app-status-bar-style" content="#FFF3E3" />
        <meta name="apple-mobile-web-app-capable" content="yes" />

        <script nonce="b174c605d5484b883eca544a90ce0d66">
            dataLayer = []
        </script>

        <!-- Partytown Integration Scripts -->
        
        <script nonce="b174c605d5484b883eca544a90ce0d66">
            var googletag = googletag || {}
         

In [5]:
import requests
from bs4 import BeautifulSoup
url = "https://www.1mg.com/drugs/nikoran-5-tablet-20263"

r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

print(soup.prettify())	# to print html in tree structure

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="initial-scale=1, maximum-scale=2, width=device-width height=device-height, viewport-fit=cover" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="on" http-equiv="x-dns-prefetch-control"/>
  <meta content="#FFF3E3" name="theme-color"/>
  <!-- Windows Phone -->
  <meta content="#FFF3E3" name="msapplication-navbutton-color"/>
  <!-- iOS Safari -->
  <meta content="#FFF3E3" name="apple-mobile-web-app-status-bar-style"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <script nonce="4920c8f400af6892e1911b9528f8d9df">
   dataLayer = []
  </script>
  <!-- Partytown Integration Scripts -->
  <script nonce="4920c8f400af6892e1911b9528f8d9df">
   var googletag = googletag || {}
            googletag.cmd = googletag.cmd || []
  </script>
  <script nonce="4920c8f400af6892e1911b9528f8d9df">
   function getChromeVersion () {
                const raw = na

In [6]:
print(soup.find('div', id='expert_advice').find('ul'))

AttributeError: 'NoneType' object has no attribute 'find'

: 

In [None]:
import requests
from bs4 import BeautifulSoup
url = "https://www.netmeds.com/prescriptions/nikoran-5mg-tablet-20-s"

r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
for i in soup.find_all("code"):
    print(i.text)
    # We can also do it like this
    # print(i.get_text())

In [None]:
title = soup.title
print(title)

<title>Buy Nikoran 5mg Tablet 20'S Online at Upto 25% OFF | Netmeds</title>


In [None]:
paras = soup.find_all('p')
print(paras)

[<p class="MegaMenu_subheading"><a href="/non-prescriptions/ayush">Ayush</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/covid-essentials">Covid Essentials</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/devices">Devices</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/diabetes-support">Diabetes Support</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/eyewear">Eyewear</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/fitness">Fitness</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/health-conditions">Health Conditions</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/mom-baby">Mom &amp; Baby</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/personal-care">Personal Care</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/sexual-wellness">Sexual Wellness</a></p>, <p class="MegaMenu_subheading"><a href="/non-prescriptions/su

In [None]:
print(soup.find_all(class_="col-6 l4SemiBold marginTop-8"))

[]


In [None]:
print(soup.find_all(class_="col-6 marginTop-8 GeneralDescription__htmlNodeWrapper__h23K3"))

[<div class="col-6 marginTop-8 GeneralDescription__htmlNodeWrapper__h23K3"></div>, <div class="col-6 marginTop-8 GeneralDescription__htmlNodeWrapper__h23K3">Nikoran 5 Tablet may be taken with or without food. It should be swallowed whole with water. Take this medicine regularly and at about the same time(s) each day. Your doctor will decide the dose and how often you should take it. This may change from time to time depending on how well it is working. Do not stop taking it without talking to our doctor as sudden withdrawal may cause your chest pain to come back. You can improve the health of your heart by making some lifestyle changes such as quitting smoking, cutting down on alcohol, eating well, exercising regularly, and managing stress.<br/><br/>The most common side effect of this medicine is headache, especially during the first few days of treatment. This usually improves within a week. However,  drinking plenty of fluids and avoiding alcohol may help. Other common side effects i

In [None]:
print(soup.find_all(class_="DrugHeader__left___19WY-"))

[]


In [None]:
print(soup.find(id='DrugHeader__title-content___2ZaPo'))

None


In [None]:
ul = soup.find(id="li")
print(ul.next_sibling.next_sibling)

AttributeError: 'NoneType' object has no attribute 'next_sibling'

In [None]:
print(soup.find('p')['class'])

['MegaMenu_subheading']


In [3]:
import requests
from bs4 import BeautifulSoup
url = "https://www.apollopharmacy.in/medicine/benadryl-cough-formula-syrup-450ml"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
for i in soup.find_all("code"):
    print(i.text)
    # We can also do it like this
    print(i.get_text())

In [5]:
print(soup.find_all(class_="tb sb PJ"))

[<h1 class="tb sb PJ">Benadryl Cough Formula Syrup, 450 ml</h1>]
