In [1]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import time
import urllib3
from datetime import date


def date_format(str1):
    """Convert a textual date to a ``day-month-year`` string.

    The input is expected to look like ``"<month> <day>[,] <year>"``, for
    example ``"December 25, 2021"`` or ``"dec 25 2021"``.  Only the first
    three letters of the month are significant, so full names and
    abbreviations are both accepted.  Commas are dropped, and any amount of
    surrounding or repeated whitespace (including newlines) is tolerated.

    Args:
        str1: the date text to convert.

    Returns:
        ``"<day>-<month number>-<year>"``.  The month number is not
        zero-padded; day and year are copied from the input unchanged.

    Raises:
        KeyError: if the month is not recognised.
        IndexError: if the text has fewer than three whitespace-separated
            parts.
    """
    month_numbers = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12,
    }
    # split() without an argument collapses runs of whitespace, so scraped
    # text with padding or line breaks does not yield empty tokens.
    parts = str1.lower().replace(',', '').split()
    # "december" -> "dec"; names that are already three letters are unchanged.
    month = month_numbers[parts[0][:3]]
    return '-'.join((parts[1], str(month), parts[2]))

# added new Keywords
# Lower-case phrases searched for (as plain substrings) in lower-cased
# article titles.  Entries must be unique and carry no surrounding
# whitespace: a duplicate would be reported twice for the same title, and a
# trailing space would stop the phrase from matching at the end of a title.
keywords = ['medical devices', 'implantable', 'software as medical device',
            'samd', 'mdufa', 'harmonised standards',
            'medical device coordination group (mdcg)',
            'combination product', 'guidance', 'notified body',
            'artificial intelligence medical devices',
            'artificial intelligence/machine learning-enabled medical devices',
            'machine learning-enabled medical devices',
            'classification',
            'designation', 'approval', 'recall', 'companion diagnostic',
            'in vitro diagnostic (ivd)',
            'device', 'software', 'health application', 'digital health',
            'medical device regulation (mdr)', 'instruction for use (ifu)',
            'medtech', 'unique device identification (udi)', '510(k)',
            'investigational device exemption (ide)', 'de novo', 'premarket approval application (pma)',
            'humanitarian device exemption (hde)', 'device classification', 'iso', 'advamed', 'standard',
            'eudamed', 'ce mark', 'declaration of conformity',
            'general safety and performance requirements (gspr)', 'european medicines agency (ema)',
            'european commission (ec)', 'eu reference laboratories (eurls)', 'eu expert panel',
            'center for devices and radiological health (cdrh)', 'drug-device combination',
            'national medical products administration (nmpa)',
            'center for medical device evaluation (cmde)',
            'medical device material', 'policy', 'swiss medtech']

def check_keywords_in_title(title, keywords):
    """Return the entries of *keywords* that occur inside *title*.

    Matching is a plain, case-sensitive substring test, and results keep the
    order of *keywords*.  Every hit is also reported on stdout.
    """
    hits = []
    for candidate in keywords:
        if candidate not in title:
            continue
        print(candidate, "matched for title :", title)
        hits.append(candidate)
    return hits
class Content:
    """Plain record for one article: its URL, title and date."""

    def __init__(self, url, title, date):
        self.url, self.title, self.date = url, title, date
   
def getPage(url, timeout=30):
    """Download *url* and parse the response body with ``html.parser``.

    Args:
        url: address to fetch with ``requests.get``.
        timeout: seconds to wait for the server before giving up.  Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host, which would stall the whole scraping run.

    Returns:
        A ``BeautifulSoup`` object for the response text, or ``None`` when
        the request raises a ``requests`` exception (connection error,
        timeout, invalid URL, ...).  The HTTP status code is not checked, so
        an error page is parsed like any other page.
    """
    try:
        req = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return None
    return BeautifulSoup(req.text, 'html.parser')

# https://www.mddionline.com/regulatory-quality/regulations
def scrape_mddionline(url):
    """Collect article teasers from an mddionline.com listing page.

    Every teaser becomes a dict with the keys ``identifier`` (the listing
    *url*), ``title`` (lower-cased), ``url`` (the teaser's href as found),
    ``date`` (via ``date_format``), ``keywords`` and ``content``.  When the
    title contains at least one entry of the module-level ``keywords`` the
    article page is fetched and its body text stored; otherwise ``keywords``
    and ``content`` are both ``'NA'``.

    Args:
        url: listing page to scrape.

    Returns:
        List of the dicts described above.  If anything fails part-way, the
        failure is printed and the items gathered so far are returned.
    """
    # Bound before the try block so the final return always has a value,
    # even if the very first statement inside the try raises.
    all_news = []
    try:
        print("Utility function for https://www.mddionline.com/regulatory-quality/regulations called !")
        bs = getPage(url)
        table = bs.find_all('article', attrs={'class': 'article-teaser article-teaser__icon__article article-teaser__aside'})
        for row in table:
            link = row.find('a')
            title = link.text.lower()
            url_content = link['href']
            published = date_format(row.find('span').text.strip())
            news = {
                'identifier': url,
                'title': title,
                'url': url_content,
                'date': published,
                'keywords': 'NA',
                'content': 'NA',
            }
            ret = check_keywords_in_title(title, keywords)
            print(title, "- Keywords matched are:- ", ret)
            if ret:
                match = " | ".join(ret)
                print(match)
                news['keywords'] = match
                # The teaser href is appended to the site root to reach the article.
                url_inner = 'https://www.mddionline.com' + url_content
                bs_inner = getPage(url_inner)
                content_inner = bs_inner.find('div', {'itemprop': 'articleBody'})
                news['content'] = content_inner.text.lstrip()
                print("\t News content should be populated !")
            else:
                print("\t No Key Match, news content should not be populated !")
            all_news.append(news)
    except Exception as exc:
        # Best effort: report the cause and keep what was collected.  Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print("An exception occured with:- ", url, "-", repr(exc))
    return all_news
              
             


# https://www.ema.europa.eu
def scrapeEuropa(url):
    """Collect news items from the EMA (ema.europa.eu) page at *url*.

    Every listed link becomes a dict with the keys ``identifier`` (the
    *url* argument), ``title``, ``url`` (the link's href as found), ``date``
    (the link's span text with ``/`` replaced by ``-``), ``keywords`` and
    ``content``.  When the lower-cased title contains at least one entry of
    the module-level ``keywords`` the article page is fetched and the text
    of its paragraphs stored; otherwise ``keywords`` and ``content`` are
    both ``'NA'``.

    Args:
        url: page to scrape; also used as the prefix for article links.

    Returns:
        List of the dicts described above.  If anything fails part-way, the
        failure is printed and the items gathered so far are returned.
    """
    # Bound before the try block so the final return always has a value,
    # even if the very first statement inside the try raises.
    all_news = []
    try:
        print("Utility function for https://www.ema.europa.eu called !")
        bs = getPage(url)
        table = bs.find('div', attrs={'class': 'view-content'})
        for row in table.findAll('a', attrs={'class': 'ecl-link ecl-list-item__link'}):
            news = {}
            news['identifier'] = url
            news['title'] = row.h3.text
            news['url'] = row['href']
            news['date'] = row.span.text.replace("/", "-")
            title = news['title'].lower()
            ret = check_keywords_in_title(title, keywords)
            print(title, "- Keywords matched are:- ", ret)
            if ret:
                match = " | ".join(ret)
                print(match)
                news['keywords'] = match
                # The href is appended to the page URL to reach the article.
                url_inner = url + news['url']
                bs_inner = getPage(url_inner)
                table_inner = bs_inner.find('div', attrs={'class': 'paragraphs-items paragraphs-items-field-ema-paragraph-content paragraphs-items-field-ema-paragraph-content-full paragraphs-items-full'})
                table_inner_1 = table_inner.find('div', attrs={'class': 'ecl-field__body'})
                content = "".join(p.get_text() for p in table_inner_1.findAll('p'))
                news['content'] = content.replace("\n", " ")
                print("\t News content should be populated !")
            else:
                news['content'] = "NA"
                news['keywords'] = 'NA'
                print("\t No Key Match, news content should not be populated !")
            all_news.append(news)
    except Exception as exc:
        # Best effort: report the cause and keep what was collected.  Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print("An exception occured with:- ", url, "-", repr(exc))
    return all_news



# https://www.fdanews.com/articles/topic/106?page=5
def scrapeFDA(url):
    """Collect article summaries from an fdanews.com listing page.

    Every summary becomes a dict with the keys ``identifier`` (the listing
    *url*), ``title``, ``url`` (the headline link's href, used as-is to
    fetch the article), ``date`` (via ``date_format``), ``keywords`` and
    ``content``.  When the lower-cased title contains at least one entry of
    the module-level ``keywords`` the article page is fetched and the text
    of its paragraphs stored; otherwise ``keywords`` and ``content`` are
    both ``'NA'``.

    Args:
        url: listing page to scrape.

    Returns:
        List of the dicts described above.  If anything fails part-way, the
        failure is printed and the items gathered so far are returned.
    """
    # Bound before the try block so the final return always has a value,
    # even if the very first statement inside the try raises.
    all_news = []
    try:
        print("Utility function for https://www.fdanews.com called !")
        bs = getPage(url)
        print("getPage function for https://www.fdanews.com called !")
        # Checkpoint No. of records pulled is incorrect
        table = bs.findAll('article', attrs={'class': 'record article-summary'})

        for row in table:
            news = {}
            news['identifier'] = url
            news['title'], news['url'] = row.h2.text, row.h2.a['href']
            date_text = row.find('div', attrs={'class': 'date article-summary__post-date'}).text
            # Strip padding around the scraped text before parsing, as the
            # other scrapers in this file do.
            news['date'] = date_format(date_text.strip())
            title = news['title'].lower()
            ret = check_keywords_in_title(title, keywords)
            print("check_keywords_in_title called for title:- ", title)
            if ret:
                print("\t Found a keyword!!")
                news['keywords'] = " | ".join(ret)
                bs_inner = getPage(news['url'])
                # Spelled as an explicit class filter; the original passed a
                # bare set literal as `attrs`.
                table_inner = bs_inner.find('div', attrs={'class': 'body gsd-paywall'})
                content = "".join(p.get_text() for p in table_inner.findAll('p'))
                news['content'] = content.replace("\n", " ")
                print("\t News content should be populated !")
            else:
                news['content'] = "NA"
                news['keywords'] = 'NA'
                print("\t No Key Match, news content should not be populated !")
            all_news.append(news)
    except Exception as exc:
        # Best effort: report the cause and keep what was collected.  Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print("An exception occured with:- ", url, "-", repr(exc))
    return all_news


# https://www.medtechdive.com/topic/medical-devices/
def scrapeMedTechDive(url):
    """Collect feed entries from a medtechdive.com topic page.

    Every entry becomes a dict with the keys ``identifier`` (the topic
    *url*), ``title``, ``url`` (the entry's href as found), ``date`` (via
    ``date_format``), ``keywords`` and ``content``.  When the lower-cased
    title contains at least one entry of the module-level ``keywords`` the
    article page is fetched and the text of its paragraphs stored;
    otherwise ``keywords`` and ``content`` are both ``'NA'``.

    Args:
        url: topic page to scrape.

    Returns:
        List of the dicts described above.  If anything fails part-way, the
        failure is printed and the items gathered so far are returned.
    """
    # Bound before the try block so the final return always has a value,
    # even if the very first statement inside the try raises.
    all_news = []
    try:
        print("Utility function for https://www.medtechdive.com/topic/medical-devices/ called !")
        bs = getPage(url)
        table = bs.find('ul', attrs={'class': 'feed layout-stack-xxl'})
        for row in table.findAll('div', attrs={'class': 'medium-8 columns'}):
            news = {}
            news['identifier'] = url
            news['title'] = row.h3.text.strip()
            news['url'] = row.a['href']
            # The date is taken from the last three space-separated words of
            # the final "secondary-label" span, with commas removed.
            label = row.findAll('span', attrs={'class': 'secondary-label'})[-1]
            label_text = label.text.strip().replace(",", "")
            news['date'] = date_format(' '.join(label_text.split(" ")[-3:]))
            title = news['title'].lower()
            ret = check_keywords_in_title(title, keywords)
            print(title, "- Keywords matched are:- ", ret)
            if ret:
                match = " | ".join(ret)
                print(match)
                news['keywords'] = match
                base_url = "https://www.medtechdive.com"
                url_inner = base_url + news['url']
                bs_inner = getPage(url_inner)
                table_inner = bs_inner.find('div', attrs={'class': 'large medium article-body'})
                content = "".join(p.get_text() for p in table_inner.findAll('p'))
                news['content'] = content.replace("\n", " ")
                print("\t News content should be populated !")
            else:
                news['content'] = "NA"
                news['keywords'] = 'NA'
                print("\t No Key Match, news content should not be populated !")
            all_news.append(news)
    except Exception as exc:
        # Best effort: report the cause and keep what was collected.  Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print("An exception occured with:- ", url, "-", repr(exc))
    return all_news

# https://www.raps.org/news-and-articles/news-articles
def scrapeRaps(url):
    """Collect news items from a raps.org listing page.

    Requests are sent with browser-like headers.  Every item becomes a dict
    with the keys ``identifier`` (the listing *url*), ``title`` (stripped
    and lower-cased), ``url`` (the site root plus the item's href), ``date``
    (via ``date_format``), ``keywords`` and ``content``.  When the title
    contains at least one entry of the module-level ``keywords`` the article
    page is fetched and text from it stored; otherwise ``keywords`` and
    ``content`` are both ``'NA'``.  The function sleeps two seconds after
    each item.

    Args:
        url: listing page to scrape.

    Returns:
        List of the dicts described above.  If anything fails part-way, the
        failure is printed and the items gathered so far are returned.
    """
    header = {'Accept-Encoding': 'gzip, deflate', 'Accept': '/', 'Connection': 'keep-alive',
     "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0"
     }
    # Seconds to wait per request; without a timeout requests.get may block
    # indefinitely on an unresponsive server.
    request_timeout = 30
    # Bound before the try block so the final return always has a value,
    # even if the very first statement inside the try raises.
    all_news = []
    try:
        print("Utility function for https://www.raps.org/news-and-articles/news-articles called !")
        req = requests.get(url, headers=header, timeout=request_timeout)
        soup = BeautifulSoup(req.content, "html.parser")
        newsList = soup.findAll("div", {"class": "item-content"})
        for eachNews in newsList:
            news = {}
            news['identifier'] = url
            url_content = eachNews.a['href']
            title = eachNews.a.text.strip().lower()
            # The first two words of the date text are swapped before it is
            # handed to date_format, which expects the month first.
            date_words = eachNews.li.text.split(" ")
            date_text = date_words[1] + " " + date_words[0] + " " + date_words[2]
            news['title'] = title
            news['url'] = 'https://www.raps.org' + url_content
            news['date'] = date_format(date_text)
            ret = check_keywords_in_title(title, keywords)
            print(title, "- Keywords matched are:- ", ret)
            if ret:
                match = " | ".join(ret)
                print(match)
                news['keywords'] = match
                print("\t Found a keyword!!")
                req_inner = requests.get(news['url'], headers=header, timeout=request_timeout)
                soup_inner = BeautifulSoup(req_inner.content, "html.parser")
                table = soup_inner.find('div', attrs={'class': 'article'})
                # The text is read from the second <div> inside the article container.
                row = table.findAll('div')[1]
                news['content'] = row.text
                print("\t News content should be populated !")
            else:
                news['content'] = 'NA'
                news['keywords'] = 'NA'
                print("\t No Key Match, news content should not be populated !")
            all_news.append(news)
            time.sleep(2)
    except Exception as exc:
        # Best effort: report the cause and keep what was collected.  Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print("An exception occured with:- ", url, "-", repr(exc))
    return all_news

# Current local date, plus the same date rendered as dd-mm-YYYY
# (for example 25-12-2021).
today = date.today()
d1 = "{:%d-%m-%Y}".format(today)


def populate(all_news):
    """Append the records in *all_news* to the CSV named by the global ``filename``.

    Each record is a dict keyed by the column names below.  No header row is
    written here.  Always returns 1 once the rows have been written.
    """
    columns = ['identifier', 'title', 'url', 'date', 'keywords', 'content']
    with open(filename, 'a', newline='', encoding="utf-8") as handle:
        csv.DictWriter(handle, columns).writerows(all_news)
    return 1

# Output CSV for this run; its name embeds the date string `d1`.
name = "ScrapeData_" + d1
filename = "%s.csv" % name
with open(filename, 'a', newline='', encoding="utf-8") as f:
    w = csv.DictWriter(f, ['identifier', 'title', 'url', 'date', 'keywords', 'content'])
    # The file is opened in append mode, so a re-run that produces the same
    # name reuses the existing file.  Write the header only while the file
    # is still empty; otherwise each re-run would insert another header row
    # in the middle of the data.
    if f.tell() == 0:
        w.writeheader()
    
# Pages to scrape, in the order they are processed.
url = ['https://www.ema.europa.eu',
       'https://www.mddionline.com/regulatory-quality/regulations',
       'https://www.fdanews.com/articles/topic/106?page=5',
       'https://www.medtechdive.com/topic/medical-devices/',
       'https://www.raps.org/news-and-articles/news-articles']

# One row per site: (scraper, index into `url`, text printed before the URL,
# text printed after the URL, seconds to pause once the site is done).
_jobs = (
    (scrapeEuropa, 0, "Data for:-", " populated successfully !!", 0),
    (scrape_mddionline, 1, "Data for:-", " populated successfully !!", 0),
    (scrapeFDA, 2, "Data for:-", " populated successfully !!", 2),
    (scrapeMedTechDive, 3, "Data for:-", " populated successfully !!", 0),
    (scrapeRaps, 4, "Data for:- ", " populated successfully", 0),
)

for _scraper, _index, _before, _after, _pause in _jobs:
    all_news = _scraper(url[_index])
    ret = populate(all_news)
    if ret == 1:
        print(_before, url[_index], _after)
    if _pause:
        time.sleep(_pause)

 

Utility function for https://www.ema.europa.eu called !
meeting highlights from the committee for medicinal products for veterinary use (cvmp) 18-19 january 2022 - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
international regulators’ recommendations on covid-19 vaccines and the omicron variant - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
covid-19: latest safety data provide reassurance about use of mrna vaccines during pregnancy - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
meeting highlights from the pharmacovigilance risk assessment committee (prac) 10 - 13 january 2022 - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
ema welcomes eu commissioner for health and food safety - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
accelerating clinical trials in the eu (act eu): for better clinical trials that a

	 News content should be populated !
edwards seen falling short in q4 based on early us tavr volume data: jefferies - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
irhythm stock jumps nearly 28% after novitas doubles medicare rates for cardiac monitoring - Keywords matched are:-  []
	 No Key Match, news content should not be populated !
guidance matched for title : hologic tops guidance as covid-19 testing wave delivers another beat
hologic tops guidance as covid-19 testing wave delivers another beat - Keywords matched are:-  ['guidance']
guidance
	 News content should be populated !
guidance matched for title : medtronic ceo: company on track to hit guidance, omicron impact 'uncertain'
medtronic ceo: company on track to hit guidance, omicron impact 'uncertain' - Keywords matched are:-  ['guidance']
guidance
	 News content should be populated !
abbott targets consumer health, aims to turn diabetes success into wearable growth driver - Keywords matche