# Web Scrapping by Inspection

In [1]:
#Webscrapping Part1

#Webscrapping is applied to worldometers to extract Covid-19 data

#import libraries
import requests
import lxml.html as lh
import pandas as pd

#Scrape table Cells
url='https://www.worldometers.info/coronavirus/'

#Create a handle, page, to handle the contents of the website
page = requests.get(url)

#Store the contents of the website under container
container = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = container.xpath('//tr')


In [2]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]


[13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]

In [3]:
#Testing
tr_elements[1][1].text

'1,239,124'

In [4]:
tr_elements = container.xpath('//tr')

#Create empty list
col=[]
i=0

#For each row, store each first element (header) and an empty list
for x in tr_elements[0]:
    i+=1
    name=x.text_content()
    print (('%d:"%s"')%(i,name))
    col.append((name,[]))
	
#Since the first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #Td is our j'th row
    Td=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    #if len(Td)!=10:
        #break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for x in Td.iterchildren():
        data=x.text_content()
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

#Testing to check content of col
#print(col)


1:"Country,Other"
2:"TotalCases"
3:"NewCases"
4:"TotalDeaths"
5:"NewDeaths"
6:"TotalRecovered"
7:"ActiveCases"
8:"Serious,Critical"
9:"Tot Cases/1M pop"
10:"Deaths/1M pop"
11:"TotalTests"
12:"Tests/
1M pop
"
13:"Continent"


In [5]:
#To check length of the column are the same 
[len(C) for (title,C) in col]


[461, 461, 461, 461, 461, 461, 461, 461, 461, 461, 461, 461, 461]

In [6]:
#Create the Dataframe
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

# Show sample of the dataframe
df.head()


Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
0,\nNorth America\n,1239124,14169,72830,1050,202001,964293,17724,,,,,North America
1,\nEurope\n,1422787,24097,139163,1754,530673,752951,17785,,,,,Europe
2,\nAsia\n,542707,8866,19203,242,282936,240568,5500,,,,,Asia
3,\nSouth America\n,193304,3205,9833,353,64716,118755,9849,,,,,South America
4,\nOceania\n,8370,22,113,1,7134,1123,30,,,,,Australia/Oceania


# Insert Into MongoDB

In [18]:
#Import into MongoDB
import pymongo 
from pymongo import MongoClient

mng_client = pymongo.MongoClient('localhost', 27017) # Making a Connection with MongoClient
mng_db = mng_client['WorldometerDatabase'] # Replace mongodb name
collection_name = 'Worldometer' # Replace mongodb collection name
db_cm = mng_db[collection_name]

#Insert Data 
db_cm.insert_one({"index":"World","Web_Data":Dict})


<pymongo.results.InsertOneResult at 0x22231fcf508>

# Retrieve from MongoDB

In [24]:
data_from_db = db_cm.find_one({"index":"World"})
dfmongo = pd.DataFrame(data_from_db["Web_Data"])

dfmongo.head() # view sample of the dataframe

# print(dfmongo) # print the dataframe


Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Continent
0,\nNorth America\n,1104614,13818,62770,1221,171053,870791,15412,,,,,North America
1,\nEurope\n,1321890,20619,125915,1918,457982,737993,24851,,,,,Europe
2,\nAsia\n,492632,9406,17830,242,244923,229879,5833,,,,,Asia
3,\nSouth America\n,147524,3350,7081,351,52029,88414,9773,,,,,South America
4,\nOceania\n,8305,14,103,1,6918,1284,45,,,,,Australia/Oceania


# Export Worldometer Dataframe to CSV file

In [12]:
#Export Dataframe to CSV file
dfmongo.to_csv (r'C:\Users\presi\export_worldometer.csv', index = False, header=True)

# Web Scrapping by API Call

In [35]:
#Webscrapping Part2

#API Call will be used for this part of webscrapping

#import the needed library
import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. 
# Hence, the reason we specicified 'None' to replace inputs for application token, and no username or password:
client = Socrata("data.medicare.gov", None)


# To extract First 2000 results, returned as JSON from API / converted to Python list of dictionaries by sodapy.
hospdata = client.get("xubh-q36u", limit=2000)

# Convert to pandas DataFrame
hospital_data = pd.DataFrame.from_records(hospdata)

# Show sample of the dataframe
hospital_data.head()



Unnamed: 0,provider_id,hospital_name,address,city,state,zip_code,county_name,phone_number,hospital_type,hospital_ownership,...,timeliness_of_care_national_comparison_footnote,efficient_use_of_medical_imaging_national_comparison,efficient_use_of_medical_imaging_national_comparison_footnote,readmission_national_comparison_footnote,patient_experience_national_comparison_footnote,effectiveness_of_care_national_comparison_footnote,geocoded_column,:@computed_region_f3tr_pr43,:@computed_region_nwen_78xc,:@computed_region_csmy_5jwy
0,030084,CHINLE COMPREHENSIVE HEALTH CARE FACILITY,"US HWY 191, HOSPITAL ROAD",CHINLE,AZ,86503,APACHE,(928) 674-7001,Acute Care Hospitals,Government - Federal,...,5.0,Not Available,16,,,,,,,
1,044022,CONWAY BEHAVIORAL HEALTH,2255 STURGIS ROAD,CONWAY,AR,72034,FAULKNER,(501) 205-0011,Psychiatric,Proprietary,...,19.0,Not Available,19,19.0,19.0,19.0,,,,
2,10021F,96th Medical Group (Eglin AFB),96 MDG 307 Boatner Rd Suite 114,Eglin AFB,FL,32542,OKALOOSA,(850) 883-8600,Acute Care - Department of Defense,Department of Defense,...,22.0,Not Available,22,22.0,22.0,22.0,,,,
3,170204,"ROCK REGIONAL HOSPITAL, LLC",3251 NORTH ROCK ROAD,DERBY,KS,67037,SEDGWICK,(833) 345-7625,Acute Care Hospitals,Proprietary,...,5.0,Not Available,5,5.0,5.0,5.0,"{'type': 'Point', 'coordinates': [-97.244407, ...",1291.0,1291.0,17.0
4,171322,HAMILTON COUNTY HOSPITAL,700 NORTH HUSER,SYRACUSE,KS,67878,HAMILTON,(620) 384-7461,Critical Access Hospitals,Government - Local,...,,Not Available,16,16.0,16.0,5.0,,,,


In [36]:
data_dict = hospital_data.to_dict("records")

# Insert into MongoDB

In [39]:
#Import into MongoDB
import pymongo 
from pymongo import MongoClient

mng_client = pymongo.MongoClient('localhost', 27017) # Making a Connection with MongoClient
mng_db = mng_client['WorldometerDatabase'] # Replace mongodb name
collection_name = 'Hospital' # Replace mongodb collection name
hb_cm = mng_db[collection_name]

#Insert Data 
hb_cm.insert_one({"index":"USA","Hospital_Data":data_dict})


<pymongo.results.InsertOneResult at 0x22231cc5308>

# Retrieve from MongoDB

In [40]:
data_from_db2 = hb_cm.find_one({"index":"USA"})
hfmongo = pd.DataFrame(data_from_db2["Hospital_Data"])

hfmongo.head() # view sample of the dataframe

# print(dfmongo) # print the dataframe


Unnamed: 0,provider_id,hospital_name,address,city,state,zip_code,county_name,phone_number,hospital_type,hospital_ownership,...,timeliness_of_care_national_comparison_footnote,efficient_use_of_medical_imaging_national_comparison,efficient_use_of_medical_imaging_national_comparison_footnote,readmission_national_comparison_footnote,patient_experience_national_comparison_footnote,effectiveness_of_care_national_comparison_footnote,geocoded_column,:@computed_region_f3tr_pr43,:@computed_region_nwen_78xc,:@computed_region_csmy_5jwy
0,030084,CHINLE COMPREHENSIVE HEALTH CARE FACILITY,"US HWY 191, HOSPITAL ROAD",CHINLE,AZ,86503,APACHE,(928) 674-7001,Acute Care Hospitals,Government - Federal,...,5.0,Not Available,16,,,,,,,
1,044022,CONWAY BEHAVIORAL HEALTH,2255 STURGIS ROAD,CONWAY,AR,72034,FAULKNER,(501) 205-0011,Psychiatric,Proprietary,...,19.0,Not Available,19,19.0,19.0,19.0,,,,
2,10021F,96th Medical Group (Eglin AFB),96 MDG 307 Boatner Rd Suite 114,Eglin AFB,FL,32542,OKALOOSA,(850) 883-8600,Acute Care - Department of Defense,Department of Defense,...,22.0,Not Available,22,22.0,22.0,22.0,,,,
3,170204,"ROCK REGIONAL HOSPITAL, LLC",3251 NORTH ROCK ROAD,DERBY,KS,67037,SEDGWICK,(833) 345-7625,Acute Care Hospitals,Proprietary,...,5.0,Not Available,5,5.0,5.0,5.0,"{'type': 'Point', 'coordinates': [-97.244407, ...",1291.0,1291.0,17.0
4,171322,HAMILTON COUNTY HOSPITAL,700 NORTH HUSER,SYRACUSE,KS,67878,HAMILTON,(620) 384-7461,Critical Access Hospitals,Government - Local,...,,Not Available,16,16.0,16.0,5.0,,,,


# Export Hospital Dataframe to CSV file

In [None]:
#Export Dataframe to CSV file
hfmongo.to_csv (r'C:\Users\presi\export_hospital_data.csv', index = False, header=True)