Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #160 from illuricharles/data-scraping
Data scraping
- Loading branch information
Showing
3 changed files
with
104 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import re | ||
import time | ||
|
||
|
||
def check_request(url):
    """Download the web page at ``url`` and return its content.

    Args:
        url: Address of the web page to request with an HTTP GET.

    Returns:
        The response body as text when the server answers with status 200.
        ``-1`` when the request cannot be completed (connection error,
        timeout, ...) or the server answers with any other status code.
    """
    print(f'[+] Sending Request to {url}')
    time.sleep(1)  # To avoid load on the server

    try:
        # requests never times out on its own, so an unresponsive host would
        # hang the script forever without an explicit timeout.
        req = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print(f'[-] Failed to load web page {url}: {err}')
        return -1

    print(f'[+] Request sent to {url}')

    # Checking the status code of the web page
    if req.status_code != 200:
        # Report the failure and signal it with -1 (same convention as
        # url_check) instead of raising, which made the -1 return unreachable.
        print(f'[-] Failed to load web page {url} {req.status_code}')
        return -1

    print('[+] Web Page Loaded Successfully')
    return req.text
|
||
|
||
def url_check(url):
    """Check whether ``url`` is written as an http(s) web address.

    Args:
        url: Text entered by the user.

    Returns:
        ``0`` when ``url`` matches the expected pattern, otherwise ``-1``
        after printing an error message that shows example addresses.
    """
    pattern = "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"

    # A correctly formatted url needs no further handling
    if re.match(pattern, url):
        return 0

    print(f'ERROR: \"{url}\" URL SHOULD BE IN CORRECT FORMAT[EXAMPLE: https://google.com, https://stackoverflow.com/, ..]')
    return -1
|
||
|
||
def scraping(doc, tag, url):
    """Print the text of every ``tag`` element found in ``doc``.

    Args:
        doc: Parsed page object providing ``find_all(tag)``; each element it
            returns must expose a ``text`` attribute.
        tag: Name of the HTML tag to look for, e.g. ``p`` or ``h1``.
        url: Address the page came from; only used in the printed messages.

    Returns:
        ``-1`` when the page contains no element with the given tag,
        otherwise ``None`` after printing the text of each element.
    """
    # Finding all the required tags in the page
    data_collected = doc.find_all(tag)

    if not data_collected:
        print(f'[-] There is no data present in the given url {url} with {tag} tag')
        return -1

    print(f'[+] Data Present in the given url {url} with {tag} tags are: \n')
    for data in data_collected:
        print(data.text)  # Printing the data in the tags
|
||
|
||
def scrap_data(url, tag):
    """Validate ``url``, download the page and print the text of its ``tag`` elements.

    The work is done in three steps (``url_check``, ``check_request`` and
    ``scraping``); a step signals failure by returning ``-1``, in which case
    a failure notice is printed and the remaining steps are skipped.

    Args:
        url: Address of the web page to scrape.
        tag: Name of the HTML tag whose text should be printed.

    Returns:
        None. The outcome is reported through printed messages only.
    """
    if url_check(url) == -1:
        print('[-] Execution Unsuccessful')
        return

    web_content = check_request(url)
    if web_content == -1:
        print('[-] Execution Unsuccessful')
        return

    doc = BeautifulSoup(web_content, 'html.parser')

    # scraping() returns -1 when the page holds no matching tag; without this
    # check the run was reported as successful although nothing was scraped.
    if scraping(doc, tag, url) == -1:
        print('[-] Execution Unsuccessful')
        return

    print('\n[+] Execution Successfully Completed')
|
||
# Ask for the page address first, then for the tag, and start the scrape.
url, tag = (
    input('Enter/Paste The Url: '),
    input('Enter the tag you wanna scrape. EX: p, h1, h2....: '),
)
scrap_data(url, tag)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Data-Scraping | ||
|
||
Data scraping is the process of extracting and parsing data from websites in an automated fashion using a computer program. It's a useful technique for creating datasets for research and learning. This script is written in Python 3. | ||
|
||
### Required Libraries | ||
- Requests | ||
- BeautifulSoup | ||
|
||
### project outline: | ||
- After starting the script, enter the URL of the web page that you want to scrape. | ||
- Next, enter the HTML tag (for example p, h1 or h2) whose content you want to scrape. | ||
|
||
#### First Clone the Repo | ||
|
||
``` | ||
git clone https://github.com/Subhradeep10/Automation-Scripts-Using-Python.git | ||
``` | ||
#### Then move to the folder | ||
|
||
``` | ||
cd Automation-Scripts-Using-Python/DataScraping | ||
``` | ||
|
||
#### Install Requirements | ||
``` | ||
pip install -r requirements.txt | ||
``` | ||
#### Then execute the code | ||
From terminal | ||
``` | ||
python Data-Scraping.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
requests >= 2.24.0 | ||
beautifulsoup4 >=4.9.1 |