
Commit

Merge pull request #160 from illuricharles/data-scraping
Data scraping
Subhradeep10 committed Oct 22, 2022
2 parents 2b8da56 + 1c96c11 commit 01cb1e5
Showing 3 changed files with 104 additions and 0 deletions.
70 changes: 70 additions & 0 deletions DataScraping/Data-Scraping.py
@@ -0,0 +1,70 @@
import requests
from bs4 import BeautifulSoup
import re
import time


def check_request(url):

    print(f'[+] Sending Request to {url}')
    time.sleep(1)  # To avoid putting load on the server

    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as err:
        print(f'[-] Failed to reach {url}: {err}')
        return -1

    print(f'[+] Request sent to {url}')

    # Checking the status code of the web page
    if req.status_code != 200:
        print(f'[-] Failed to load web page {url} (status code {req.status_code})')
        return -1

    print('[+] Web Page Loaded Successfully')
    return req.text


def url_check(url):

    url_pattern = r"^https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)$"

    # Checking whether the url is in the correct format
    if not re.match(url_pattern, url):
        print(f'ERROR: "{url}" should be a URL in the correct format (examples: https://google.com, https://stackoverflow.com/)')
        return -1

    return 0


def scraping(doc, tag, url):

    # Finding all the required tags in the page
    data_collected = doc.find_all(tag)

    if len(data_collected) == 0:
        print(f'[-] There is no data present in the given url {url} with the {tag} tag')
        return -1

    print(f'[+] Data present in the given url {url} with {tag} tags:\n')
    for data in data_collected:
        print(data.text)  # Printing the text inside each tag


def scrap_data(url, tag):

    url_check_status = url_check(url)
    if url_check_status == -1:
        print('[-] Execution Unsuccessful')
        return

    web_content = check_request(url)
    if web_content == -1:
        print('[-] Execution Unsuccessful')
        return

    doc = BeautifulSoup(web_content, 'html.parser')
    if scraping(doc, tag, url) == -1:
        print('\n[-] Execution Unsuccessful')
        return

    print('\n[+] Execution Completed Successfully')


if __name__ == '__main__':
    url = input('Enter/Paste The Url: ')
    tag = input('Enter the tag you want to scrape (e.g. p, h1, h2): ')
    scrap_data(url, tag)


32 changes: 32 additions & 0 deletions DataScraping/README.md
@@ -0,0 +1,32 @@
# Data-Scraping

Data scraping is the process of extracting and parsing data from websites in an automated fashion using a computer program. It's a useful technique for creating datasets for research and learning. This script is written in Python 3.
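
As a minimal sketch of the request-and-parse pattern this script relies on (the URL and tag below are placeholders):

```
import requests
from bs4 import BeautifulSoup

# Download a page and print the text of every matching tag (placeholder URL and tag)
response = requests.get('https://example.com')
soup = BeautifulSoup(response.text, 'html.parser')
for element in soup.find_all('h1'):
    print(element.text)
```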

### Required Libraries
- Requests
- BeautifulSoup

### Project Outline
- After running the script, enter the URL of the web page that you want to scrape.
- Next, enter the HTML tag whose contents you want to scrape (for example: p, h1, h2).

#### First Clone the Repo

```
git clone https://github.com/Subhradeep10/Automation-Scripts-Using-Python.git
```
#### Then move to the folder

```
cd Automation-Scripts-Using-Python/DataScraping
```

#### Install Requirements
```
pip install -r requirements.txt
```
#### Then execute the code
From the terminal:
```
python Data-Scraping.py
```
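
For illustration, a run might look like the following; the URL, tag, and scraped text shown here are placeholders for whatever page and element you choose:

```
$ python Data-Scraping.py
Enter/Paste The Url: https://example.com
Enter the tag you want to scrape (e.g. p, h1, h2): h1
[+] Sending Request to https://example.com
[+] Request sent to https://example.com
[+] Web Page Loaded Successfully
[+] Data present in the given url https://example.com with h1 tags:

Example Domain

[+] Execution Completed Successfully
```
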
2 changes: 2 additions & 0 deletions DataScraping/requirements.txt
@@ -0,0 +1,2 @@
requests >= 2.24.0
beautifulsoup4 >= 4.9.1
