Merge pull request #160 from illuricharles/data-scraping

Data scraping
Subhradeep10 · Oct 22, 2022 · 01cb1e5 · 01cb1e5
2 parents 2b8da56 + 1c96c11
commit 01cb1e5
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 0 deletions.
diff --git a/DataScraping/Data-Scraping.py b/DataScraping/Data-Scraping.py
@@ -0,0 +1,70 @@
+import requests
+from bs4 import BeautifulSoup
+import re
+import time
+
+
+def check_request(url):
+    """Download ``url`` with an HTTP GET and return the page body.
+
+    Sleeps for one second before sending the request to avoid putting
+    load on the server.
+
+    Args:
+        url: Address of the web page to download.
+
+    Returns:
+        The response text when the server answers with status 200,
+        otherwise -1 (the reason is printed first).
+    """
+    print(f'[+] Sending Request to {url}')
+    time.sleep(1)  # To avoid load on the server
+
+    try:
+        # A timeout keeps the script from hanging on an unresponsive host.
+        req = requests.get(url, timeout=10)
+    except requests.RequestException as err:
+        print(f'[-] Failed to load web page {url}: {err}')
+        return -1
+
+    print(f'[+] Request sent to {url}')
+
+    # scrap_data compares the result against -1, so a bad status code is
+    # reported through that sentinel instead of an exception.
+    if req.status_code != 200:
+        print(f'[-] Failed to load web page {url} {req.status_code}')
+        return -1
+
+    print('[+] Web Page Loaded Successfully')
+    return req.text
+
+
+def url_check(url):
+    """Check that ``url`` looks like an http(s) web address.
+
+    Args:
+        url: Text to validate.
+
+    Returns:
+        0 when ``url`` matches the expected format, -1 otherwise (an
+        error message is printed in that case).
+    """
+    url_pattern = "^https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*)$"
+
+    # Well-formed address: nothing to report.
+    if re.match(url_pattern, url) is not None:
+        return 0
+
+    print(f'ERROR: \"{url}\" URL SHOULD BE IN CORRECT FORMAT[EXAMPLE: https://google.com, https://stackoverflow.com/, ..]')
+    return -1
+
+
+def scraping(doc, tag, url):
+    """Print the text of every ``tag`` element found in ``doc``.
+
+    Args:
+        doc: Parsed page exposing ``find_all`` (scrap_data passes a
+            BeautifulSoup object).
+        tag: Name of the tag to search for.
+        url: Address of the page; used only in the printed messages.
+
+    Returns:
+        0 when at least one element was found and printed, -1 when the
+        page contains no such tag.
+    """
+    # Finding all the required tags in the page
+    data_collected = doc.find_all(tag)
+
+    if not data_collected:
+        print(f'[-] There is no data present in the given url {url} with {tag} tag')
+        return -1
+
+    print(f'[+] Data Present in the given url {url} with {tag} tags are: \n')
+    for data in data_collected:
+        print(data.text)  # Printing the data in the tags
+
+    # Explicit success code so callers can tell success from the -1 above.
+    return 0
+
+
+def scrap_data(url, tag):
+    """Validate ``url``, download the page and print the text of each ``tag``.
+
+    Args:
+        url: Address of the web page to scrape.
+        tag: Name of the HTML tag whose text should be printed
+            (for example ``p`` or ``h1``).
+
+    Returns:
+        None. Progress and the final outcome are printed to stdout.
+    """
+    if url_check(url) == -1:
+        print('[-] Execution Unsuccessful')
+        return
+
+    # check_request can raise when the page cannot be loaded; treat that
+    # the same way as the -1 failure code instead of crashing.
+    try:
+        web_content = check_request(url)
+    except Exception as err:
+        print(f'[-] {err}')
+        web_content = -1
+
+    if web_content == -1:
+        print('[-] Execution Unsuccessful')
+        return
+
+    doc = BeautifulSoup(web_content, 'html.parser')
+
+    # scraping returns -1 when the page has no element with the given tag,
+    # so do not report success in that case.
+    if scraping(doc, tag, url) == -1:
+        print('[-] Execution Unsuccessful')
+        return
+
+    print('\n[+] Execution Successfully Completed')
+
+if __name__ == '__main__':
+    # Prompt only when run as a script. Surrounding whitespace is stripped
+    # so that a pasted URL is not rejected by the format check.
+    url = input('Enter/Paste The Url: ').strip()
+    tag = input('Enter the tag you wanna scrape. EX: p, h1, h2....: ').strip()
+    scrap_data(url, tag)
+
diff --git a/DataScraping/README.md b/DataScraping/README.md
@@ -0,0 +1,32 @@
+# Data-Scraping
+
+Data scraping is the process of extracting and parsing data from websites in an automated fashion using a computer program. It's a useful technique for creating datasets for research and learning. This script is written in Python 3.
+
+### Required Libraries
+- Requests
+- BeautifulSoup
+
+### project outline:
+- After starting the script, enter the URL of the web page that you want to scrape.
+- Next, enter the HTML tag (for example `p` or `h1`) whose text you want to scrape.
+
+#### First Clone the Repo
+
+```
+git clone https://github.com/Subhradeep10/Automation-Scripts-Using-Python.git
+```
+#### Then move to the folder
+
+```
+cd Automation-Scripts-Using-Python/DataScraping
+```
+
+#### Install Requirements
+```
+pip install -r requirements.txt
+```
+#### Then execute the code
+From terminal
+```
+python Data-Scraping.py
+```
diff --git a/DataScraping/requirements.txt b/DataScraping/requirements.txt
@@ -0,0 +1,2 @@
+requests >= 2.24.0
+beautifulsoup4 >=4.9.1