
Commit 85e16ee

committed
title, description, and image link scraper added
- db gets created on its own
- "first reads from db, then request" method added (see the sketch below)
- error with empty db removed
- README updated
- expiry time added to the data
- minor changes
1 parent 1c1c9c7 commit 85e16ee
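The "first reads from db, then request" method mentioned above is a read-through cache. A minimal sketch of the pattern, assuming an in-memory `db` dict and a hypothetical `fetch_preview` helper (both names are illustrative, distilled from the diff below):

```
import time

SEVEN_DAYS_IN_SEC = 7 * 24 * 60 * 60

def get_preview(url, db, fetch_preview):
    # serve the cached entry while its expiry timestamp is still in the future
    entry = db.get(url)
    if entry is not None and entry["time"] > round(time.time()):
        return entry
    # otherwise fetch fresh data and stamp it with a new expiry time
    entry = fetch_preview(url)
    entry["time"] = round(time.time()) + SEVEN_DAYS_IN_SEC
    db[url] = entry
    return entry
```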

File tree

4 files changed (+170, −12 lines)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -679,3 +679,5 @@ test.py
 Test/
 reddit_tokens.json
 scriptcopy.py
+.vscode
+db.json

Link-Preview/README.md

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@

# Link Preview

A script that gives the user a preview of a link they enter.

- Given a link, the script provides the title, description, and image link of the website the URL points to.
- The script does so by fetching the HTML for the link and analyzing the data from there (see the sketch after this list).
- The data is saved in a `JSON` file named `db.json` for further reference.
- Every entry has a time limit after which it is refreshed (*data expires after 7 days*).
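As referenced above, a minimal sketch of the core idea: fetch the page, then read its Open Graph tags. The URL is the script's own default, and the single `<title>` fallback stands in for the fuller fallback chain in `linkPreview.py`.

```
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.girlscript.tech/")
soup = BeautifulSoup(r.text, "html.parser")

# prefer the og:title meta tag, fall back to the document <title>
og_title = soup.find("meta", property="og:title")
title = og_title.get("content") if og_title else soup.title.get_text()
print("Title :", title)
```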
## Setup instructions

Install the required packages with the following command in your terminal (make sure you are in the project directory):

```
pip3 install -r requirements.txt
```
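Judging by the imports in `linkPreview.py`, `requirements.txt` presumably lists something like the following (assumed contents; the file itself is not part of this diff):

```
requests
beautifulsoup4
```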
## Running the script

After installing all the requirements, run this command in your terminal:

```
python3 linkPreview.py
```

## Output

The script will show you the title, description, image link, and URL.

![demo gif](https://i.imgur.com/uoIG2io.gif)

## Author(s)

Hi, I'm [Madhav Jha](https://github.com/jhamadhav), the author of this script 🙋‍♂️

Link-Preview/db.json

Whitespace-only changes.

Link-Preview/linkPreview.py

Lines changed: 135 additions & 12 deletions
@@ -1,18 +1,141 @@
 import requests
+import json
+import os
+import time
 from bs4 import BeautifulSoup
 
-url = 'https://www.girlscript.tech/'
-r = requests.get(url)
-soup = BeautifulSoup(r.text,"html.parser")
-# print(soup)
-title = soup.find("meta", property = "og:title")
-description = soup.find("meta", property = "og:description")
-url = soup.find("meta", property = "og:url")
-img = soup.find("meta", property = "og:image")
-print("Title : ", title.get("content", None))
-print("Description : ", description.get("content", None))
-print("URL : ", url.get("content", None))
-print("Image Link : ", img.get("content", None))
+
+# to scrape the title: prefer og:title, then twitter:title, then the
+# document <title>, then the first <h1>, <h2>, or <p>
+def getTitle(soup):
+    ogTitle = soup.find("meta", property="og:title")
+    twitterTitle = soup.find("meta", attrs={"name": "twitter:title"})
+    documentTitle = soup.find("title")
+    h1Title = soup.find("h1")
+    h2Title = soup.find("h2")
+    pTitle = soup.find("p")
+
+    res = ogTitle or twitterTitle or documentTitle or h1Title or h2Title or pTitle
+    res = res.get_text() or res.get("content", None)
+
+    # truncate long titles to 60 characters
+    if len(res) > 60:
+        res = res[0:60]
+    return res.strip()
+
+
+# to scrape the page description, with the same kind of fallback chain
+def getDesc(soup):
+    ogDesc = soup.find("meta", property="og:description")
+    twitterDesc = soup.find("meta", attrs={"name": "twitter:description"})
+    metaDesc = soup.find("meta", attrs={"name": "description"})
+    pDesc = soup.find("p")
+
+    res = ogDesc or twitterDesc or metaDesc or pDesc
+    res = res.get_text() or res.get("content", None)
+
+    # truncate long descriptions to 60 characters
+    if len(res) > 60:
+        res = res[0:60]
+    return res.strip()
+
+
+# to scrape the image link: prefer og:image, then twitter:image, then a
+# <link rel="img_src">, then the first <img> on the page
+def getImage(soup, url):
+    ogImg = soup.find("meta", property="og:image")
+    twitterImg = soup.find("meta", attrs={"name": "twitter:image"})
+    metaImg = soup.find("link", attrs={"rel": "img_src"})
+    img = soup.find("img")
+
+    res = ogImg or twitterImg or metaImg or img
+    res = res.get("content", None) or res.get_text() or res.get("src", None)
+
+    if res is None:
+        return "Not available"
+
+    # strip any leading "./" or "/" from a relative path
+    count = 0
+    for i in range(0, len(res)):
+        if res[i] == "." or res[i] == "/":
+            count += 1
+        else:
+            break
+    res = res[count:]
+
+    # resolve relative image paths against the page URL
+    if "http://" not in res and "https://" not in res:
+        res = url + "/" + res
+
+    return res
+
+
+# print the preview dictionary
+def printData(data):
+    print("\nTitle : ", data["title"])
+    print("Description : ", data["description"])
+    print("URL : ", data["url"])
+    print("Image link : ", data["image"])
+
+
+# start
+print("\n======================")
+print("-    Link Preview    -")
+print("======================\n")
+
+# get url from user
+url = input("Enter URL to preview : ")
+
+# parsing and checking the url
+if url == "":
+    url = 'www.girlscript.tech'
+if "http://" not in url and "https://" not in url:
+    url = "https://" + url
+
+# first check in the db
+db = {}
+
+# create the db file if it doesn't exist
+if not os.path.exists('Link-Preview/db.json'):
+    f = open('Link-Preview/db.json', "w")
+    f.write("{}")
+    f.close()
+
+# read the db
+with open('Link-Preview/db.json', 'r+') as file:
+    data = file.read()
+    if len(data) == 0:
+        data = "{}"
+        file.write(data)
+    db = json.loads(data)
+
+# serve from the db while the cached entry has not expired yet
+if url in db and db[url]["time"] > round(time.time()):
+    printData(db[url])
+else:
+    # not in the db (or expired): fetch the page and scrape it
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, "html.parser")
+
+    sevenDaysInSec = 7 * 24 * 60 * 60
+    newData = {
+        "title": getTitle(soup),
+        "description": getDesc(soup),
+        "url": url,
+        "image": getImage(soup, url),
+        # expiry timestamp: now (in seconds) plus seven days
+        "time": round(time.time()) + sevenDaysInSec
+    }
+    printData(newData)
+
+    # persist the new entry
+    db[url] = newData
+    with open('Link-Preview/db.json', 'w') as file:
+        json.dump(db, file)
+
+print("\n--END--\n")
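A note on the scraper design: each function tries the richest metadata first (Open Graph tags, then Twitter card tags, then generic elements such as `<title>`, `<h1>`, the first `<p>`, or the first `<img>`), so the preview still degrades gracefully on pages that publish no social metadata.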
