
Commit 85e16ee

committed
title, description, and image link scraper added
- db gets created on its own
- "first reads from db, then request" method added (see the sketch below)
- error with empty db removed
- README updated
- expiry time added to the data
- minor changes
1 parent 1c1c9c7 commit 85e16ee
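The "first reads from db, then request" method mentioned above is a read-through cache. A minimal sketch of the pattern, assuming an in-memory `db` dict and a hypothetical `fetch_preview` helper (both names are illustrative, distilled from the diff below):

```
import time

SEVEN_DAYS_IN_SEC = 7 * 24 * 60 * 60

def get_preview(url, db, fetch_preview):
    # serve the cached entry while its expiry timestamp is still in the future
    entry = db.get(url)
    if entry is not None and entry["time"] > round(time.time()):
        return entry
    # otherwise fetch fresh data and stamp it with a new expiry time
    entry = fetch_preview(url)
    entry["time"] = round(time.time()) + SEVEN_DAYS_IN_SEC
    db[url] = entry
    return entry
```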

File tree

4 files changed (+170, −12 lines)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -679,3 +679,5 @@ test.py
 Test/
 reddit_tokens.json
 scriptcopy.py
+.vscode
+db.json

Link-Preview/README.md

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@

# Link Preview

A script that gives the user a preview of a link they enter.

- Given a link, the script provides the title, description, and image link of the website the URL points to.
- The script does so by fetching the HTML for the link and analyzing the data from there (see the sketch after this list).
- The data is saved in a `JSON` file named `db.json` for further reference.
- Every entry has a time limit after which it is refreshed (*data expires after 7 days*).
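As referenced above, a minimal sketch of the core idea: fetch the page, then read its Open Graph tags. The URL is the script's own default, and the single `<title>` fallback stands in for the fuller fallback chain in `linkPreview.py`.

```
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.girlscript.tech/")
soup = BeautifulSoup(r.text, "html.parser")

# prefer the og:title meta tag, fall back to the document <title>
og_title = soup.find("meta", property="og:title")
title = og_title.get("content") if og_title else soup.title.get_text()
print("Title :", title)
```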
## Setup instructions

Install the required packages with the following command in your terminal (make sure you are in the project directory):

```
pip3 install -r requirements.txt
```
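Judging by the imports in `linkPreview.py`, `requirements.txt` presumably lists something like the following (assumed contents; the file itself is not part of this diff):

```
requests
beautifulsoup4
```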
## Running the script

After installing all the requirements, run this command in your terminal:

```
python3 linkPreview.py
```

## Output

The script will show you the title, description, image link, and URL.

![demo gif](https://i.imgur.com/uoIG2io.gif)

## Author(s)

Hi, I'm [Madhav Jha](https://github.com/jhamadhav), the author of this script 🙋‍♂️

Link-Preview/db.json

Whitespace-only changes.

Link-Preview/linkPreview.py

Lines changed: 135 additions & 12 deletions
@@ -1,18 +1,141 @@
 import requests
+import json
+import os
+import time
 from bs4 import BeautifulSoup
 
-url = 'https://www.girlscript.tech/'
-r = requests.get(url)
-soup = BeautifulSoup(r.text,"html.parser")
-# print(soup)
-title = soup.find("meta", property = "og:title")
-description = soup.find("meta", property = "og:description")
-url = soup.find("meta", property = "og:url")
-img = soup.find("meta", property = "og:image")
-print("Title : ", title.get("content", None))
-print("Description : ", description.get("content", None))
-print("URL : ", url.get("content", None))
-print("Image Link : ", img.get("content", None))
+
+# to scrape the title: prefer og:title, then twitter:title, then the
+# document <title>, then the first <h1>, <h2>, or <p>
+def getTitle(soup):
+    ogTitle = soup.find("meta", property="og:title")
+    twitterTitle = soup.find("meta", attrs={"name": "twitter:title"})
+    documentTitle = soup.find("title")
+    h1Title = soup.find("h1")
+    h2Title = soup.find("h2")
+    pTitle = soup.find("p")
+
+    res = ogTitle or twitterTitle or documentTitle or h1Title or h2Title or pTitle
+    res = res.get_text() or res.get("content", None)
+
+    # truncate long titles to 60 characters
+    if len(res) > 60:
+        res = res[0:60]
+    return res.strip()
+
+
+# to scrape the page description, with the same kind of fallback chain
+def getDesc(soup):
+    ogDesc = soup.find("meta", property="og:description")
+    twitterDesc = soup.find("meta", attrs={"name": "twitter:description"})
+    metaDesc = soup.find("meta", attrs={"name": "description"})
+    pDesc = soup.find("p")
+
+    res = ogDesc or twitterDesc or metaDesc or pDesc
+    res = res.get_text() or res.get("content", None)
+
+    # truncate long descriptions to 60 characters
+    if len(res) > 60:
+        res = res[0:60]
+    return res.strip()
+
+
+# to scrape the image link: prefer og:image, then twitter:image, then a
+# <link rel="img_src">, then the first <img> on the page
+def getImage(soup, url):
+    ogImg = soup.find("meta", property="og:image")
+    twitterImg = soup.find("meta", attrs={"name": "twitter:image"})
+    metaImg = soup.find("link", attrs={"rel": "img_src"})
+    img = soup.find("img")
+
+    res = ogImg or twitterImg or metaImg or img
+    res = res.get("content", None) or res.get_text() or res.get("src", None)
+
+    if res is None:
+        return "Not available"
+
+    # strip any leading "./" or "/" from a relative path
+    count = 0
+    for i in range(0, len(res)):
+        if res[i] == "." or res[i] == "/":
+            count += 1
+        else:
+            break
+    res = res[count:]
+
+    # resolve relative image paths against the page URL
+    if "http://" not in res and "https://" not in res:
+        res = url + "/" + res
+
+    return res
+
+
+# print the preview dictionary
+def printData(data):
+    print("\nTitle : ", data["title"])
+    print("Description : ", data["description"])
+    print("URL : ", data["url"])
+    print("Image link : ", data["image"])
+
+
+# start
+print("\n======================")
+print("-    Link Preview    -")
+print("======================\n")
+
+# get url from user
+url = input("Enter URL to preview : ")
+
+# parsing and checking the url
+if url == "":
+    url = 'www.girlscript.tech'
+if "http://" not in url and "https://" not in url:
+    url = "https://" + url
+
+# first check in the db
+db = {}
+
+# create the db file if it doesn't exist
+if not os.path.exists('Link-Preview/db.json'):
+    f = open('Link-Preview/db.json', "w")
+    f.write("{}")
+    f.close()
+
+# read the db
+with open('Link-Preview/db.json', 'r+') as file:
+    data = file.read()
+    if len(data) == 0:
+        data = "{}"
+        file.write(data)
+    db = json.loads(data)
+
+# serve from the db while the cached entry has not expired yet
+if url in db and db[url]["time"] > round(time.time()):
+    printData(db[url])
+else:
+    # not in the db (or expired): fetch the page and scrape it
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, "html.parser")
+
+    sevenDaysInSec = 7 * 24 * 60 * 60
+    newData = {
+        "title": getTitle(soup),
+        "description": getDesc(soup),
+        "url": url,
+        "image": getImage(soup, url),
+        # expiry timestamp: now (in seconds) plus seven days
+        "time": round(time.time()) + sevenDaysInSec
+    }
+    printData(newData)
+
+    # persist the new entry
+    db[url] = newData
+    with open('Link-Preview/db.json', 'w') as file:
+        json.dump(db, file)
+
+print("\n--END--\n")
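A note on the scraper design: each function tries the richest metadata first (Open Graph tags, then Twitter card tags, then generic elements such as `<title>`, `<h1>`, the first `<p>`, or the first `<img>`), so the preview still degrades gracefully on pages that publish no social metadata.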
