Skip to content

Commit db61165

Browse files
committed
title, description, and image link scraper added
1 parent 1c1c9c7 commit db61165

File tree

2 files changed

+86
-12
lines changed

2 files changed

+86
-12
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,3 +679,4 @@ test.py
679679
Test/
680680
reddit_tokens.json
681681
scriptcopy.py
682+
.vscode

Link-Preview/linkPreview.py

Lines changed: 85 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,91 @@
11
import requests
22
from bs4 import BeautifulSoup
33

4-
url = 'https://www.girlscript.tech/'
5-
r = requests.get(url)
6-
soup = BeautifulSoup(r.text,"html.parser")
7-
# print(soup)
4+
# to scrape title
5+
6+
7+
def getTitle(soup):
    """Return a short title for the parsed page.

    Candidates are tried in priority order: the ``og:title`` meta tag, the
    ``twitter:title`` meta tag, ``<title>``, then the first ``<h1>``,
    ``<h2>`` and ``<p>``. The first candidate that yields non-blank text
    wins, so an empty meta tag no longer hides a usable ``<title>``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The title with surrounding whitespace removed and cut to at most
        60 characters, or ``"Not available"`` when no candidate has text.
    """
    candidates = (
        soup.find("meta", property="og:title"),
        soup.find("meta", attrs={"name": "twitter:title"}),
        soup.find("title"),
        soup.find("h1"),
        soup.find("h2"),
        soup.find("p"),
    )

    for tag in candidates:
        if tag is None:
            continue
        # Meta tags have no text node; their value lives in ``content``.
        text = (tag.get_text() or tag.get("content") or "").strip()
        if text:
            # Strip first so leading whitespace does not eat into the
            # 60-character budget, then tidy a cut that lands on a space.
            return text[:60].rstrip()

    return "Not available"
23+
24+
# to scrape page description
25+
26+
27+
def getDesc(soup):
    """Return a short description for the parsed page.

    Candidates are tried in priority order: the ``og:description`` meta
    tag, the ``twitter:description`` meta tag, the plain ``description``
    meta tag, then the first ``<p>``. The first candidate that yields
    non-blank text wins.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The description with surrounding whitespace removed and cut to at
        most 60 characters, or ``"Not available"`` when no candidate has
        text.
    """
    candidates = (
        soup.find("meta", property="og:description"),
        soup.find("meta", attrs={"name": "twitter:description"}),
        soup.find("meta", attrs={"name": "description"}),
        soup.find("p"),
    )

    for tag in candidates:
        if tag is None:
            continue
        # Meta tags have no text node; their value lives in ``content``.
        text = (tag.get_text() or tag.get("content") or "").strip()
        if text:
            # Strip before truncating so padding is not counted against
            # the 60-character limit.
            return text[:60].rstrip()

    return "Not available"
841

9-
title = soup.find("meta", property = "og:title")
10-
description = soup.find("meta", property = "og:description")
11-
url = soup.find("meta", property = "og:url")
12-
img = soup.find("meta", property = "og:image")
42+
# to scrape image link
1343

1444

15-
print("Title : ", title.get("content", None))
16-
print("Description : ", description.get("content", None))
17-
print("URL : ", url.get("content", None))
18-
print("Image Link : ", img.get("content", None))
45+
def getImage(soup, url):
    """Return an absolute link to a preview image for the page.

    Candidates are tried in priority order: the ``og:image`` meta tag, the
    ``twitter:image`` meta tag, a ``<link rel="img_src">`` element, then
    the first ``<img>``. Each candidate is read from the attribute that
    carries its URL (``content``, ``href`` or ``src``).

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.
        url: Address of the page, used as the base for relative links.

    Returns:
        The image URL resolved against ``url``, or ``"Not available"``
        when no candidate carries a link.
    """
    # Imported here so the function works without touching the file's
    # top-level import block.
    from urllib.parse import urljoin

    candidates = (
        (soup.find("meta", property="og:image"), "content"),
        (soup.find("meta", attrs={"name": "twitter:image"}), "content"),
        (soup.find("link", attrs={"rel": "img_src"}), "href"),
        (soup.find("img"), "src"),
    )

    for tag, attr in candidates:
        if tag is None:
            continue
        link = (tag.get(attr) or "").strip()
        if link:
            # urljoin leaves absolute http(s) links untouched and resolves
            # "./x", "/x", "x" and "//host/x" against the page address,
            # replacing the hand-rolled prefixing that mangled such links.
            return urljoin(url, link)

    return "Not available"
66+
67+
68+
# start
print("\n======================")
print("- Link Preview -")
print("======================\n")

# get url from user; surrounding whitespace from a paste is not part of it
url = input("Enter URL to preview : ").strip()

# parsing and checking the url
if url == "":
    # nothing entered: fall back to the demo site
    url = 'www.girlscript.tech'
# Prepend a scheme only when none was supplied. The previous test joined
# two "not in" checks with "or", which is true for nearly every input and
# produced addresses such as "https://https://example.com".
if not url.lower().startswith(("http://", "https://")):
    url = "https://" + url

# getting the html; the timeout keeps an unresponsive host from hanging
# the script indefinitely
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, "html.parser")

# printing values
print("\nTitle : ", getTitle(soup))
print("Description : ", getDesc(soup))
print("URL : ", url)
print("Image link : ", getImage(soup, url))
print("\n--END--\n")

0 commit comments

Comments
 (0)