 import requests
 from bs4 import BeautifulSoup

-url = 'https://www.girlscript.tech/'
-r = requests.get(url)
-soup = BeautifulSoup(r.text,"html.parser")
-# print(soup)
+# to scrape title
+
+
+def getTitle(soup):
+    # prefer Open Graph, then Twitter metadata, then visible page elements
+    ogTitle = soup.find("meta", property="og:title")
+    twitterTitle = soup.find("meta", attrs={"name": "twitter:title"})
+    documentTitle = soup.find("title")
+    h1Title = soup.find("h1")
+    h2Title = soup.find("h2")
+    pTitle = soup.find("p")
+
+    tag = ogTitle or twitterTitle or documentTitle or h1Title or h2Title or pTitle
+    if tag is None:
+        return "Not available"
+    # meta tags keep their text in the "content" attribute, other tags in their body
+    res = tag.get("content") or tag.get_text()
+
+    # trim long titles for a compact preview
+    if len(res) > 60:
+        res = res[:60]
+    return res.strip()
+
+# to scrape page description
+
+
+def getDesc(soup):
+    # prefer Open Graph, then Twitter, then the standard meta description
+    ogDesc = soup.find("meta", property="og:description")
+    twitterDesc = soup.find("meta", attrs={"name": "twitter:description"})
+    metaDesc = soup.find("meta", attrs={"name": "description"})
+    pDesc = soup.find("p")
+
+    tag = ogDesc or twitterDesc or metaDesc or pDesc
+    if tag is None:
+        return "Not available"
+    res = tag.get("content") or tag.get_text()
+
+    if len(res) > 60:
+        res = res[:60]
+    return res.strip()

-title = soup.find("meta", property = "og:title")
-description = soup.find("meta", property = "og:description")
-url = soup.find("meta", property = "og:url")
-img = soup.find("meta", property = "og:image")
+# to scrape image link


-print("Title : ", title.get("content", None))
-print("Description : ", description.get("content", None))
-print("URL : ", url.get("content", None))
-print("Image Link : ", img.get("content", None))
+def getImage(soup, url):
+    # prefer Open Graph, then Twitter metadata, then <link rel="img_src">, then the first <img>
+    ogImg = soup.find("meta", property="og:image")
+    twitterImg = soup.find("meta", attrs={"name": "twitter:image"})
+    metaImg = soup.find("link", attrs={"rel": "img_src"})
+    img = soup.find("img")
+
+    tag = ogImg or twitterImg or metaImg or img
+    if tag is None:
+        return "Not available"
+    # meta tags use "content", <link> uses "href", <img> uses "src"
+    res = tag.get("content") or tag.get("href") or tag.get("src")
+    if not res:
+        return "Not available"
+
+    # resolve a relative image path against the page URL
+    if not res.startswith(("http://", "https://")):
+        res = res.lstrip("./")
+        res = url.rstrip("/") + "/" + res
+
+    return res
+
+
+
+# start
+print("\n======================")
+print("- Link Preview -")
+print("======================\n")
+
+# get url from user
+url = input("Enter URL to preview : ")
+
+# parsing and checking the url
+if url == "":
+    url = 'www.girlscript.tech'
+# add a scheme only when the URL does not already start with one
+if not url.startswith(("http://", "https://")):
+    url = "https://" + url
+
+# getting the html
+r = requests.get(url)
+soup = BeautifulSoup(r.text, "html.parser")
+
+# printing values
+print("\nTitle : ", getTitle(soup))
+print("Description : ", getDesc(soup))
+print("URL : ", url)
+print("Image link : ", getImage(soup, url))
+print("\n--END--\n")
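
Side note, not part of this commit: the scheme check and the relative-image handling above can also be expressed with Python's standard urllib.parse. The sketch below is only an illustrative alternative under that assumption; the fetchImage helper name is made up for the example and does not appear in the script.

    # illustrative sketch, not part of the commit above
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin, urlparse

    def fetchImage(url):  # hypothetical helper, not from the original script
        # add a scheme only when the URL has none
        if not urlparse(url).scheme:
            url = "https://" + url
        r = requests.get(url, timeout=10)   # timeout avoids hanging on slow hosts
        r.raise_for_status()                # surface HTTP errors instead of parsing error pages
        soup = BeautifulSoup(r.text, "html.parser")

        tag = soup.find("meta", property="og:image") or soup.find("img")
        src = (tag.get("content") or tag.get("src")) if tag else None
        # urljoin resolves relative paths like "logo.png" or "/img/logo.png" against the page URL
        return urljoin(url, src) if src else "Not available"

Calling fetchImage on a bare domain first normalizes it to an https:// URL, then returns the resolved image link, or "Not available" when the page exposes none.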