# script.py
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import datetime
import re
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
# ---------------------------------------------------------------------------
# Run configuration and browser start-up.
# ---------------------------------------------------------------------------

# Base URL of the paginated news listing; the crawl loop appends the page
# number directly to this string.
url = "https://cryptopotato.com/category/crypto-news/page/"
# The crawl loop iterates over range(num_pages + 1).
num_pages = 1
# Not referenced anywhere else in the visible script; kept so the name stays
# defined for any code that may rely on it.
page_links = []

# Date window used to decide which listed articles count as recent.
today = datetime.date.today()
# NOTE(review): despite its name, `last_week` is only ONE day before `today`
# (timedelta(days=1)) - confirm whether a 7-day window was intended.
last_week = today - datetime.timedelta(days=1)

# Headless Chrome, with the sandbox disabled.
options = Options()
for flag in ("--no-sandbox", "--headless"):
    options.add_argument(flag)

# webdriver_manager's install() supplies the chromedriver path for the service.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)
import random
# Article URLs collected from the listing pages; filled by the loop below.
data_links = []

# NOTE(review): the nesting below was reconstructed from statement order
# because this copy of the script had lost its indentation. In particular,
# confirm that `print(date_obj)` belongs inside the date-window check.
try:
    # NOTE(review): range(num_pages + 1) starts at 0, so the first request is
    # ".../page/0" - confirm page 0 is intended and is not simply a duplicate
    # of page 1.
    for page in range(num_pages + 1):
        page_url = f"{url}{page}"
        # Random pause before each page load.
        time.sleep(random.uniform(1, 15))
        driver.get(page_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        links = soup.select('h3.media-heading a')
        # Each <time class="date"> element is expected to wrap an <a> whose
        # text is the publication date, e.g. "Jan 05, 2024".
        date_list = [
            date_element.a.text.strip()
            for date_element in soup.find_all("time", class_="date")
        ]

        # Count the dates on this page that fall in (last_week, today].
        count = 0
        for date_str in date_list:
            date_obj = datetime.datetime.strptime(date_str, "%b %d, %Y").date()
            if last_week < date_obj <= today:
                count += 1
                print(date_obj)

        if count == 0:
            # No recent article on this page: stop paging.
            break

        # Takes the first `count` links on the page. This presumes the listing
        # is newest-first and that links and <time> elements line up
        # one-to-one - TODO confirm against the live markup.
        data_links.extend(link['href'] for link in links[:count])
finally:
    # Always shut the browser down, even if a page fails to load or parse;
    # otherwise the headless Chrome process is left running.
    driver.quit()
# Independent copy of the collected article URLs. The original element-wise
# append wrapped in a bare `except: pass` could not fail in any meaningful
# way, so a plain list copy is equivalent.
final_links = list(data_links)
print(len(final_links))
# Parallel lists: index i of each list describes the same article.
titles = []
bodies = []
dates = []
# Starts at 1 and is bumped after every successfully scraped article; it is
# not read anywhere else in the visible script.
count = 1

# NOTE(review): the slice drops the LAST 17 collected links, so nothing is
# scraped when 17 or fewer links were collected. The reason for 17 is not
# visible in this file - confirm it is still wanted.
for link in final_links[:-17]:
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(link, timeout=30)
        # Treat HTTP error pages as failures instead of parsing them.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Any of these lookups raises AttributeError when the expected
        # element is missing from the page.
        title = soup.find("div", class_="page-title").find("h1").text
        body = soup.find("div", class_="entry-content col-sm-11").text
        date = soup.find("span", class_="last-modified-timestamp").text
    except Exception as exc:
        # Best-effort scraping: report the failing URL and the cause, then
        # move on. Unlike a bare `except:`, this lets Ctrl-C / SystemExit
        # stop the run.
        print(f"An error occurred while scraping {link}: {exc!r}")
    else:
        # Append only after all three fields were extracted, so the three
        # lists always stay the same length.
        titles.append(title)
        bodies.append(body)
        dates.append(date)
        count += 1
    # Random pause between requests, after failures as well, so a struggling
    # or rate-limiting server is not hit again immediately.
    time.sleep(random.uniform(1, 15))
import pandas as pd
# Assemble the scraped fields into a table, one row per article.
columns = {'Title': titles, 'Content': bodies, 'Date': dates}
df = pd.DataFrame(columns)

# Today's date as YYYY-MM-DD. It is currently unused: the output name below is
# fixed, so every run overwrites the same file. To write one file per day
# instead, build the name as f"crypto_{current_date}.csv".
current_date = datetime.datetime.now().strftime("%Y-%m-%d")
csv_file_name = "crypto.csv"

# Write the table without the DataFrame index column.
df.to_csv(csv_file_name, index=False)