In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import openpyxl
from datetime import datetime
import re

In [2]:
# Get the number of last page avalaible
def get_last_news_page() -> int:
    url = "https://www.gamespot.com/news/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    btn = soup.find_all('a', class_='btn', recursive=True)
    pages = []
    for b in btn:
        try:
            p = int(b.text)
            pages.append(p)
        except:
            continue
    last_page = max(pages)
    return last_page

In [3]:
# Get all news from first_page to the last_page. Returns array of dictionaries where each of them has info about 100 pages of news
def get_gamespot_news(last_page: int, news_min: int, news_max: int, display: bool = False) -> dict:    
    titles, links, dates, pages = [],[],[],[]
    url = "https://www.gamespot.com/news/"
    sum_news = 0
    for i in range(last_page):
        try:
            response = requests.get(url + "?page=" + str(last_page - i))
            soup = BeautifulSoup(response.text, 'html.parser')
            news = soup.find_all('div', class_="card-item__content")
            news = news[::-1]
            if (len(news) + sum_news) < news_min:
                sum_news += len(news)
                continue
            elif (len(news) + 120 + sum_news) < sum_news:
                sum_news += len(news) + 120
                i += 2
                continue
            for j in range(len(news)):
                if news_min <= sum_news + 1 <= news_max:
                    a = news[j].find('a', class_='card-item__link')
                    title = a.text
                    link = "https://www.gamespot.com" + str(a.get('href'))
                    date = news[j].find('time').get('datetime')
                    titles.append(title)
                    links.append(link)
                    dates.append(date)
                    pages.append(last_page - i)
                    if display == True:
                        if (sum_news - news_min + 1) % 100 == 0 and (sum_news - news_min + 1) != 0:
                            loaded = (sum_news - news_min + 1)
                            percent = round((100 * loaded)/(news_max - news_min + 1), 2)
                            print(loaded, "news loaded ( " + str(percent) + "% )")
                if sum_news > news_max:
                    final_dict = {}
                    final_dict['titles'] = titles
                    final_dict['links'] = links
                    final_dict['dates'] = dates
                    return final_dict
                
                sum_news += 1
        except:
            print("Error occured")
            break


In [4]:
folder_name = datetime.now()
folder_name = folder_name.strftime('%Y_%m_%d %H-%M-%S')
os.makedirs("Data/" + folder_name, exist_ok=True)
n = 4
from_value = 1
to_value = 500
last_page = get_last_news_page()
for i in range(n):
    l = from_value + (to_value * i)
    r = to_value + (to_value * i)
    dict_news = get_gamespot_news(last_page, l, r, True)
    df = pd.DataFrame(dict_news)
    df.to_csv("Data/" + folder_name + "/news" + str(l) + '-' + str(r) + ".csv", index=False)
    print("news", str(l) + '-' + str(r), "saved\n")

100 news loaded ( 20.0% )
200 news loaded ( 40.0% )
300 news loaded ( 60.0% )
400 news loaded ( 80.0% )
news 1-500 saved

100 news loaded ( 20.0% )
200 news loaded ( 40.0% )
300 news loaded ( 60.0% )
400 news loaded ( 80.0% )
news 501-1000 saved

100 news loaded ( 20.0% )
200 news loaded ( 40.0% )
300 news loaded ( 60.0% )
400 news loaded ( 80.0% )
news 1001-1500 saved

100 news loaded ( 20.0% )
200 news loaded ( 40.0% )
300 news loaded ( 60.0% )
400 news loaded ( 80.0% )
news 1501-2000 saved



In [5]:
target_folder_name = "2024_05_12 23-38-02"
os.makedirs("Data/Sheets", exist_ok=True)
file_names = os.listdir("Data/" + target_folder_name)

filenames_values = []
csv_file_names = []
for f in file_names:
    pattern = r'news(\d+)-(\d+)\.csv'
    match = re.match(pattern, f)
    if match:
        value1 = int(match.group(1))
        value2 = int(match.group(2))
        filenames_values.append(value1)
        filenames_values.append(value2)
        csv_file_names.append(f)

excel_path = "Data/Sheets/NewsSheet" + target_folder_name + " " + str(min(filenames_values)) + '-' + str(max(filenames_values)) + ".xlsx"
with pd.ExcelWriter(excel_path) as writer:

    for f in csv_file_names:
        df = pd.read_csv("Data/" + target_folder_name + '/' + f)
        name = f.replace(".csv", "")
        df.to_excel(writer, sheet_name=name, index=False)
        workbook = writer.book
        worksheet = workbook[name] 
        
        for row in worksheet.iter_rows():
            for cell in row:
                cell.alignment = openpyxl.styles.Alignment(wrap_text=True)
        for col in worksheet.columns:
            max_length = 0
            column = col[0].column_letter
            for cell in col:
                try:
                    if len(str(cell.value)) > max_length:
                        max_length = len(cell.value)
                except:
                    pass
            adjusted_width = (max_length + 2) * 1.2 
            worksheet.column_dimensions[column].width = adjusted_width
