In [942]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome import options
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup 
import pandas as pd
from datetime import datetime, timedelta

In [943]:
driver = webdriver.Chrome()
driver.get('https://steamdb.info/sales/')

In [944]:
    select_element = driver.find_element(By.ID, 'dt-length-0')

    select = Select(select_element)

    select.select_by_visible_text('All (slow)')

In [945]:
tableGames = driver.find_element(By.ID, 'DataTables_Table_0')

In [946]:
htmlContent = tableGames.get_attribute("outerHtml")

In [947]:
try:
    table_games = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="DataTables_Table_0"]'))
    )

    html_content = table_games.get_attribute("outerHTML")

    soup = BeautifulSoup(html_content, 'html.parser')

except Exception as e:
    print("Ocorreu um erro:", str(e))
finally:
     driver.quit()

In [948]:
rows = soup.find_all('tr', class_='app')

In [949]:
def extract_data(row):
        appid = row.get('data-appid', 'N/A')
        name = row.find('a', class_='b').text if row.find('a', class_='b') else 'N/A'
        discount = row.find('td', class_='price-discount-major dt-type-numeric') or row.find('td', class_='price-discount dt-type-numeric')
        price_elements = row.find_all('td', class_='dt-type-numeric')
        price = price_elements[2].text.strip() if len(price_elements) > 2 else 'N/A'
        rating = price_elements[3].text.strip() if len(price_elements) > 3 else 'N/A'
        release = price_elements[4].text.strip() if len(price_elements) > 4 else 'N/A'

        td_elements = row.find_all('td', class_='timeago dt-type-numeric')
        
        def convert_timestamp(data_sort):
            if data_sort and data_sort.isdigit():
                return datetime.utcfromtimestamp(int(data_sort)).strftime('%Y-%m-%d')
            return 'N/A'

        ends = convert_timestamp(td_elements[0].get('data-sort', '')) if len(td_elements) > 0 else 'N/A'
        started = convert_timestamp(td_elements[1].get('data-sort', '')) if len(td_elements) > 1 else 'N/A'
        
        return {
            'AppID': appid,
            'Name': name,
            'Discount': discount,
            'Price': price,
            'Rating': rating,
            'Release': release,
            'Ends': ends,
            'Started': started
        }


In [951]:
data = [extract_data(row) for row in rows]

In [952]:
df = pd.DataFrame(data)

In [953]:
df

Unnamed: 0,AppID,Name,Discount,Price,Rating,Release,Ends,Started
0,1507190,Machinika: Museum,[-100%],"R$ 0,00",88.19%,Mar 2021,2024-05-27,2024-05-13
1,289130,ENDLESS™ Legend,[-100%],"R$ 0,00",82.23%,Sep 2014,2024-05-23,2024-05-16
2,1586800,Lil Gator Game,[-50%],"R$ 29,99",95.00%,Dec 2022,2024-05-31,2024-05-17
3,1250,Killing Floor,[-90%],"R$ 3,49",93.36%,May 2009,2024-05-23,2024-05-16
4,874260,The Forgotten City,[-65%],"R$ 25,89",93.25%,Jul 2021,2024-05-27,2024-05-13
...,...,...,...,...,...,...,...,...
724,1531720,Need for Drive - Open World Multiplayer Racing,,"R$ 1,74",60.41%,Feb 2021,2024-06-05,2024-05-22
725,526790,Monsti,,"R$ 1,74",60.39%,Sep 2016,2024-05-29,2024-05-15
726,248470,Doorways: Prelude,,"R$ 6,59",60.34%,Sep 2013,2024-05-27,2024-05-20
727,293180,Overcast - Walden and the Werewolf,,"R$ 1,74",60.11%,Apr 2014,2024-05-30,2024-05-16


In [954]:
df['Release'] = df['Release'].replace('—', '')
df['Release'] = pd.to_datetime(df['Release'], errors='coerce')
df['Release'] = df['Release'].dt.strftime('%Y-%m')

df['Ends'] = pd.to_datetime(df['Ends'], errors='coerce')
df['Started'] = pd.to_datetime(df['Started'],errors='coerce')

today = (datetime.today())

df['Ends'] = (df['Ends'] - today + (timedelta(days=1))).dt.days
df['Started'] = (df['Started'] - today + (timedelta(days=1))).dt.days

  df['Release'] = pd.to_datetime(df['Release'], errors='coerce')


In [955]:
df.head(10)

Unnamed: 0,AppID,Name,Discount,Price,Rating,Release,Ends,Started
0,1507190,Machinika: Museum,[-100%],"R$ 0,00",88.19%,2021-03,5,-9.0
1,289130,ENDLESS™ Legend,[-100%],"R$ 0,00",82.23%,2014-09,1,-6.0
2,1586800,Lil Gator Game,[-50%],"R$ 29,99",95.00%,2022-12,9,-5.0
3,1250,Killing Floor,[-90%],"R$ 3,49",93.36%,2009-05,1,-6.0
4,874260,The Forgotten City,[-65%],"R$ 25,89",93.25%,2021-07,5,-9.0
5,757480,Broken Reality,[-85%],"R$ 4,34",88.73%,2018-11,1,-6.0
6,1723260,CaseCracker,[-50%],"R$ 10,34",88.62%,2022-12,11,-3.0
7,1232570,Paper Beast,[-75%],"R$ 11,49",88.01%,2020-07,5,-9.0
8,1710170,Blade of Darkness,[-60%],"R$ 11,59",87.89%,2021-10,7,-7.0
9,232090,Killing Floor 2,[-95%],"R$ 2,79",86.53%,2016-11,1,-6.0


In [956]:
print(df.dtypes)

AppID        object
Name         object
Discount     object
Price        object
Rating       object
Release      object
Ends          int64
Started     float64
dtype: object


In [957]:
def removerCaracteres(df,col, caract):
    for char in caract:
        df[col] = df[col].str.replace(char,"")
        
    return df

In [958]:
df = removerCaracteres(df,'Price',['R',"$"])
df = removerCaracteres(df,'Rating',["%"])


In [959]:
df.head()

Unnamed: 0,AppID,Name,Discount,Price,Rating,Release,Ends,Started
0,1507190,Machinika: Museum,[-100%],0,88.19,2021-03,5,-9.0
1,289130,ENDLESS™ Legend,[-100%],0,82.23,2014-09,1,-6.0
2,1586800,Lil Gator Game,[-50%],2999,95.0,2022-12,9,-5.0
3,1250,Killing Floor,[-90%],349,93.36,2009-05,1,-6.0
4,874260,The Forgotten City,[-65%],2589,93.25,2021-07,5,-9.0


In [960]:
df['Discount'] = df['Discount'].apply(lambda x: re.sub('<.*?>', '', str(x)))
df = removerCaracteres(df,'Discount',["%"])

In [961]:
df.head()

Unnamed: 0,AppID,Name,Discount,Price,Rating,Release,Ends,Started
0,1507190,Machinika: Museum,-100,0,88.19,2021-03,5,-9.0
1,289130,ENDLESS™ Legend,-100,0,82.23,2014-09,1,-6.0
2,1586800,Lil Gator Game,-50,2999,95.0,2022-12,9,-5.0
3,1250,Killing Floor,-90,349,93.36,2009-05,1,-6.0
4,874260,The Forgotten City,-65,2589,93.25,2021-07,5,-9.0


In [962]:
df['Started'] = pd.to_numeric(df['Started'], errors='coerce')
df['Started'] = df['Started'].fillna(0).astype(int)


In [965]:
print(df.dtypes)

AppID       object
Name        object
Discount    object
Price       object
Rating      object
Release     object
Ends         int64
Started      int32
dtype: object


In [966]:
df['Discount'] = pd.to_numeric(df['Discount'], errors='coerce')
df['Discount'] = df['Discount'].astype(float)
df['Discount'] = df['Discount'].apply(lambda x: x / (-100) if pd.notnull(x) else x)


df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Rating'] = df['Rating'].astype(float)

df['Price'] = df['Price'].str.replace(',', '.')
df['Price'] = df['Price'].astype(float)

df['Release'] = df['Release'].replace('—', '')
df['Release'] = pd.to_datetime(df['Release'], errors='coerce')
df['Release'] = df['Release'].dt.strftime('%Y-%m')

df['AppID'] = df['AppID'].astype(int)

In [967]:
print(df.dtypes)

AppID         int32
Name         object
Discount    float64
Price       float64
Rating      float64
Release      object
Ends          int64
Started       int32
dtype: object


In [968]:
df.head(50)

Unnamed: 0,AppID,Name,Discount,Price,Rating,Release,Ends,Started
0,1507190,Machinika: Museum,1.0,0.0,88.19,2021-03,5,-9
1,289130,ENDLESS™ Legend,1.0,0.0,82.23,2014-09,1,-6
2,1586800,Lil Gator Game,0.5,29.99,95.0,2022-12,9,-5
3,1250,Killing Floor,0.9,3.49,93.36,2009-05,1,-6
4,874260,The Forgotten City,0.65,25.89,93.25,2021-07,5,-9
5,757480,Broken Reality,0.85,4.34,88.73,2018-11,1,-6
6,1723260,CaseCracker,0.5,10.34,88.62,2022-12,11,-3
7,1232570,Paper Beast,0.75,11.49,88.01,2020-07,5,-9
8,1710170,Blade of Darkness,0.6,11.59,87.89,2021-10,7,-7
9,232090,Killing Floor 2,0.95,2.79,86.53,2016-11,1,-6


In [969]:
df["Discount"] = df["Discount"].fillna(0)

In [970]:
df.head(5)

Unnamed: 0,AppID,Name,Discount,Price,Rating,Release,Ends,Started
0,1507190,Machinika: Museum,1.0,0.0,88.19,2021-03,5,-9
1,289130,ENDLESS™ Legend,1.0,0.0,82.23,2014-09,1,-6
2,1586800,Lil Gator Game,0.5,29.99,95.0,2022-12,9,-5
3,1250,Killing Floor,0.9,3.49,93.36,2009-05,1,-6
4,874260,The Forgotten City,0.65,25.89,93.25,2021-07,5,-9


In [971]:
df.to_json(r'C:\Users\Raul\Desktop\Programação\Dados\Pandas\SteamSales.json', orient='records',lines=True)

In [972]:
df.to_csv(r'C:\Users\Raul\Desktop\Programação\Dados\Pandas\SteamSales.csv', index=False)