In [4]:
import aiohttp, http, asyncio, requests, re, time, os
from bs4 import BeautifulSoup
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [5]:
# Notebook-wide pandas display limits: number of rows, number of columns and
# the maximum text width of a single cell.
_DISPLAY_OPTIONS = {
    'display.max_rows': 8000,
    'display.max_columns': 19,
    'display.max_colwidth': 16,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

FUNCTIONS

In [6]:
def get_url(url, headers=None, params=None, timeout=30):
    """Download a page and parse its body as HTML.

    Args:
        url: Address to request.
        headers: Optional HTTP headers forwarded to ``requests.get``.
        params: Optional query parameters forwarded to ``requests.get``.
        timeout: Seconds ``requests`` waits on the server before raising
            instead of blocking indefinitely. Defaults to 30.

    Returns:
        A ``(status_code, soup)`` tuple: the HTTP status code of the response
        and a ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Fixed one-second pause after every request, whatever the status code.
    time.sleep(1)

    return response.status_code, soup

def fetch_dict(soup, gameid):
    """Extract the per-row player statistics from a parsed page.

    The text of the first ``<tbody>`` is split on runs of newlines/tabs and
    consumed five tokens per table row; tokens 0, 1 and 4 of every row are
    kept, all as raw strings.

    Args:
        soup: Parsed document exposing ``find("tbody")``.
        gameid: Id of the game the page belongs to; only used in error
            messages.

    Returns:
        dict mapping a running row number (starting at 0) to
        ``{"Date": ..., "Peak Players": ..., "Avg Players": ...}``.

    Raises:
        ValueError: If the page has no ``<tbody>``, or if its text does not
            split into complete five-token rows (this includes an empty
            table). These cases previously surfaced as ``AttributeError`` /
            ``IndexError``.
    """
    row_width = 5

    table_body = soup.find("tbody")
    if table_body is None:
        raise ValueError(f"No <tbody> found on the page for app id {gameid}")

    tokens = re.split(r"[\n\t]+", table_body.get_text().strip())
    # An empty table splits into [''], which is caught by this check as well.
    if len(tokens) % row_width != 0:
        raise ValueError(
            f"Table for app id {gameid} has {len(tokens)} text tokens, "
            f"which is not a whole number of {row_width}-token rows"
        )

    # NOTE(review): token 1 is labelled "Peak Players" and token 4
    # "Avg Players", exactly as before; confirm against the live table's
    # column order that the two labels are not swapped.
    return {
        row_number: {
            "Date": tokens[start],
            "Peak Players": tokens[start + 1],
            "Avg Players": tokens[start + 4],
        }
        for row_number, start in enumerate(range(0, len(tokens), row_width))
    }

def overall_fetch(list_id):
    """Scrape the steamcharts page of every game id in ``list_id``.

    Each id is converted to ``str``, its page is requested through
    ``get_url`` and, on HTTP 200, parsed with ``fetch_dict``. Any failure is
    recorded and the loop moves on to the next id.

    Args:
        list_id: Iterable of Steam app ids.

    Returns:
        A ``(game_id_dict, error_list)`` tuple. ``game_id_dict`` maps the id
        (as ``str``) to the dict returned by ``fetch_dict``. ``error_list``
        holds ``(id, status)`` tuples for ids that were skipped; ``status`` is
        the HTTP status code of that id's response, or ``None`` when the
        request itself raised before a response was received.
    """
    error_list = []
    game_id_dict = {}
    # Identical for every request, so built once outside the loop.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

    for gameid in list_id:
        gameid = str(gameid)
        url = f'https://steamcharts.com/app/{gameid}'
        # Reset per game: the handler below reads `stat`, which would
        # otherwise be unbound on a first-request failure or still hold the
        # previous game's status code.
        stat = None
        try:
            # NOTE(review): the id is also sent as `params`, as in the
            # original call, although it is already part of the URL path;
            # confirm whether the site needs it.
            stat, soup = get_url(url, headers=headers, params=gameid)
            if stat == 200:
                print(f"Webcode status is : {stat} for {gameid} id. ")
                game_id_dict[gameid] = fetch_dict(soup, gameid)
            else:
                error_list.append((gameid, stat))
                print(f"Appid: {gameid} reported an error {stat}, skipping it for now")
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole run.
            error_list.append((gameid, stat))
            print(f"App ID: {gameid} reported an error: {str(e)}, skipping it for now.")

    return game_id_dict, error_list

MAIN CODE

In [7]:
# Build the list of Steam app ids to scrape from the top-1k CSV file.
# NOTE(review): the path is a raw string that also doubles every backslash,
# and it is an absolute path on one machine; confirm it resolves as intended.
txt = (
    pd.read_csv(r"C:\\Users\\Utilizador\\Desktop\\IRONHACK\\Project 3\\Project-3\\top1k.csv")
    # Shorter column names for the later cells.
    .rename(columns={"Game Title": "title", "Steam App ID": "id"})
)
list_id = txt["id"].tolist()

In [None]:
## Main program: scrape every game id and collect the results.
# overall_fetch requests one steamcharts page per id, and get_url pauses one
# second after each request, so this cell needs at least one second per id.
# It returns the per-game data keyed by id plus a list of (id, status) entries
# for the ids that were skipped.

game_id_dict, error_list = overall_fetch(list_id)

In [31]:
# Persist the scraped data as a feather file: the outer keys of game_id_dict
# (the game ids) become the columns of the table.
output = pd.DataFrame.from_dict(game_id_dict, orient="columns")
output.to_feather("output.feather")

In [32]:
# Persist the (app id, status) entries of the skipped games as a CSV file.
err_table = pd.DataFrame(data=error_list)
err_table.to_csv(path_or_buf="error.csv")

In [None]:
### Bridge between the web-scraping/API part and the cleaning/processing part

# Reload the saved feather table, then show the number of missing values per
# column and the overall shape.
# NOTE(review): this reads an absolute path, whereas the save cell writes the
# relative "output.feather"; confirm both refer to the same file.
output = pd.read_feather("C:\\Users\\Utilizador\\Desktop\\IRONHACK\\Project 3\\Project-3\\output.feather")
# Wrapped in display(): as a bare expression in the middle of the cell the NaN
# counts were computed but never shown.
display(output.isna().sum())
display(output.shape)

In [109]:
# Remove the most recent games: every column with more than MAX_MISSING_ROWS
# missing rows is dropped (the original note ties the value 85 to Aug 2019).
MAX_MISSING_ROWS = 85

out2 = output.copy()
filtered = out2.isna().sum(axis=0) > MAX_MISSING_ROWS

# Labels of the flagged columns. The intermediate selection used to live in a
# variable called `filter`, which shadowed the builtin for every later cell.
mask = list(filtered[filtered].index)

# One drop call instead of one in-place drop per column.
out2 = out2.drop(columns=mask)

In [None]:
# Index the table by date, using the "Date" entries found in column "730".
out2_extracted = out2.applymap(
    lambda cell: cell["Date"] if isinstance(cell, dict) else cell
)
out3 = out2.assign(Date=out2_extracted["730"]).set_index('Date')

# Preview only: the same table reduced to each cell's "Avg Players" entry.
out3_extracted = out3.applymap(
    lambda cell: cell["Avg Players"] if isinstance(cell, dict) else cell
)
out3_extracted

In [None]:
# Relabel the game columns from Steam app id to game title.

# Lookup table: app id (as text) -> title.
txt2 = (
    txt.rename(columns={"Game Title": "title", "Steam App ID": "id"})[["id", "title"]]
    .set_index("id")
)
title_mapping = {str(app_id): title for app_id, title in txt2["title"].items()}

# Columns without a matching id keep their current label.
out4 = out3.copy()
out4.columns = [title_mapping.get(str(col), col) for col in out4.columns]

# Main table for the analysis: keep one metric per cell, treat missing values
# as 0 and round to whole numbers.
visual = "Peak Players"
out4 = out4.applymap(lambda cell: cell[visual] if isinstance(cell, dict) else cell)
out4 = out4.fillna(0).astype(float).round(0).astype(int)
out4

In [None]:
# Further clean-up decided while inspecting the data (outliers etc.).

## "POSTAL" is removed: per the original note it produces odd spikes.
out5 = out4.drop(columns="POSTAL")

# Row-wise sum over the remaining columns, appended as a "Total" column.
total_column = out5.sum(axis="columns")
out5 = pd.concat(objs=[out5, total_column.rename("Total")], axis="columns")

In [None]:
# Normalization 1 (average gains): for every row, the difference between its
# 'Total' and the 'Total' of the row below it, divided by its own 'Total'.
# NOTE(review): presumably the row below is the previous month (rows look
# newest-first) - confirm; a 'Total' of 0 makes the ratio undefined.
row_total = out5['Total']
next_row_total = row_total.shift(-1)
out5['Value'] = (row_total - next_row_total) / row_total
display(out5)

In [100]:
# further normalizations - Z score normalization
def z_norm(column):
    """Standardise ``column`` to z-scores.

    Subtracts the column's mean and divides the result by the column's
    standard deviation, both taken from the column's own ``mean()`` and
    ``std()`` methods.
    """
    centred = column - column.mean()
    return centred / column.std()

# Z-score of the 'Total' column, stored next to it as column 'Z'.
total_z_scores = z_norm(column=out5['Total'])
out5['Z'] = total_z_scores

out5

In [None]:
## Data visualisation: the 'Z' column over time, annotated with dated events

# Theme and colour palette
sns.set_theme(style="darkgrid")
sns.set_palette("viridis")

# Line plot of the 'Z' column against the frame's index
plt.figure(figsize=(12, 6))
sns.lineplot(data=out5, x=out5.index, y='Z', marker='o', linestyle='--', linewidth=2)

#### Vertical marker lines with rotated labels on specific dates
# Each entry: (x position, label text, colour, horizontal alignment of the label)
# NOTE(review): 'acknoledges' is spelled this way in the rendered label; it is
# kept unchanged here.
covid_events = [
    ("November 2019", 'Nov19: China warns about Covid', 'orange', 'right'),
    ("January 2020", 'Jan20: WHO acknoledges Covid', 'red', 'left'),
    ("June 2020", 'Jun20: end of first big lockdown period', 'green', 'right'),
]
game_events = [
    ("December 2017", "PUBG goes live", 'cyan', 'right'),
    ("December 2020", "Cyberpunk goes live", 'cyan', 'right'),
    ("September 2021", "New World goes live", 'cyan', 'right'),
    ("February 2021", "Valheim goes live", 'cyan', 'right'),
    ("March 2019", "Sekiro goes live", 'cyan', 'right'),
]

# Covid group first, then the game group; inside a group every line is drawn
# before the first label.
for event_group in (covid_events, game_events):
    for when, _text, colour, _align in event_group:
        plt.axvline(x=when, color=colour, linestyle='--', linewidth=1)
    for when, text, colour, align in event_group:
        plt.text(x=when, y=-2, s=text, color=colour, fontsize=10, ha=align, rotation=90)

### Axis formatting
axis = plt.gca()

# One major tick per month.
# NOTE(review): the x values are the frame's index labels; if those are plain
# strings rather than datetimes, a date locator may not place ticks as
# intended - confirm.
axis.xaxis.set_major_locator(mdates.MonthLocator(interval=1))

# Restrict the x-axis to a fixed window, then flip its direction
plt.xlim('August 2022', 'August 2018')  # limits are given as label strings
axis.invert_xaxis()

# Titles and labels
plt.title('Z-values Over Time')
plt.xlabel('Date')
plt.ylabel('Z-Value')
plt.xticks(rotation=45)  # rotated tick labels for readability
