In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import altair as alt
import streamlit as st
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the Steam games table from disk and preview its first rows
steam_csv_path = '../data/steam.csv'
df = pd.read_csv(steam_csv_path)
df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [3]:
# Keep only single-genre games: multi-genre entries are ';'-separated lists.
# - regex=False matches ';' as a literal character instead of compiling a regex.
# - na=False makes a missing genre count as "does not contain ';'", so the mask
#   is strictly boolean (a NaN in the mask cannot be inverted with `~`).
mask = ~df['genres'].str.contains(';', regex=False, na=False)

# Take an explicit copy so that later cells can add or overwrite columns on `df`
# without pandas raising SettingWithCopyWarning on a filtered slice.
df = df[mask].copy()

In [4]:
# Number of games per genre (non-null names), largest first, top 50 rows
(
    df.groupby(['genres'])['name']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(50)
)

Unnamed: 0,genres,name
0,Action,843
1,Indie,759
2,Casual,560
3,Adventure,535
4,Strategy,485
5,Simulation,328
6,RPG,270
7,Racing,86
8,Sports,63
9,Utilities,44


In [5]:
# Genres with fewer games than this are pooled into a single 'Others' bucket
MIN_GENRE_COUNT = 10

# Count games per genre and collect the names of the rare ones
grouped_genres = df.groupby('genres').size().reset_index(name='counts')
genres_to_replace = grouped_genres[grouped_genres['counts'] < MIN_GENRE_COUNT]['genres'].tolist()

# Relabel the rare genres in one vectorized step. `assign` returns a new frame,
# so no temporary column has to be created and dropped, and re-running the cell
# gives the same result.
df = df.assign(
    genres=df['genres'].mask(df['genres'].isin(genres_to_replace), 'Others')
)

# group by count genres after binning
df.groupby(['genres'])['name'].count().sort_values(ascending=False).reset_index()

Unnamed: 0,genres,name
0,Action,843
1,Indie,759
2,Casual,560
3,Adventure,535
4,Strategy,485
5,Simulation,328
6,RPG,270
7,Racing,86
8,Sports,63
9,Utilities,44


In [6]:
# Z-score the raw rating counts (subtract the column mean, divide by the column
# standard deviation) so both columns live on a comparable scale
pos_counts = df['positive_ratings']
neg_counts = df['negative_ratings']

df['standardized_pos'] = (pos_counts - pos_counts.mean()) / pos_counts.std()
df['standardized_neg'] = (neg_counts - neg_counts.mean()) / neg_counts.std()

In [7]:
# Share of each game's ratings that are positive, stored as a new column
total_ratings = df['negative_ratings'] + df['positive_ratings']
df['positive_rate'] = df['positive_ratings'].div(total_ratings)

In [8]:
# Share of each game's ratings that are negative, stored as a new column
total_ratings = df['negative_ratings'] + df['positive_ratings']
df['negative_rate'] = df['negative_ratings'].div(total_ratings)

In [9]:
# Summary statistics of the positive-rating share, restricted to Action games
is_action = df['genres'] == 'Action'
df.loc[is_action, 'positive_rate'].describe()

count    843.000000
mean       0.742005
std        0.211803
min        0.000000
25%        0.636364
50%        0.800000
75%        0.900990
max        1.000000
Name: positive_rate, dtype: float64

In [10]:
# Build the plotting subset: leave out the genres listed below (too few games)
excluded_genres = ['Utilities', 'Others', 'Free to Play']
is_excluded = df['genres'].isin(excluded_genres)
df1 = df[~is_excluded]

In [11]:
# Reshape the per-game rates into long format for the rating histogram:
# one row per (game, rating type) with columns genres / rating / rating type.

def _rates_long(frame, rate_column, label):
    """Return genres plus one rate column, renamed to 'rating' and tagged with `label`.

    Built with rename/assign so a new frame is returned instead of assigning a
    column onto a slice of `frame` (which triggers SettingWithCopyWarning).
    The original row index of `frame` is preserved.
    """
    return (
        frame[['genres', rate_column]]
        .rename(columns={rate_column: 'rating'})
        .assign(**{'rating type': label})
    )


# df2: positive rating rate per game, labelled 'positive'
df2 = _rates_long(df1, 'positive_rate', 'positive')

# df3: negative rating rate per game, labelled 'negative'
df3 = _rates_long(df1, 'negative_rate', 'negative')

# Stack both halves; each game therefore appears twice (index values repeat)
df4 = pd.concat([df2, df3], axis=0)
df4

Unnamed: 0,genres,rating,rating type
0,Action,0.973888,positive
1,Action,0.839787,positive
2,Action,0.895648,positive
3,Action,0.826623,positive
4,Action,0.947996,positive
...,...,...,...
27037,Casual,0.625000,negative
27040,Strategy,0.000000,negative
27047,Indie,0.218750,negative
27061,Indie,0.000000,negative


In [12]:
# Inspect the stacked rating values (positive and negative) for Action games
action_rows = df4['genres'] == 'Action'
df4.loc[action_rows, 'rating']

0        0.973888
1        0.839787
2        0.895648
3        0.826623
4        0.947996
           ...   
26713    0.000000
26769    0.000000
26904    0.000000
26921    0.000000
26992    0.166667
Name: rating, Length: 1686, dtype: float64

In [13]:
# Treat the rating label as a categorical variable for plotting
df4['rating type'] = df4['rating type'].astype('category')

# Faceted histograms of the rating share, one panel per genre, with the
# positive and negative distributions overlaid in each panel.
fig = px.histogram(
    df4,
    x="rating",
    color="rating type",
    nbins=33,
    color_discrete_map={'positive': '#00C691', 'negative': '#8F52D2'},
    barmode="overlay",  # draw both series in the same bins even when they overlap
    opacity=0.7,
    facet_col='genres',
    facet_col_wrap=3,
    facet_row_spacing=0.1,
    facet_col_spacing=0.08,
    height=800,
    width=1000,
)

# Unlink the axes so every panel gets its own range, and show tick labels on each
fig.update_yaxes(matches=None)
fig.update_xaxes(matches=None)
fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
# Facet titles arrive as "genres=<name>"; keep only the genre name
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(legend_title_text='Rating Type')
fig.update_traces(marker_line_width=0.5, marker_line_color="white")
fig.show()

# Export a standalone HTML copy of the figure. Uses the `plotly.io` module that
# is already imported at the top of the notebook (no mid-cell import), and does
# not open a browser tab as a side effect of running the cell.
pio.write_html(fig, file='dis_plot.html', auto_open=False)


'dis_plot.html'

In [14]:
# Altair radial plot: one arc per genre, with both the angle and the radius
# driven by the number of games in that genre.
genre_count = "count(genres):Q"

# To sort by an aggregate, the field and the op are given separately.
# EncodingSortField does not parse shorthand, so 'count(genres):Q' would be
# looked up as a literal column name and the sort would have no effect.
by_count_desc = alt.EncodingSortField(field='genres', op='count', order='descending')

base = alt.Chart(df1).encode(
    theta=alt.Theta(genre_count, stack=True),
    radius=alt.Radius(genre_count, scale=alt.Scale(type='sqrt', zero=True, rangeMin=20)),
    color=alt.Color(
        'genres',
        sort=by_count_desc,
        scale=alt.Scale(range=['#8d6ea3', '#d79abd', '#ebd38b', '#e5a185', '#8ec1b1',
                               '#aa5555', '#6498ba', '#B46060', '#A9907E']),
    ),
    # The order channel controls the sequence in which the stacked arcs are laid out
    order=alt.Order(genre_count, sort='descending'),
    tooltip=["genres:N", genre_count]
).properties(
    height=500,
    width=500,
    title='Radial plot of video games counts by genres'
)

radial = base.mark_arc() #make arc

# Point distribution plot: median playtime of every game, one row per genre,
# with the longest-played title of each genre labelled.

# Shared x scale so the labels line up with the points; values beyond the
# domain are clamped to the edge instead of being dropped.
playtime_scale = alt.Scale(domain=[0, 10000], clamp=True)

# One distinct colour per genre. The previous palette listed '#FB9E58' twice,
# which drew two genres in the same colour; the first occurrence is replaced
# with '#DA627D'.
genre_palette = ['#EF553B', '#10DDE5', '#636EFB', '#DA627D', '#00C691',
                 '#62B6CB', '#8F52D2', '#CDB4DB', '#FB9E58']

points = alt.Chart(df1).mark_point(filled=True, size=70).encode(
  x=alt.X('median_playtime:Q', scale=playtime_scale, title='Median playtime'),
  y=alt.Y('genres', title='Genres'),
  color=alt.Color('genres', scale=alt.Scale(range=genre_palette)),
  # NOTE(review): this tooltip is the only aggregated channel, so Vega-Lite
  # groups by the remaining fields (x, y, color); the "mean" may therefore just
  # echo each point's own value. Confirm this is the intended tooltip.
  tooltip=[alt.Tooltip('mean(median_playtime)', title='Mean of median playtime')]
).properties(
    height=300,
    width=600,
    title = 'Median playtime of video games by genres')

# One row per genre: the game with the highest median playtime in that genre
max_playtime_df = df1.loc[df1.groupby('genres')['median_playtime'].idxmax()]

# Create the text chart using the max_playtime_df DataFrame
text = alt.Chart(max_playtime_df).mark_text(align='left', dx=3, dy=10, fontSize=8).encode(
    x=alt.X('median_playtime:Q', scale=playtime_scale),
    y=alt.Y('genres'),
    text='name:N'
)

# Layer the text chart on top of the points chart
chart = points + text


In [15]:
# Render the layered chart assigned to `chart` (points + text labels)
chart


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [21]:
step = 50  # height given to each facet row
overlap = .5  # fraction of a row height by which an area may rise into the row above

# Keep games with median playtime > 0: zero playtime carries no information for
# this analysis and cannot be log-transformed.
new_df = df1[df1['median_playtime'] > 0].reset_index(drop=True)
new_df['log_median_playtime'] = np.log(new_df['median_playtime'])  # log transform to shrink the range

# Fixed genre -> colour mapping
color_scale = alt.Scale(domain=['Action', 'Adventure', 'Casual', 'Indie', 'RPG', 'Racing', 'Simulation', 'Sports', 'Strategy'],
                        range=["#EA5355","#0FDDE5", "#636EFB", "#DA627D", "#00C691","#CDB4DB","#8F52D2","#62B6CB","#FB9E58"])

# Pipeline:
#   transform_bin       - bin log_median_playtime with the default binning
#   transform_aggregate - count games per (genre, bin)
#   transform_window    - total number of games per genre
#   transform_calculate - share of the genre's games that fall in each bin
#
# Chart-level config (background, facet spacing, ...) is applied AFTER .facet():
# Altair only accepts `config` on the top-level chart, not on the spec that is
# being faceted.
chart = alt.Chart(new_df, width=700, height=step).transform_bin(
    'binned_playtime', 'log_median_playtime', bin=True
).transform_aggregate(
    value='count()', groupby=['genres', 'binned_playtime']
).transform_window(
    total_count='sum(value)', groupby=['genres']
).transform_calculate(
    percentage='datum.value / datum.total_count'
).mark_area(
    interpolate='monotone',
    fillOpacity=0.8,
    stroke='lightgray',
    strokeWidth=0.5
).encode(
    alt.X('binned_playtime:Q', title='Log Transformed Median Playtime', scale=alt.Scale(domain=[0,8], clamp=True)),
    # The negative upper bound of the range lets each area extend above its own row
    alt.Y('percentage:Q', axis=None, scale=alt.Scale(range=[step, -step * overlap])),
    alt.Color('genres:N', scale=color_scale, legend=alt.Legend(title='Game Genres', labelFontSize=12))
).facet(
    row = alt.Row('genres:N', title=None, header=alt.Header(labelAngle=0, labelAlign='left', labelFontSize=13))
).properties(
    title={
        "text": "",
        "fontSize": 18,
        "anchor": "middle",
        "align": "center"
    },
    bounds='flush'
).configure(
    background='#fafafa'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).configure_title(
    anchor='end'
).configure_axis(
    grid=False
) #create facet row distribution plot 


chart


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [108]:
# Render the radial arc chart assigned to `radial` (base.mark_arc())
radial

In [110]:
# Number of games (non-null appids) per developer, largest first, top 50 rows
(
    df.groupby(['developer'])['appid']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .head(50)
)

Unnamed: 0,developer,appid
0,"KOEI TECMO GAMES CO., LTD.",68
1,HeR Interactive,29
2,HexWar Games,26
3,Valve,22
4,Square Enix,19
5,Arc System Works,18
6,id Software,16
7,Gogii Games,16
8,MumboJumbo,15
9,EnsenaSoft,15
