# Game Data Exploration and Analysis

### Index
* [Trends in games](#gametrends)
    * [Number of games by year](#gameyear)
    * [Number of games by month](#gamemonth)
    * [Geographical distribution of game developers](#gamedeveloper)
    
* [Factors affecting game ratings](#gamerating)
    * [Platforms](#gameplatform)
    * [Genres](#gamegenre)

In [2]:
# dependencies and setup
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import gmaps
import os
import datetime as dt

from api_keys import g_key

# Read the cleaned game dataset; "release date" is parsed into datetimes
game_to_load = "csvfiles/game_final.csv"
game_df = pd.read_csv(
    filepath_or_buffer=game_to_load,
    parse_dates=["release date"],
)

# Preview the first five rows
game_df.head(5)

Unnamed: 0,name,id,number of platforms,platforms,number of stores,stores,number of genres,genres,tags,release date,...,metacritic score,yet,owned,beaten,toplay,dropped,playing,users,month,year
0,The Witcher 3: Wild Hunt,3328,4,"['PC', 'PlayStation', 'Xbox', 'Nintendo']",4,"['Steam', 'PlayStation Store', 'Xbox Store', '...",3,"['Adventure', 'Action', 'RPG']","['Full controller support', 'Action RPG', 'Atm...",2015-05-18,...,93,562,6149,2405,455,424,593,9147,5,2015
1,Life is Strange,3439,9,"['PC', 'PlayStation', 'Xbox', 'iOS', 'Android'...",7,"['Steam', 'PlayStation Store', 'Xbox Store', '...",1,['Adventure'],"['Full controller support', 'Atmospheric', 'Ch...",2015-01-29,...,83,466,6156,1916,184,343,124,8196,1,2015
2,Red Dead Redemption 2,28,3,"['PC', 'PlayStation', 'Xbox']",4,"['Steam', 'PlayStation Store', 'Xbox Store', '...",2,"['Adventure', 'Action']","['In-App Purchases', 'America', 'Partial Contr...",2018-10-26,...,96,404,4230,1261,1143,219,588,6079,10,2018
3,DOOM (2016),2454,4,"['PC', 'PlayStation', 'Xbox', 'Nintendo']",4,"['Steam', 'PlayStation Store', 'Xbox Store', '...",2,"['Shooter', 'Action']","['Partial Controller Support', 'Steam Achievem...",2016-05-13,...,85,318,4868,1526,319,370,222,6616,5,2016
4,Fallout 4,3070,3,"['PC', 'PlayStation', 'Xbox']",3,"['Steam', 'PlayStation Store', 'Xbox Store']",2,"['Action', 'RPG']","['Full controller support', 'Action RPG', 'Ste...",2015-11-09,...,84,295,4844,1235,225,732,222,6301,11,2015


## Trends in games <a id='gametrends'></a>
The following section explores trends in games released over the past 5 years.


In [None]:
# Descriptive statistics for the game dataset (basis for the observations below)
game_df.describe()

Observations include:
* The range and standard deviation of the Metacritic score data indicate that the dataset has high variability. However, the mean and median are close together, which indicates a relatively symmetrical distribution.
* The user rating shows measures of variability of a similar relative magnitude, although proportionally slightly higher, which would indicate a slightly more variable dataset. As with the Metacritic score, the mean and median are close.


In [None]:
# Pairwise correlation between the numeric columns only. The text columns
# (name, platforms, genres, ...) cannot be correlated, and leaving them in
# makes DataFrame.corr() raise on pandas >= 2.0.
corr = game_df.select_dtypes(include="number").corr()

# Colour the table so strong positive/negative correlations stand out
corr.style.background_gradient(cmap='coolwarm')

Running the correlation function to identify relationships between the variables shows a high correlation between the number of stores and the number of platforms (to be expected). Also to be expected is the strong correlation between Metacritic score and user rating.

### Number of games by year <a id='gameyear'></a>

In [None]:
# Four annual summaries drawn as bar charts on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 15))

yearly = game_df.groupby(game_df["year"])

# One entry per panel:
# (axis, yearly series, title, y-axis label, upper y limit or None)
panels = [
    (axes[0, 0], yearly["id"].count(),
     "Number of Games Released Annually", "Number of Games", None),
    (axes[0, 1], yearly["users"].sum(),
     "Number of Users Annually", "Number of Users", None),
    (axes[1, 0], yearly["rating"].mean(),
     "Average Rating", "Rating", 4.5),
    (axes[1, 1], yearly["metacritic score"].mean(),
     "Average Metacritic Score", "Metacritic Score", 90),
]

for panel_ax, yearly_values, panel_title, y_label, y_top in panels:
    panel_ax.set_title(panel_title)
    if y_top is not None:
        panel_ax.set_ylim(top=y_top)
    panel_ax.set_ylabel(y_label)
    panel_ax.bar(yearly_values.index, yearly_values, color='c', alpha=0.5)

plt.savefig("figures/game_annualtrend.png")

In [None]:
# Stacked horizontal bars showing, for the games released in each year, how
# many users own them, have beaten them, or are currently playing them.
by_year = game_df.groupby("year")
owned = by_year["owned"].sum()
beaten = by_year["beaten"].sum()
playing = by_year["playing"].sum()

# Take the y positions from the grouped index so they always line up with the
# bar lengths, however many years the dataset contains.
years = owned.index.to_numpy()

# Left edge of the third segment = end of the first two stacked segments
b = list(owned + beaten)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.barh(years, owned, color="steelblue")
ax.barh(years, beaten, left=owned, color="lightskyblue")
ax.barh(years, playing, left=b, color="slategrey")

ax.set_xlabel("Number of Users")
ax.set_title("Breakdown of Users")

# One tick per year. Ticks and labels are set separately because the second
# positional argument of set_yticks is `minor` on older matplotlib releases.
ax.set_yticks(years)
ax.set_yticklabels(years)

ax.legend(labels=["Owned", "Beaten", "Playing"])

plt.savefig("figures/gameuser_annualtrend.png")

In [None]:
# Side-by-side bar charts of the yearly average user rating and Metacritic score
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
rating_ax, metascore_ax = axes[0], axes[1]

yearly = game_df.groupby(game_df["year"])

# Left panel: average user rating per year
users = yearly["rating"].mean()
rating_ax.set_title("Average Rating")
rating_ax.set_ylim(top=4.5)
rating_ax.set_ylabel("Rating")
rating = rating_ax.bar(users.index, users, color='c', alpha=0.5)

# Right panel: average Metacritic score per year
users = yearly["metacritic score"].mean()
metascore_ax.set_title("Average Metacritic Score")
metascore_ax.set_ylim(top=90)
metascore_ax.set_ylabel("Metacritic Score")
metascore = metascore_ax.bar(users.index, users, color='c', alpha=0.5)

In [None]:
# Distribution of user ratings and Metacritic scores per release year, followed
# by a one-way ANOVA testing whether each measure's mean differs between years.
blue_square = dict(markerfacecolor='b', marker='s')
years = game_df["year"].unique()
years.sort()

# Both box plots share one figure so that a single image file holds them.
fig, (rating_ax, metascore_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Box plot for the range of user ratings each year
# (pandas labels the boxes with the group keys, i.e. the years)
game_df.boxplot(column="rating", by="year", notch=True, grid=False,
                flierprops=blue_square, ax=rating_ax)
rating_ax.set_title("Annual Rating")
rating_ax.set_xlabel("Year")
rating_ax.set_ylabel("Rating")
rating_ax.set_ylim(top=5)

# Box plot for the range of metacritic scores each year
game_df.boxplot(column="metacritic score", by="year", notch=True, grid=False,
                flierprops=blue_square, ax=metascore_ax)
metascore_ax.set_title("Annual Metacritic Score")
metascore_ax.set_xlabel("Year")
metascore_ax.set_ylabel("Metacritic Score")

# Blank out the "Boxplot grouped by year" heading that pandas adds
fig.suptitle(" ")

# Save BEFORE showing: once the figure has been shown, savefig no longer has
# the drawn figure to write and produces an empty image.
plt.savefig("figures/gamerating_annualtrend.png")
plt.show()

# Perform ANOVA for user rating (one group of ratings per year)
rating_groups = [game_df.loc[game_df["year"] == year, "rating"] for year in years]
anova_r = st.f_oneway(*rating_groups)

# Perform ANOVA for metacritic score (one group of scores per year)
metascore_groups = [game_df.loc[game_df["year"] == year, "metacritic score"]
                    for year in years]
anova_m = st.f_oneway(*metascore_groups)

# Create and show dataframe; f_oneway returns (F statistic, p-value)
fstats = (anova_m[0], anova_r[0])
pstats = (anova_m[1], anova_r[1])
counts = ("Metacritic Score", "User Rating")

frame = {"Type": counts, "F Statistic": fstats, "P Value": pstats}
summary_df = pd.DataFrame(frame)

# Last expression of the cell so that the notebook renders the table
summary_df

In [None]:
# Histograms of metacritic score (left column) and user rating (right column):
# one row for the whole dataset, then one row per release year.
fig, axes = plt.subplots(nrows=6, ncols=2, figsize=(10, 20))

year_colors = {2015: 'g', 2016: 'b', 2017: 'r', 2018: 'y', 2019: 'm'}

# (title, legend label, rows to plot, colour) for each grid row
rows = [("Total 2015-2019", "Total", game_df, 'c')]
for year, color in year_colors.items():
    # Title and label are both derived from the year so they cannot disagree
    rows.append((str(year), str(year), game_df.loc[game_df["year"] == year], color))

for (title, label, subset, color), (metascore_ax, rating_ax) in zip(rows, axes):
    metascore_ax.hist(subset['metacritic score'], alpha=0.5, bins=10,
                      color=color, label=label)
    metascore_ax.set_title(title)

    rating_ax.hist(subset['rating'], alpha=0.5, bins=10,
                   color=color, label=label)
    rating_ax.set_title(title)

plt.show()


### Number of games by month <a id='gamemonth'></a>

In [None]:
# Monthly spread of games by year

years = game_df["year"].unique()
years.sort()

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul","Aug", "Sep","Oct","Nov","Dec"]

# Count games per (month, year) and pivot the years into columns.
# Rows are the calendar months 1-12 (exactly one per entry in `months`);
# a month with no releases in a given year is stored as 0.
months_df = (
    game_df.groupby(["month", "year"])["id"]
    .count()
    .unstack("year")
    .reindex(index=range(1, len(months) + 1), columns=years)
    .fillna(0)
    .astype(int)
)

# Readable copy of the same table with the month name as the first column
month_games_df = months_df.copy()
month_games_df.insert(0, "month", months)


In [None]:
# Bar charts of the number of games released in each calendar month:
# first panel for all years combined, then one panel per release year.
fig, axes = plt.subplots(nrows = 2, ncols = 3, figsize = (20,10))
fig.suptitle("Number of Games Released by Month")

months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul","Aug", "Sep","Oct","Nov","Dec"]
month_numbers = range(1, len(months) + 1)

# Reindexing to months 1-12 guarantees exactly one bar height per month label,
# with 0 for any month that has no releases.
all_years = (
    game_df.groupby(["month"])["id"].count().reindex(month_numbers, fill_value=0)
)

# (title, monthly counts, colour) for each panel, in drawing order
panels = [('Total \n2015-2019', all_years, 'c')]
for year, color in zip((2015, 2016, 2017, 2018, 2019), ('g', 'b', 'r', 'y', 'm')):
    year_counts = (
        game_df.loc[game_df["year"] == year]
        .groupby("month")["id"]
        .count()
        .reindex(month_numbers, fill_value=0)
    )
    panels.append((str(year), year_counts, color))

# axes.flat walks the grid row by row: top row first, then bottom row
for panel_ax, (title, counts, color) in zip(axes.flat, panels):
    panel_ax.bar(months, counts, color=color, alpha=0.5)
    panel_ax.set_title(title)

plt.savefig("figures/game_monthlytrend.png")

### Geographical distribution of game developers <a id='gamedeveloper'></a>

In [None]:
# Read the developer dataset used for the geographical analysis
developer_df = pd.read_csv(filepath_or_buffer="csvfiles/developers.csv")

In [None]:
# Rating statistics per developer, keeping developers with more than one game
rating_stats = ["count", "mean", "median", "var", "std", "sem"]
dev_gamecount = pd.DataFrame(
    developer_df.groupby("first dev")["rating"].agg(rating_stats)
)
dev_gamecount = dev_gamecount.sort_values(by="count", ascending=False)

has_multiple_games = dev_gamecount["count"] > 1
dev_gamecount = dev_gamecount[has_multiple_games]

# Mean "rating count" per developer, aligned on the developer index
avg_rating_count = developer_df.groupby("first dev")["rating count"].mean()
dev_gamecount["average rating count"] = avg_rating_count

# Histogram of each developer's mean game rating
plt.hist(dev_gamecount["mean"], align="left", alpha=0.5)
plt.title("Distribution of average rating of games per developer")
plt.xlabel("Average game rating per developer")
plt.ylabel("Frequency")

In [None]:
# Mean rating against number of games per developer; each marker's area is
# the developer's average rating count divided by 10
games_per_dev = dev_gamecount["count"]
mean_rating_per_dev = dev_gamecount["mean"]
marker_sizes = dev_gamecount["average rating count"] / 10

plt.scatter(games_per_dev, mean_rating_per_dev, s=marker_sizes, alpha=0.4)
plt.title("Average Game Rating by Number of Games per Developer")
plt.xlabel("Number of Games per Developer")
plt.ylabel("Average Game Rating")
plt.annotate("marker size denotes\naverage number of ratings", xy=(12, 2.75))
plt.savefig("figures/game_developer.png")

In [None]:
# Histogram of how many games each developer has in the dataset
games_per_dev = dev_gamecount["count"]
plt.hist(games_per_dev, bins=20, align="left")
plt.title("Distribution of number of games per developer")
plt.xlabel("Number of games per developer")
plt.ylabel("Frequency")

In [None]:
# obtaining the location of each developer's office using google maps:
# the developer name is resolved to a place_id with Places Autocomplete, and
# that place_id is then geocoded into latitude / longitude.
autocomplete_url = "https://maps.googleapis.com/maps/api/place/autocomplete/json"
geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"

lat = []
lng = []

params = {
    "inputtype": "textquery",
    "key": g_key,
}

for developer in dev_gamecount.index:
    params["input"] = developer
    try:
        response = requests.get(autocomplete_url, params=params, timeout=10).json()
        place_id = response["predictions"][0]["place_id"]

        # Let requests build and URL-encode the query string
        response2 = requests.get(
            geocode_url,
            params={"place_id": place_id, "key": g_key},
            timeout=10,
        ).json()
        location = response2["results"][0]["geometry"]["location"]
        # Read both coordinates before appending either one, so a failure can
        # never leave the two lists with different lengths.
        latitude, longitude = location["lat"], location["lng"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: a network error, an unparsable response or a developer
        # with no match is recorded as a missing location.
        latitude, longitude = np.nan, np.nan
    lat.append(latitude)
    lng.append(longitude)

# NaN (rather than a text placeholder) keeps both columns numeric
dev_gamecount["lat"] = lat
dev_gamecount["lng"] = lng
dev_gamecount = dev_gamecount.dropna(subset=["lat", "lng"])

# Copy each developer's coordinates onto every one of its games; developers
# without a known location map to NaN.
lat_dict = dev_gamecount["lat"].to_dict()
lng_dict = dev_gamecount["lng"].to_dict()
developer_df["lat"] = developer_df["first dev"].map(lat_dict)
developer_df["lng"] = developer_df["first dev"].map(lng_dict)
developer_df.head()

In [None]:
# Keep only fully populated rows; this removes the rows whose developer has no
# mapped location (and any row with a missing value in another column)
complete_rows = developer_df.dropna()
developer_df_2 = complete_rows.copy()

# Non-null count per column of the cleaned table
developer_df_2.count()

In [None]:
# Rank developers by mean rating and keep the ten highest
dev_gamecount = dev_gamecount.sort_values(by="mean", ascending=False)
top_devs_df = dev_gamecount.iloc[:10].reset_index()

# Round the mean rating to two decimals for display
top_devs_df["mean"] = top_devs_df["mean"].round(2)
top_devs_df

In [None]:
# Authenticate gmaps with the Google API key
gmaps.configure(api_key=g_key)

# Heat map points are the developer coordinates, weighted by game rating
locations = developer_df_2[["lat", "lng"]]
heatmap_options = dict(
    weights=developer_df_2["rating"],
    dissipating=False,
    max_intensity=50,
    point_radius=3,
)
heatmap_layer = gmaps.heatmap_layer(locations, **heatmap_options)

# World-level map with the heat map layer attached
fig = gmaps.figure(center=(20, 0), zoom_level=2)
fig.add_layer(heatmap_layer)

In [None]:
# Overlay a marker for each top developer on the heat map
info_box_template = """
<dl>
<dt>Developer</dt><dd>{first dev}</dd>
<dt>Number of Games</dt><dd>{count}</dd>
<dt>Average Rating</dt><dd>{mean}</dd>
</dl>
"""

# Fill the template once per developer row; the row's columns supply the fields
dev_info = []
for _, dev_row in top_devs_df.iterrows():
    dev_info.append(info_box_template.format(**dev_row))

locations = top_devs_df[["lat", "lng"]]
markers = gmaps.marker_layer(locations, info_box_content=dev_info)
fig.add_layer(markers)

# Display figure
fig

## Factors affecting game ratings <a id='gamerating'></a>
The next section looks into the factors that may influence game ratings.
Two candidate factors that are considered in this notebook are game platforms and game genre. 
Before diving into the factors, the first step is to examine the two game rating measures - 
1. user rating
2. metacritic score


In [None]:
# Quartiles, interquartile range, 1.5*IQR bounds and outlier count for user rating
rating = game_df["rating"]
rating_quartiles = rating.quantile([0.25, 0.5, 0.75])

rating_lowerq, rating_upperq = rating_quartiles[0.25], rating_quartiles[0.75]
rating_iqr = rating_upperq - rating_lowerq
rating_lowerbound = rating_lowerq - (1.5 * rating_iqr)
rating_upperbound = rating_upperq + (1.5 * rating_iqr)

# Ratings lying outside the two bounds
rating_outliers = rating[(rating > rating_upperbound) | (rating < rating_lowerbound)]
rating_outlier_count = rating_outliers.count()
rating_outlier_vol = rating_outliers.sum()

rating_output = [{
    "Lower Quartile": rating_lowerq,
    "Median": rating_quartiles[0.5],
    "Upper Quartile": rating_upperq,
    "InterQuartile": rating_iqr,
    "Lower Bound": rating_lowerbound,
    "Upper Bound": rating_upperbound,
    "No. of Outliers": rating_outlier_count,
}]
rating_output_df = pd.DataFrame(rating_output)

# Show every statistic except the outlier count with two decimals
for stat_column in ['Lower Quartile', 'Median', 'Upper Quartile',
                    'InterQuartile', 'Lower Bound', 'Upper Bound']:
    rating_output_df[stat_column] = rating_output_df[stat_column].map("{:,.2f}".format)

rating_output_df

In [None]:
# Quartiles, interquartile range, 1.5*IQR bounds and outlier count for metacritic score
metascore = game_df["metacritic score"]
metascore_quartiles = metascore.quantile([0.25, 0.5, 0.75])

metascore_lowerq, metascore_upperq = metascore_quartiles[0.25], metascore_quartiles[0.75]
metascore_iqr = metascore_upperq - metascore_lowerq
metascore_lowerbound = metascore_lowerq - (1.5 * metascore_iqr)
metascore_upperbound = metascore_upperq + (1.5 * metascore_iqr)

# Scores lying outside the two bounds
metascore_outliers = metascore[(metascore > metascore_upperbound) |
                               (metascore < metascore_lowerbound)]
metascore_outlier_count = metascore_outliers.count()
metascore_outlier_vol = metascore_outliers.sum()

metascore_output = [{
    "Lower Quartile": metascore_lowerq,
    "Median": metascore_quartiles[0.5],
    "Upper Quartile": metascore_upperq,
    "InterQuartile": metascore_iqr,
    "Lower Bound": metascore_lowerbound,
    "Upper Bound": metascore_upperbound,
    "No. of Outliers": metascore_outlier_count,
}]
metascore_output_df = pd.DataFrame(metascore_output)

# Show every statistic except the outlier count with two decimals
for stat_column in ['Lower Quartile', 'Median', 'Upper Quartile',
                    'InterQuartile', 'Lower Bound', 'Upper Bound']:
    metascore_output_df[stat_column] = metascore_output_df[stat_column].map("{:,.2f}".format)

metascore_output_df

In [None]:
# Linear regression of metacritic score on user rating, one point per game name
per_name = game_df.groupby(["name"])
mean_rating = per_name["rating"].mean()
mean_metascore = per_name["metacritic score"].mean()

slope, intercept, rvalue, pvalue, stderror = st.linregress(mean_rating, mean_metascore)
linear = f"y = {round(slope,2)}x + {round(intercept,2)}"

print(f"Correlation coefficient is {round(rvalue,2)}")
print(f"Linear regression model is {linear}")

# Fitted metacritic score for each observed mean rating
regression = intercept + slope * mean_rating

plt.figure(figsize=(20, 10))
plt.scatter(mean_rating, mean_metascore, color='blue')
plt.plot(mean_rating, regression)
plt.annotate(linear, (4, 38.6), fontsize=15, color="black")
plt.title("linear regression for metacritic score vs rating")
plt.xlabel("User rating")
plt.ylabel("Average metacritic score")
plt.tight_layout()

plt.savefig("figures/gamerating.png")

## Game platforms <a id='gameplatform'></a>

## Game genres <a id='gamegenre'></a>

In [None]:
# get the first genre of each game (treated as the main genre for further analysis)
# The genres column stores the text form of a list, e.g. "['Adventure', 'Action']",
# so the piece between the first pair of single quotes is the first genre.
# A row with no quoted genre (e.g. "[]") gets NaN instead of raising IndexError.
game_df["first genre"] = game_df["genres"].str.split("'").str[1]
game_df.head()

In [None]:
# Rating statistics for each main genre
genre_stats = ["count", "mean", "median", "var", "std", "sem"]
ratings_by_genre = game_df.groupby("first genre")["rating"]
genre_summary = pd.DataFrame(ratings_by_genre.agg(genre_stats))
genre_summary

In [None]:
# only examining genres that have more than 100 rated games
top_genre = genre_summary.index[genre_summary["count"] > 100]

# Flag every game whose main genre is one of the top genres. A single
# vectorised membership test yields a proper boolean column.
game_df["top genre"] = game_df["first genre"].isin(top_genre)

# get a shortened game_df with only the top genres
game_df_shortgenre = game_df[game_df["top genre"]].copy()
game_df_shortgenre

In [None]:
# Box plot of game ratings for each of the top genres
outlier_marker = dict(marker='x', markersize=7, markeredgecolor="red")
game_df_shortgenre.boxplot(
    "rating",
    by="first genre",
    figsize=(10, 3),
    grid=False,
    notch=True,
    color="black",
    flierprops=outlier_marker,
)
plt.title("Box Plot of Average Game Rating by Genre", fontweight="bold")
plt.xlabel("Game Genre", fontweight="bold")
plt.ylabel("Average Game Rating", fontweight="bold")

# Blank out the automatic "grouped by" heading
plt.suptitle(" ")

plt.savefig("figures/gamerating_genre.png")

In [None]:
# one way anova across genre
# `genre` holds the top genre names and `rating` the matching rating samples,
# in the same order; both lists are reused by the pairwise t-tests.
genre = list(game_df_shortgenre["first genre"].unique())
rating = [
    game_df_shortgenre["rating"][game_df_shortgenre["first genre"] == genre_name]
    for genre_name in genre
]

# Unpack every group so the test covers however many top genres there are
st.f_oneway(*rating)

The ANOVA was significant with a p-value of <0.001, suggesting that average game ratings do differ by game genre. As such, individual t-tests will be run across each pair of genres to find out which pairs are driving the result. The alpha level is set at 0.05/(number of tests) to control for multiple comparisons.

In [None]:
# Welch t-test (unequal variances) for every pair of top genres, reporting
# only the pairs that stay significant after a Bonferroni correction.
from itertools import combinations

# Number of distinct genre pairs, i.e. the number of tests actually run
numtests = len(genre) * (len(genre) - 1) // 2

# combinations() yields each unordered pair once, so no comparison is run
# (or printed) twice in mirrored order.
for (genre_a, rating_a), (genre_b, rating_b) in combinations(zip(genre, rating), 2):
    result = st.ttest_ind(rating_a, rating_b, equal_var=False)
    if result.pvalue < (0.05 / numtests):
        print(genre_a + " vs " + genre_b + " - " + str(result))

In [None]:
# plotting finding: individual ratings (swarm) drawn with a box plot per genre
import seaborn as sns

sns.set(style="whitegrid")

# x and y are passed by keyword; recent seaborn releases no longer accept
# them as positional arguments.
fig1 = sns.swarmplot(x="first genre", y="rating", data=game_df_shortgenre, size=3)
fig1 = sns.boxplot(x="first genre", y="rating", data=game_df_shortgenre, color="white")

# Remove the top/right spines once the axes exist (despine acts on the
# current figure's axes, so it must come after the plots are drawn)
sns.despine()

fig1.set_title("Average game rating by genre",fontweight="bold")
fig1.text(3.6,4,"Adventure > Indie\nAdventure > Strategy\np-value=.00898", horizontalalignment="left", size="medium", color='black')

plt.savefig("figures/gamerating_genre2.png")

The t-tests show that adventure games tend to be rated higher than indie or strategy games.
The above analyses were rerun without outliers (below) and demonstrate similar results, suggesting that the significant difference in rating between genres is likely sound.

In [None]:
# testing whether results will be similar without outliers
# A rating counts as an outlier when it lies below Q1 - 1.5*IQR of its own
# genre; only this lower bound is applied.
genre_ratings = game_df_shortgenre.groupby("first genre")["rating"]
Q1 = genre_ratings.transform(lambda ratings: ratings.quantile(.25))
Q3 = genre_ratings.transform(lambda ratings: ratings.quantile(.75))
IQR = Q3 - Q1
lower_bound = Q1 - (1.5 * IQR)

# Row-wise comparison against the bound of the row's own genre. Filtering on
# the mask also works when no row is an outlier.
is_outlier = game_df_shortgenre["rating"] < lower_bound
game_df_shortgenre_nooutlier = game_df_shortgenre[~is_outlier].copy()

game_df_shortgenre_nooutlier.boxplot("rating", by="first genre", figsize=(10, 3),
                                     grid=False, notch=True, color="black",
                                     flierprops=dict(marker='x', markersize=7,
                                                     markeredgecolor="red"))
plt.title("Box Plot of Average Game Rating by Genre (without outliers)",fontweight="bold")
plt.xlabel("Game Genre", fontweight="bold")
plt.ylabel("Average Game Rating", fontweight="bold")
plt.suptitle(" ")

# One rating sample per genre, in the same order as genre_nooutlier
genre_nooutlier = list(game_df_shortgenre_nooutlier["first genre"].unique())
rating_nooutlier = [
    game_df_shortgenre_nooutlier["rating"][
        game_df_shortgenre_nooutlier["first genre"] == genre_name
    ]
    for genre_name in genre_nooutlier
]

# Unpack every group so the test covers however many genres remain
st.f_oneway(*rating_nooutlier)

In [None]:
# to further explore how rating across genre differ over time, the following
# visualisations and linear regressions were run in the top genres

fig, ax = plt.subplots()
# The loop variable is deliberately not called `genre`, which is the list of
# genre names used by the t-test cell.
for genre_name, group in game_df_shortgenre.groupby("first genre"):
    ax.scatter(group["release date"], group["rating"], alpha=0.4, label=genre_name)
ax.legend()
ax.set_title("Trend in game rating by genre between 2015-2020")

# Regress on the release date expressed as a proleptic Gregorian ordinal
game_df["ordtime"] = game_df["release date"].map(dt.datetime.toordinal)

top_genres = game_df_shortgenre["first genre"].unique()
# Bonferroni-corrected significance level: one regression per top genre
alpha_level = 0.05 / len(top_genres)
# Fixed colours for these genres; any other genre uses matplotlib's default
line_colors = {"Indie": "green", "Strategy": "darkred"}

for i in top_genres:
    in_genre = game_df["first genre"] == i
    (slope, intercept, rval, pval, stderror) = st.linregress(
        game_df.loc[in_genre, "ordtime"], game_df.loc[in_genre, "rating"]
    )
    if pval < alpha_level:
        # ":+" prints the intercept with an explicit sign
        print(f'{i} - y={round(slope,4)}x{round(intercept,4):+}')
        game_df[i] = slope * game_df.loc[in_genre, "ordtime"] + intercept

        # Draw the fitted line from this genre's rows only, in date order.
        # Only genres with a significant trend get a line.
        fitted = game_df.loc[in_genre].sort_values("release date")
        ax.plot(fitted["release date"], fitted[i], marker="",
                color=line_colors.get(i), alpha=0.5)

plt.savefig("figures/gamerating_genre_trend.png")