# Visualizing the dataset

After downloading and cleaning the dataset, we move on to exploring it. Through a series of visualizations, we check whether our hypotheses are supported by the data.

In [2]:
# Imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
# Load the cleaned games table and index it by 'BGGId'.
# The 'Unnamed: 0' column is discarded; presumably it is a row index written
# out by an earlier to_csv call -- confirm against the cleaning notebook.
games_df = (
    pd.read_csv("../data/games_cleaned.csv")
    .drop(columns=['Unnamed: 0'])
    .set_index('BGGId')
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of games_df
games_df.describe()

Unnamed: 0,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,NumOwned,NumWant,...,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
count,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,...,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0
mean,1985.494914,1.982131,6.424922,5.685673,1.516374,2.007343,5.707868,12.021072,1467.848164,41.690946,...,21295.352201,21062.680274,0.055827,0.10577,0.161003,0.105633,0.01382,0.050855,0.02919,0.040182
std,212.486214,0.848983,0.932477,0.365311,0.285578,0.693093,15.014643,4.477699,5294.120574,117.255229,...,3637.139987,4219.776597,0.229592,0.30755,0.367542,0.307374,0.116745,0.219707,0.168344,0.196391
min,-3500.0,0.0,1.04133,3.57481,0.196023,0.0,0.0,2.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2001.0,1.3333,5.83696,5.5103,1.32072,2.0,4.0,8.0,150.0,3.0,...,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2011.0,1.9688,6.45395,5.54654,1.47688,2.0,4.0,11.666667,320.0,9.0,...,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2017.0,2.5252,7.05245,5.67989,1.66547,2.0,6.0,18.0,899.0,28.0,...,21926.0,21926.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2021.0,5.0,9.91429,8.51488,4.27728,10.0,999.0,21.0,166497.0,2031.0,...,21926.0,21926.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Set 5: Understanding the recommended ages

We are mainly interested in understanding the relationship between recommended ages and popularity. The columns 'ComAgeRec' and 'MfgAgeRec' hold the community and manufacturer age recommendations, respectively.

In [5]:
# List the available columns to locate the age-recommendation fields ('ComAgeRec', 'MfgAgeRec')
games_df.columns

Index(['Name', 'Description', 'YearPublished', 'GameWeight', 'AvgRating',
       'BayesAvgRating', 'StdDev', 'MinPlayers', 'MaxPlayers', 'ComAgeRec',
       'NumOwned', 'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime',
       'ComMinPlaytime', 'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings',
       'NumComments', 'NumAlternates', 'NumExpansions', 'NumImplementations',
       'IsReimplementation', 'Kickstarted', 'Rank:boardgame',
       'Rank:strategygames', 'Rank:abstracts', 'Rank:familygames',
       'Rank:thematic', 'Rank:cgs', 'Rank:wargames', 'Rank:partygames',
       'Rank:childrensgames', 'Cat:Thematic', 'Cat:Strategy', 'Cat:War',
       'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens'],
      dtype='object')

By taking the difference between the two recommended ages (community minus manufacturer), we can see how far the audience that actually plays a game has drifted from the audience the manufacturer intended.

In [7]:
# Age-recommendation gap: community value minus manufacturer value, so a
# positive gap means the community recommends an older age than the manufacturer.
# NOTE(review): the maximum gap (21) equals the maximum 'ComAgeRec', which
# suggests some 'MfgAgeRec' values are 0 -- confirm whether 0 means "not specified".
games_df['ComMfgChange'] = games_df['ComAgeRec'].sub(games_df['MfgAgeRec'])
games_df['ComMfgChange'].describe()

count    21925.000000
mean         2.407662
std          5.103704
min        -12.000000
25%         -1.000000
50%          0.400000
75%          6.000000
max         21.000000
Name: ComMfgChange, dtype: float64

In [12]:
# Columns compared against the age-recommendation gap.
# NOTE(review): the two playtime columns do not look like popularity measures --
# confirm they are meant to be in this list.
popularity_metrics = [
    'NumWant',
    'NumWish',
    'ComMaxPlaytime',
    'ComMinPlaytime',
    'NumWeightVotes',
]

In [None]:
# Distribution of each popularity metric, split by the community-vs-manufacturer
# age gap.
#
# 'ComMfgChange' is a float column with many distinct values, so passing it
# straight to `by=` asks pandas for one subplot per distinct value. The gap is
# therefore bucketed into a few ranges first. Each metric also gets its own row
# of axes, because the metrics are on different scales and become unreadable
# when drawn together on one shared x-axis.
gap_edges = [-np.inf, -1, 1, 6, np.inf]  # cut points near the quartiles reported by ComMfgChange.describe()
gap_labels = ['gap <= -1', '-1 < gap <= 1', '1 < gap <= 6', 'gap > 6']
gap_bucket = pd.cut(games_df['ComMfgChange'], bins=gap_edges, labels=gap_labels)

fig, axes = plt.subplots(
    nrows=len(popularity_metrics),
    ncols=len(gap_labels),
    figsize=(4 * len(gap_labels), 3 * len(popularity_metrics)),
    sharex='row',   # same x-range across buckets, so one metric is comparable left to right
    squeeze=False,  # always a 2-D array of axes, even with a single metric
)
for metric, axes_row in zip(popularity_metrics, axes):
    for label, ax in zip(gap_labels, axes_row):
        # Rows whose gap is NaN fall in no bucket and are left out.
        values = games_df.loc[gap_bucket == label, metric].dropna()
        ax.hist(values, bins=100)
        ax.set(title=f'{metric} | {label}', xlabel=metric, ylabel='Count')
fig.tight_layout();