Data Analysis with Pandas

In [5]:
import pandas as pd
import numpy as py

In [6]:
df = pd.read_csv("video_game_sales/vgsales.csv")
df.head(10)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [7]:
# Which company is the most common video game publisher?
most_common_publisher = df["Publisher"].value_counts().head(1)
most_common_publisher

Electronic Arts    1351
Name: Publisher, dtype: int64

In [8]:
# What’s the most common platform?
most_common_platform = df["Platform"].value_counts().head(1)
most_common_platform

DS    2163
Name: Platform, dtype: int64

In [9]:
# What about the most common genre?
most_common_genre = df["Genre"].value_counts().head(10)
most_common_genre

Action          3316
Sports          2346
Misc            1739
Role-Playing    1488
Shooter         1310
Adventure       1286
Racing          1249
Platform         886
Simulation       867
Fighting         848
Name: Genre, dtype: int64

In [10]:
# What are the top 20 highest grossing games?
highest_grossing_games = df[["Name","Global_Sales"]].sort_values("Global_Sales", ascending = False).head(20)
highest_grossing_games

Unnamed: 0,Name,Global_Sales
0,Wii Sports,82.74
1,Super Mario Bros.,40.24
2,Mario Kart Wii,35.82
3,Wii Sports Resort,33.0
4,Pokemon Red/Pokemon Blue,31.37
5,Tetris,30.26
6,New Super Mario Bros.,30.01
7,Wii Play,29.02
8,New Super Mario Bros. Wii,28.62
9,Duck Hunt,28.31


In [11]:
#For North American video game sales, what’s the median?
NA_Median = df["NA_Sales"].median()
NA_Median

0.08

In [12]:
# Provide a secondary output showing ten games surrounding the median sales output, assume that games with same median value are sorted in descending order
Na_ten_median_sales = df[df["NA_Sales"] == NA_Median]
Na_ten_median_sales[["Rank","Name", "Platform", "NA_Sales"]].head(10)

Unnamed: 0,Rank,Name,Platform,NA_Sales
446,447,Dragon Warrior IV,NES,0.08
497,498,World Soccer Winning Eleven 7 International,PS2,0.08
1617,1619,Farming Simulator 2015,PC,0.08
1926,1928,Pro Evolution Soccer 2008,X360,0.08
2067,2069,Winning Eleven: Pro Evolution Soccer 2007 (All...,X360,0.08
2373,2375,Phantasy Star Portable 2,PSP,0.08
2579,2581,The Sims 2: Castaway,PSP,0.08
3186,3188,SingStar Queen,PS2,0.08
3503,3505,Top Spin 3,PS3,0.08
3703,3705,Sonic & All-Stars Racing Transformed,PS3,0.08


In [17]:
# For the top-selling game of all time, how many standard deviations above/below the mean are its sales for North America?
top_selling_games = df["NA_Sales"].std()
top_selling_games

0.8166830292988796

In [18]:
# Standard deviations above/below the mean are its sales for North America?
max = df[["NA_Sales"]].max()
mean = df[["NA_Sales"]].mean()
result = (max-mean)/top_selling_games
result

NA_Sales    50.478988
dtype: float64

In [13]:
# Standard deviation five below median
five_below_median = df[df["NA_Sales"] < NA_Median ][["Name", "NA_Sales"]].head(5)
five_below_median

Unnamed: 0,Name,NA_Sales
137,World of Warcraft,0.07
214,Monster Hunter Freedom 3,0.0
338,Friend Collection,0.0
348,Pro Evolution Soccer 2008,0.05
383,Monster Hunter 4,0.0


In [14]:
# Standard deviation five above median
five_above_median = df[df["NA_Sales"] > NA_Median ][["Name", "NA_Sales"]].tail(5)
five_above_median

Unnamed: 0,Name,NA_Sales
10975,Fighting Fantasy: The Warlock of Firetop Mountain,0.09
10990,I Love Puppies,0.09
10999,Cake Mania: Main Street,0.09
11003,DaGeDar,0.09
11012,NERF N-Strike: Double Blast Bundle,0.09


In [None]:
# Nintendo Wii average number of global sales
wii_global_sales = df[df["Platform"] == "Wii"]["Global_Sales"].mean()
print(f"Wii avg Global Sales {wii_global_sales}.")