# Data Preparation – Console & Game Sales

This notebook loads the raw console sales and video game sales datasets,
performs basic cleaning and standardization, and outputs cleaned CSV files
for use in later analysis, SQL, machine learning, and dashboarding.

Raw data is read from:
- data/raw/

Cleaned data is saved to:
- data/processed/

In [67]:
import pandas as pd
import numpy as np

# Load Raw CSV Files

In [68]:
# Read the two raw CSV exports (paths are relative to this notebook) into DataFrames
df_console, df_games = (
    pd.read_csv(raw_path)
    for raw_path in (
        '../data/raw/console_data.csv',
        '../data/raw/game_data.csv',
    )
)

# Preview the first rows of each frame
display(df_console.head(), df_games.head())

Unnamed: 0,Console Name,Type,Company,Gen,Gen Years,Released Year,Generation,Discontinuation Year,Units sold (million),Remarks,Link to gif
0,Magnavox Odyssey,Home,Magnavox,1st Gen,1972-1978,1972,1,1975,0.35,The Magnavox Odyssey is the first commercial ...,https://s12.gifyu.com/images/SVLO3.gif
1,Home Pong,Home,Atari,1st Gen,1972-1978,1975,1,1978,0.15,Atari's Home Pong was a dedicated console that...,https://s12.gifyu.com/images/SVz99.gif
2,Atari 2600,Home,Atari,2nd Gen,1978-1982,1977,2,1992,30.0,Atari2600 is often credited with popularizing ...,https://s12.gifyu.com/images/SVz3U.gif
3,Magnavox Odyssey 2,Home,Magnavox,2nd Gen,1978-1982,1978,2,1984,2.0,The Odyssey² featured a built-in keyboard for ...,https://s12.gifyu.com/images/SVz70.gif
4,Intellivision,Home,Mattel,2nd Gen,1978-1982,1979,2,1990,3.0,The Intellivision boasted superior graphics an...,https://s12.gifyu.com/images/SVLBF.gif


Unnamed: 0,Game Name,System Full,Units(m),Publisher,Developer,Image_URL,Release Date
0,Pac-Man,Atari 2600,7.7,Atari,Atari,https://www.vgchartz.com/games/boxart/3878609c...,01/03/1982
1,Pitfall!,Atari 2600,4.0,Activision,Activision,https://www.vgchartz.com/games/boxart/127822cc...,20/04/1982
2,Frogger,Atari 2600,4.0,Parker Bros.,Konami,https://www.vgchartz.com/games/boxart/7351891c...,01/01/1982
3,Missile Command,Atari 2600,2.5,Atari,Atari,https://www.vgchartz.com/games/boxart/8855822c...,01/01/1981
4,Space Invaders,Atari 2600,2.0,Atari,Atari,https://www.vgchartz.com/games/boxart/7131076c...,01/01/1978


# Inspect Structure

In [69]:
# Check data types, nulls, and column info.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would also print the literal text "None".
df_console.info()
print()  # blank line between the two reports
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Console Name          26 non-null     object 
 1   Type                  26 non-null     object 
 2   Company               26 non-null     object 
 3   Gen                   26 non-null     object 
 4   Gen Years             26 non-null     object 
 5   Released Year         26 non-null     int64  
 6   Generation            26 non-null     int64  
 7   Discontinuation Year  26 non-null     object 
 8   Units sold (million)  26 non-null     float64
 9   Remarks               26 non-null     object 
 10  Link to gif           26 non-null     object 
dtypes: float64(1), int64(2), object(8)
memory usage: 2.4+ KB
None 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27308 entries, 0 to 27307
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------   

In [70]:
# Summary statistics for the numeric columns of each dataset
display(df_console.describe(), df_games.describe())

Unnamed: 0,Released Year,Generation,Units sold (million)
count,26.0,26.0,26.0
mean,1996.384615,5.230769,47.326923
std,14.612534,2.550415,45.542229
min,1972.0,1.0,0.15
25%,1983.75,3.0,10.0
50%,1997.0,5.5,31.5
75%,2006.0,7.0,78.5
max,2020.0,9.0,155.0


Unnamed: 0,Units(m)
count,27308.0
mean,0.29597
std,1.3636
min,0.0
25%,0.0
50%,0.0
75%,0.16
max,82.9


# Data Cleaning

## Remove Irrelevant Columns

In [71]:
# Consoles: discard the descriptive fields that are not carried into the cleaned table
df_console = df_console.drop(
    ['Type', 'Gen', 'Gen Years', 'Remarks', 'Link to gif'],
    axis='columns',
)

# Games: discard the box-art image link
df_games = df_games.drop('Image_URL', axis='columns')

## Standardize Column Names

In [72]:
# Normalise the headers of both frames to snake_case: trim surrounding
# whitespace, lower-case, then turn every space into an underscore
df_console.columns = [
    header.strip().lower().replace(' ', '_') for header in df_console.columns
]
df_games.columns = [
    header.strip().lower().replace(' ', '_') for header in df_games.columns
]

# Check
df_console.columns, df_games.columns

(Index(['console_name', 'company', 'released_year', 'generation',
        'discontinuation_year', 'units_sold_(million)'],
       dtype='object'),
 Index(['game_name', 'system_full', 'units(m)', 'publisher', 'developer',
        'release_date'],
       dtype='object'))

In [73]:
# Consoles: give columns clearer names, then select them in presentation order
df_console = (
    df_console
    .rename(columns={
        'units_sold_(million)': 'units_sold(m)',
        'company': 'manufacturer',
        'discontinuation_year': 'discontinued_year',
    })
    .loc[:, [
        'console_name', 'generation', 'manufacturer',
        'released_year', 'discontinued_year', 'units_sold(m)',
    ]]
)

# Games: same treatment; 'system_full' becomes 'console_name' so both tables
# use the same column name for the console
df_games = (
    df_games
    .rename(columns={
        'game_name': 'title',
        'system_full': 'console_name',
        'units(m)': 'units_sold(m)',
    })
    .loc[:, [
        'title', 'console_name', 'publisher',
        'developer', 'release_date', 'units_sold(m)',
    ]]
)

# Check
df_console.columns, df_games.columns

(Index(['console_name', 'generation', 'manufacturer', 'released_year',
        'discontinued_year', 'units_sold(m)'],
       dtype='object'),
 Index(['title', 'console_name', 'publisher', 'developer', 'release_date',
        'units_sold(m)'],
       dtype='object'))

## Filter Out Irrelevant Data

In [74]:
# Restrict the console table to the six 7th/8th-generation consoles under
# study (Xbox, PlayStation and Wii families) and renumber the rows from 0
df_console = (
    df_console
    .loc[lambda consoles: consoles['console_name'].isin(
        ['Xbox 360', 'Xbox One', 'PlayStation 3', 'PlayStation 4', 'Wii', 'Wii U']
    )]
    .reset_index(drop=True)
)

# Keep only the titles released for one of the consoles retained above
df_games = df_games.loc[
    lambda games: games['console_name'].isin(df_console['console_name'])
]

# Check
df_games['console_name'].value_counts()

console_name
PlayStation 4    3106
Xbox One         2008
PlayStation 3    1905
Xbox 360         1735
Wii              1676
Wii U             554
Name: count, dtype: int64

There are also many games that have 0.0 units sold. We will remove these, as they may skew our final analysis.

In [75]:
# Report the row count, then drop every title whose 'units_sold(m)' is not above 0
print('# of Games Before:', len(df_games))
df_games = df_games.loc[df_games['units_sold(m)'].gt(0)]

# Report the count again to confirm that not too many games were removed
print('# of Games After:', len(df_games))

# of Games Before: 10984
# of Games After: 5705


## Handle Missing Values

In [76]:
# First, let's see which fields have missing data.
# sep='' stops print() from inserting its default space after the heading's
# trailing newline, which would otherwise indent the first row of each report
# by one character and break the column alignment.
print('Console Sales:\n', df_console.isna().sum(), sep='')
print('\nGame Sales:\n', df_games.isna().sum(), sep='')

Console Sales:
 console_name         0
generation           0
manufacturer         0
released_year        0
discontinued_year    0
units_sold(m)        0
dtype: int64

Game Sales:
 title             0
console_name      0
publisher         0
developer         0
release_date     20
units_sold(m)     0
dtype: int64


In [77]:
# Rows with no release_date are taken to be expansions or DLC rather than
# standalone games, so they are dropped
df_games = df_games.dropna(subset=['release_date'])

# Check
df_games.isna().sum()

title            0
console_name     0
publisher        0
developer        0
release_date     0
units_sold(m)    0
dtype: int64

In [78]:
# Number of game rows remaining after all of the filters above
print('# of Games:', len(df_games))

# of Games: 5685


## Convert Datatypes

In [79]:
# The PlayStation 4 has not been discontinued, so its discontinued_year holds
# the text 'ongoing' instead of a year. For our analysis we treat it as
# discontinued in 2021 (1 year after the PS5 release); once that placeholder is
# replaced, the whole column can be cast to an integer dtype.
df_console['discontinued_year'] = (
    df_console['discontinued_year']
    .replace({'ongoing': 2021})
    .astype('int64')
)

# Check
df_console.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   console_name       6 non-null      object 
 1   generation         6 non-null      int64  
 2   manufacturer       6 non-null      object 
 3   released_year      6 non-null      int64  
 4   discontinued_year  6 non-null      int64  
 5   units_sold(m)      6 non-null      float64
dtypes: float64(1), int64(3), object(2)
memory usage: 420.0+ bytes


In [80]:
# For game sales, convert release_date from text to datetime.
# The raw values are day-first strings (e.g. 20/04/1982), so the format is
# given explicitly. Without it pandas has to guess the layout from the data,
# which is ambiguous for dates such as 01/03/1982 and produces a parsing warning.
# errors='coerce' turns any value that does not match the format into NaT,
# so a drop in the non-null count below would reveal malformed dates.
df_games['release_date'] = pd.to_datetime(
    df_games['release_date'],
    format='%d/%m/%Y',
    errors='coerce'
)

# Check
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5685 entries, 11595 to 27303
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   title          5685 non-null   object        
 1   console_name   5685 non-null   object        
 2   publisher      5685 non-null   object        
 3   developer      5685 non-null   object        
 4   release_date   5685 non-null   datetime64[ns]
 5   units_sold(m)  5685 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 310.9+ KB


  df_games['release_date'] = pd.to_datetime(


# Feature Engineering

This next step is crucial for our analysis. We need to add a column to indicate whether a game is 'exclusive'. 

Exclusivity will be determined based on the 'platform', not the console name. So a game that was released on only the Wii and Wii U is still considered exclusive, since both consoles are from the same 'platform'.

In [81]:
# First, we will need to add a column to indicate what 'platform' the game belongs to
def map_platform(x):
    """Return the platform holder for a console name.

    The name is converted to a lower-case string and searched for a family
    keyword, checked in this order:

    - 'playstation' (PlayStation 3 / 4) --> 'Sony'
    - 'wii'         (Wii / Wii U)       --> 'Nintendo'
    - 'xbox'        (Xbox 360 / One)    --> 'Microsoft'

    Parameters
    ----------
    x : object
        Console name; it is passed through ``str()`` before matching.

    Returns
    -------
    str
        'Sony', 'Nintendo' or 'Microsoft'.

    Raises
    ------
    ValueError
        If the name contains none of the keywords. Failing here keeps an
        unexpected console from being silently counted as a Microsoft title.
    """
    name = str(x).lower()
    families = (
        ('playstation', 'Sony'),
        ('wii', 'Nintendo'),
        ('xbox', 'Microsoft'),
    )
    for keyword, platform in families:
        if keyword in name:
            return platform
    raise ValueError(f'Cannot map console name to a platform: {x!r}')

# Derive each row's platform from its console name
df_games['platform'] = df_games['console_name'].map(map_platform)

# Check
df_games['platform'].value_counts()

platform
Sony         2326
Microsoft    1845
Nintendo     1514
Name: count, dtype: int64

Now we can determine platform exclusivity

In [82]:
# Number of distinct platforms on which each title appears
platform_counts = df_games.groupby('title')['platform'].nunique()

# A title is exclusive when it is found on exactly one platform; look that
# flag up by title for every row.
# NOTE(review): the counts are taken from df_games as it stands at this point,
# so only the rows still present in the frame contribute to a title's count.
df_games['is_exclusive'] = df_games['title'].map(platform_counts.eq(1))

# Check
df_games.head()

Unnamed: 0,title,console_name,publisher,developer,release_date,units_sold(m),platform,is_exclusive
11595,Gran Turismo 5,PlayStation 3,Sony Computer Entertainment,Polyphony Digital,2010-11-24,11.95,Sony,True
11596,Uncharted 3: Drake's Deception,PlayStation 3,Sony Computer Entertainment,Naughty Dog,2011-11-01,9.3,Sony,True
11597,The Last of Us,PlayStation 3,Sony Computer Entertainment America,Naughty Dog,2013-06-14,8.15,Sony,True
11598,God of War III,PlayStation 3,Sony Computer Entertainment,SCEA Santa Monica Studio,2010-03-16,7.6,Sony,True
11599,Metal Gear Solid 4: Guns of the Patriots,PlayStation 3,Konami,Kojima Productions,2008-06-12,6.0,Sony,True


Now that our game sales table is clean, we can compute, for each console, the number of exclusive titles and the total units sold by those exclusives, and then merge these figures back into the console sales data.

In [83]:
# Per-console summary of exclusive titles: how many distinct titles there are
# and how many units they sold in total
exclusive_summary = (
    df_games
    .loc[df_games['is_exclusive']]               # exclusive rows only
    .groupby('console_name', as_index=False)     # one output row per console
    .agg(
        exclusive_games_released=('title', 'nunique'),
        exclusive_games_sold=('units_sold(m)', 'sum'),
    )
)

# Check
display(exclusive_summary.head(), df_console.head())

Unnamed: 0,console_name,exclusive_games_released,exclusive_games_sold
0,PlayStation 3,479,222.97
1,PlayStation 4,436,297.91
2,Wii,1003,678.81
3,Wii U,71,74.36
4,Xbox 360,374,222.45


Unnamed: 0,console_name,generation,manufacturer,released_year,discontinued_year,units_sold(m)
0,Xbox 360,7,Microsoft,2005,2016,84.0
1,PlayStation 3,7,Sony,2006,2017,87.0
2,Wii,7,Nintendo,2006,2013,101.0
3,Wii U,8,Nintendo,2012,2017,14.0
4,PlayStation 4,8,Sony,2013,2021,117.0


In [None]:
# Merge the exclusive summary into the console sales data.
# - on='console_name': both frames share the key column, so a single `on`
#   replaces the equivalent left_on/right_on pair.
# - how='left': every console is kept, even one with no exclusive titles.
# - validate='one_to_one': raise a MergeError if either side lists a console
#   more than once, instead of silently duplicating rows.
df_console = df_console.merge(
    exclusive_summary,
    on='console_name',
    how='left',
    validate='one_to_one',
)

# A console without exclusive titles comes out of the left join as NaN:
# record it as zero and keep the title count an integer. The sales column is
# then renamed so its unit (millions) matches 'units_sold(m)'.
df_console = (
    df_console
    .fillna({'exclusive_games_released': 0, 'exclusive_games_sold': 0.0})
    .astype({'exclusive_games_released': 'int64'})
    .rename(columns={'exclusive_games_sold': 'exclusive_games_sold(m)'})
)

# Check
df_console

Unnamed: 0,console_name,generation,manufacturer,released_year,discontinued_year,units_sold(m),exclusive_games_released,exclusive_games_sold(m)
0,Xbox 360,7,Microsoft,2005,2016,84.0,374,222.45
1,PlayStation 3,7,Sony,2006,2017,87.0,479,222.97
2,Wii,7,Nintendo,2006,2013,101.0,1003,678.81
3,Wii U,8,Nintendo,2012,2017,14.0,71,74.36
4,PlayStation 4,8,Sony,2013,2021,117.0,436,297.91
5,Xbox One,8,Microsoft,2013,2020,58.0,40,40.16


# Save Cleaned Data

In [87]:
# Final check to ensure both DataFrames are analysis-ready.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after each report.
df_console.info()
print()  # blank line between the two reports
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   console_name              6 non-null      object 
 1   generation                6 non-null      int64  
 2   manufacturer              6 non-null      object 
 3   released_year             6 non-null      int64  
 4   discontinued_year         6 non-null      int64  
 5   units_sold(m)             6 non-null      float64
 6   exclusive_games_released  6 non-null      int64  
 7   exclusive_games_sold(m)   6 non-null      float64
dtypes: float64(2), int64(4), object(2)
memory usage: 516.0+ bytes


None

<class 'pandas.core.frame.DataFrame'>
Index: 5685 entries, 11595 to 27303
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   title          5685 non-null   object        
 1   console_name   5685 non-null   object        
 2   publisher      5685 non-null   object        
 3   developer      5685 non-null   object        
 4   release_date   5685 non-null   datetime64[ns]
 5   units_sold(m)  5685 non-null   float64       
 6   platform       5685 non-null   object        
 7   is_exclusive   5685 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(1), object(5)
memory usage: 360.9+ KB


None

In [85]:
# Write both cleaned tables to data/processed/ as CSV, leaving out the row index
df_console.to_csv("../data/processed/console_data_clean.csv", index=False)
df_games.to_csv("../data/processed/game_data_clean.csv", index=False)