# Data Cleaning – Console & Game Sales

This notebook loads the raw console sales and video game sales datasets,
performs basic cleaning and standardization, and outputs cleaned CSV files
for use in later analysis, SQL, machine learning, and dashboarding.

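Raw data is read from (path shown relative to the project root; the loading cell below uses the relative path `../data/raw/`):
- data/raw/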

Cleaned data is saved to:
- data/processed/

In [70]:
import pandas as pd
import numpy as np

# Load Raw CSV Files

In [71]:
# Read the raw console and game CSV exports into pandas DataFrames
df_console = pd.read_csv("../data/raw/console_data.csv")
df_games = pd.read_csv("../data/raw/game_data.csv")

# Preview the first five rows of each frame (console first, then games)
display(df_console.head(), df_games.head())

Unnamed: 0,Console Name,Type,Company,Gen,Gen Years,Released Year,Generation,Discontinuation Year,Units sold (million),Remarks,Link to gif
0,Magnavox Odyssey,Home,Magnavox,1st Gen,1972-1978,1972,1,1975,0.35,The Magnavox Odyssey is the first commercial ...,https://s12.gifyu.com/images/SVLO3.gif
1,Home Pong,Home,Atari,1st Gen,1972-1978,1975,1,1978,0.15,Atari's Home Pong was a dedicated console that...,https://s12.gifyu.com/images/SVz99.gif
2,Atari 2600,Home,Atari,2nd Gen,1978-1982,1977,2,1992,30.0,Atari2600 is often credited with popularizing ...,https://s12.gifyu.com/images/SVz3U.gif
3,Magnavox Odyssey 2,Home,Magnavox,2nd Gen,1978-1982,1978,2,1984,2.0,The Odyssey² featured a built-in keyboard for ...,https://s12.gifyu.com/images/SVz70.gif
4,Intellivision,Home,Mattel,2nd Gen,1978-1982,1979,2,1990,3.0,The Intellivision boasted superior graphics an...,https://s12.gifyu.com/images/SVLBF.gif


Unnamed: 0,Game Name,System Full,Units(m),Publisher,Developer,Image_URL,Release Date
0,Pac-Man,Atari 2600,7.7,Atari,Atari,https://www.vgchartz.com/games/boxart/3878609c...,01/03/1982
1,Pitfall!,Atari 2600,4.0,Activision,Activision,https://www.vgchartz.com/games/boxart/127822cc...,20/04/1982
2,Frogger,Atari 2600,4.0,Parker Bros.,Konami,https://www.vgchartz.com/games/boxart/7351891c...,01/01/1982
3,Missile Command,Atari 2600,2.5,Atari,Atari,https://www.vgchartz.com/games/boxart/8855822c...,01/01/1981
4,Space Invaders,Atari 2600,2.0,Atari,Atari,https://www.vgchartz.com/games/boxart/7131076c...,01/01/1978


# Inspect Structure

In [72]:
# Print the column dtypes, non-null counts and memory usage of each frame
for frame in (df_console, df_games):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Console Name          26 non-null     object 
 1   Type                  26 non-null     object 
 2   Company               26 non-null     object 
 3   Gen                   26 non-null     object 
 4   Gen Years             26 non-null     object 
 5   Released Year         26 non-null     int64  
 6   Generation            26 non-null     int64  
 7   Discontinuation Year  26 non-null     object 
 8   Units sold (million)  26 non-null     float64
 9   Remarks               26 non-null     object 
 10  Link to gif           26 non-null     object 
dtypes: float64(1), int64(2), object(8)
memory usage: 2.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27308 entries, 0 to 27307
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --

In [73]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
display(df_console.describe(), df_games.describe())

Unnamed: 0,Released Year,Generation,Units sold (million)
count,26.0,26.0,26.0
mean,1996.384615,5.230769,47.326923
std,14.612534,2.550415,45.542229
min,1972.0,1.0,0.15
25%,1983.75,3.0,10.0
50%,1997.0,5.5,31.5
75%,2006.0,7.0,78.5
max,2020.0,9.0,155.0


Unnamed: 0,Units(m)
count,27308.0
mean,0.29597
std,1.3636
min,0.0
25%,0.0
50%,0.0
75%,0.16
max,82.9


# Data Cleaning

## Remove Irrelevant Columns

In [74]:
# Columns that are dropped from each frame in this cleaning step
console_drop_cols = ['Type', 'Gen', 'Gen Years', 'Remarks', 'Link to gif']
games_drop_cols = ['Image_URL']

# Console DataFrame: remove the listed columns
df_console = df_console.drop(columns=console_drop_cols)

# Games DataFrame: remove the box-art URL column
df_games = df_games.drop(columns=games_drop_cols)

## Standardize Column Names

In [75]:
def _to_snake_case(labels):
    """Normalize column labels: trim outer whitespace, lower-case, spaces -> underscores.

    Only spaces are replaced; other punctuation (e.g. parentheses) is kept as-is.
    """
    return labels.str.strip().str.lower().str.replace(' ', '_')


# Apply the same normalization to the headers of both frames
df_console.columns = _to_snake_case(df_console.columns)
df_games.columns = _to_snake_case(df_games.columns)

# Check
df_console.columns, df_games.columns

(Index(['console_name', 'company', 'released_year', 'generation',
        'discontinuation_year', 'units_sold_(million)'],
       dtype='object'),
 Index(['game_name', 'system_full', 'units(m)', 'publisher', 'developer',
        'release_date'],
       dtype='object'))

In [76]:
# Old -> new header names, chosen for clarity and accuracy
console_renames = {
    'units_sold_(million)': 'units_sold(m)',
    'company': 'manufacturer',
    'discontinuation_year': 'discontinued_year',
}
games_renames = {
    'game_name': 'title',
    'system_full': 'platform',
    'units(m)': 'units_sold(m)',
}

# Final left-to-right column order for each frame
console_order = ['console_name', 'generation', 'manufacturer', 'released_year', 'discontinued_year', 'units_sold(m)']
games_order = ['title', 'platform', 'publisher', 'developer', 'release_date', 'units_sold(m)']

# Rename first, then select the columns in the desired order
df_console = df_console.rename(columns=console_renames)[console_order]
df_games = df_games.rename(columns=games_renames)[games_order]

# Check
df_console.columns, df_games.columns

(Index(['console_name', 'generation', 'manufacturer', 'released_year',
        'discontinued_year', 'units_sold(m)'],
       dtype='object'),
 Index(['title', 'platform', 'publisher', 'developer', 'release_date',
        'units_sold(m)'],
       dtype='object'))

## Filter Out Irrelevant Data

In [None]:
# Consoles: keep only the 7th- and 8th-generation systems of interest
# (Xbox, PlayStation and Wii families), selected by exact console name.
target_consoles = ['Xbox 360', 'Xbox One', 'PlayStation 3', 'PlayStation 4', 'Wii', 'Wii U']
df_console = df_console[
    df_console['console_name'].isin(target_consoles)
].reset_index(drop=True)

# Games: keep only titles released on the consoles retained above.
# The index is reset here as well, so both frames get a fresh 0..n-1 index and
# df_games becomes an independent frame rather than a filtered slice of the raw
# data (assigning columns on such a slice typically raises SettingWithCopyWarning).
df_games = df_games[
    df_games['platform'].isin(df_console['console_name'])
].reset_index(drop=True)

Unnamed: 0,title,platform,publisher,developer,release_date,units_sold(m)
11595,Gran Turismo 5,PlayStation 3,Sony Computer Entertainment,Polyphony Digital,24/11/2010,11.95
11596,Uncharted 3: Drake's Deception,PlayStation 3,Sony Computer Entertainment,Naughty Dog,01/11/2011,9.30
11597,The Last of Us,PlayStation 3,Sony Computer Entertainment America,Naughty Dog,14/06/2013,8.15
11598,God of War III,PlayStation 3,Sony Computer Entertainment,SCEA Santa Monica Studio,16/03/2010,7.60
11599,Metal Gear Solid 4: Guns of the Patriots,PlayStation 3,Konami,Kojima Productions,12/06/2008,6.00
...,...,...,...,...,...,...
27303,Fishing Sim World,Xbox One,Dovetail Games,Dovetail Games,18/09/2018,0.05
27304,The Council,Xbox One,Bigben Interactive,Big Bad Wolf,29/01/2019,0.00
27305,Sable,Xbox One,Raw Fury,Shedworks,01/12/2020,0.00
27306,Doom Eternal: The Ancient Gods - Part Two,Xbox One,Unknown,id Software,,0.00


array(['Atari 2600', 'Atari 5200/7800', 'GameCube', 'Intellivision',
       'Nintendo 64', 'Nintendo Entertainment System', 'Nintendo Switch',
       'PlayStation', 'PlayStation 2', 'PlayStation 3', 'PlayStation 4',
       'Sega Dreamcast', 'Sega Genesis', 'Sega Master System',
       'Sega Saturn', 'Super Nintendo', 'Wii', 'Wii U', 'Xbox',
       'Xbox 360', 'Xbox One'], dtype=object)

## Convert Datatypes