# Dataset Cleaning

### Game Categories

In [None]:
# The source datasets required to rerun these notebooks are not included in the repo due to size constraints.
# Dataset URL: https://amazon-reviews-2023.github.io/ (Video_Games category)

# Note: Data cleaning notebooks must be run sequentially (1, 2, 3).

In [None]:
# Common game categories (genres) were added to enrich the dataset.
# These categories were retrieved using the ChatGPT API.

In [None]:
# Imports

import time
import dotenv
import openai
import pandas as pd

In [None]:
# Set OpenAI API key

# Read the key from the local .env file so it is not hard-coded in the notebook.
openai.api_key = dotenv.get_key('../.env', 'OPENAI_API_KEY')

# Report only whether a key was found, never the key itself.
# bool() yields True/False; `key and True` would print None or '' when the key is missing.
print('Key:', bool(openai.api_key))

In [None]:
# Read the cleaned game metadata from disk
meta_data = pd.read_csv(filepath_or_buffer = '../data/meta_cleaned.csv')

# Show one random row as a quick sanity check
meta_data.sample(n = 1)

In [None]:
# Retrieve game categories via the OpenAI API

def get_game_genre(game_name, model = 'gpt-4'):
    """Ask an OpenAI chat model to pick a single genre for a game title.

    Parameters
    ----------
    game_name : str
        Title of the game to classify; it is interpolated into the prompt.
    model : str, optional
        Name of the chat model passed to the API. Defaults to 'gpt-4'.

    Returns
    -------
    tuple
        ``(game_name, genre)`` where ``genre`` is the model's reply with
        surrounding whitespace removed, or an empty string when the reply
        carries no text. The reply is NOT validated against the genre list,
        so callers should check for values outside it.
    """
    game_genres = ['Action', 'Adventure', 'Role-Playing', 'Simulation',
                   'Strategy', 'Sports', 'Puzzle', 'Fighting', 'Shooter',
                   'Horror', 'Platformer', 'Racing', 'MMORPG', 'Idle',
                   'Sandbox', 'Survival', 'Battle Royale', 'Rhythm',
                   'Party', 'Card Game', 'MOBA', 'Stealth', 'Visual Novel',
                   'Text-Based', 'Tycoon', 'Compilation']

    # Same prompt text as before, written with explicit escapes so the
    # mixed tab/space whitespace is visible and survives editor reformatting.
    prompt = (
        "\n"
        f"\tClassify the game '{game_name}' into the \n"
        f"    most suitable genre from this list: {game_genres}.\n"
        "\tProvide only the name of the chosen genre as your response.\n"
        "\t"
    )

    # Make the API call to OpenAI (temperature 0 and a small token budget,
    # since only a genre name is requested)
    response = openai.chat.completions.create(
        model = model,
        messages = [{'role': 'user', 'content': prompt}],
        max_tokens = 10,
        temperature = 0
    )

    # Extract the response; the message content can be None,
    # in which case an empty genre is returned instead of raising.
    content = response.choices[0].message.content
    genre = (content or '').strip()

    return (game_name, genre)

# Titles to classify and the (title, genre) tuples collected for them.
# With an empty title list the loop below makes no API calls.
game_titles = []
game_categories = []

for game_title in game_titles:
    print('Processing:', game_title)
    game_categories.append(get_game_genre(game_title))
    # Pause between consecutive requests
    time.sleep(3)

In [None]:
# Read the previously retrieved game categories from disk
categories = pd.read_csv(filepath_or_buffer = '../data/cats_enriched.csv')

# Preview the first rows
categories.head(5)

In [None]:
# (rows, columns) of both frames, for comparison with the shapes shown after de-duplication
meta_data.shape, categories.shape

In [None]:
# Count duplicated titles in each dataframe (meta_data first, then categories);
# this cell only reports the counts, it does not remove anything.
for frame in (meta_data, categories):
    print(frame['title'].duplicated().sum())

In [None]:
# Keep only the first row for every title in both frames
meta_data, categories = (
    frame.drop_duplicates(subset = 'title', keep = 'first')
    for frame in (meta_data, categories)
)

# Shapes after de-duplication
meta_data.shape, categories.shape

In [None]:
# Join retrieved game categories

# This notebook writes its result back to the file it loads meta_data from, so on a
# re-run the frame may already contain 'category'. Merging again would silently
# produce 'category_x' / 'category_y' and break the cells below, so fail fast here.
if 'category' in meta_data.columns:
    raise ValueError(
        "meta_data already has a 'category' column; regenerate "
        "'../data/meta_cleaned.csv' with the previous notebook before re-running."
    )

# Left join keeps every game; titles without a retrieved category get NaN.
# validate guards against duplicate titles in `categories` multiplying rows.
meta_data = meta_data.merge(
    categories[['title', 'category']],
    on = 'title',
    how = 'left',
    validate = 'many_to_one'
)

meta_data.sample()

In [None]:
# The genres a game is allowed to have after cleaning
main_categories = [
    'Action', 'Adventure', 'Role-Playing', 'Simulation', 'Strategy', 'Sports',
    'Puzzle', 'Fighting', 'Shooter', 'Horror', 'Platformer', 'Racing', 'MMORPG',
    'Idle', 'Sandbox', 'Survival', 'Battle Royale', 'Rhythm', 'Party', 'Card Game',
    'MOBA', 'Stealth', 'Visual Novel', 'Text-Based', 'Tycoon', 'Compilation',
]

# Distinct values currently present in the 'category' column
cat_check = list(meta_data['category'].unique())

# Print every value that is not an allowed genre (missing values, if any, are printed too)
for cat in cat_check:
    if cat in main_categories:
        continue
    print(cat)


In [None]:
# Inspect the row(s) whose category is a full sentence instead of a genre name
meta_data.loc[meta_data['category'].eq("Open Country - Xbox One would be classified as a 'Survival' game.")]

In [None]:
# Select by the offending value rather than a hard-coded row label (originally 231),
# so the fix still hits the right row(s) if the upstream row order changes.
meta_data.loc[
    meta_data['category'] == "Open Country - Xbox One would be classified as a 'Survival' game.",
    'category'
] = 'Survival'

In [None]:
# Inspect the row(s) labelled 'Adult Visual Novel' before reassigning them
meta_data.loc[meta_data['category'].eq('Adult Visual Novel')]

In [None]:
# Select by category value rather than a hard-coded row label (originally 838),
# so the fix still hits the right row(s) if the upstream row order changes.
meta_data.loc[meta_data['category'] == 'Adult Visual Novel', 'category'] = 'Visual Novel'

In [None]:
# Inspect the row(s) labelled 'Educational' before reassigning them
meta_data.loc[meta_data['category'].eq('Educational')]

In [None]:
# Select by category value rather than a hard-coded row label (originally 859),
# so the fix still hits the right row(s) if the upstream row order changes.
meta_data.loc[meta_data['category'] == 'Educational', 'category'] = 'Party'

In [None]:
# Inspect the row(s) labelled 'Exploration' before reassigning them
meta_data.loc[meta_data['category'].eq('Exploration')]

In [None]:
# Select by category value rather than a hard-coded row label (originally 1158),
# so the fix still hits the right row(s) if the upstream row order changes.
meta_data.loc[meta_data['category'] == 'Exploration', 'category'] = 'Adventure'

In [None]:
# Inspect the row(s) labelled 'Metroidvania' before reassigning them
meta_data.loc[meta_data['category'].eq('Metroidvania')]

In [None]:
# Select by category value rather than hard-coded row labels (originally 1411 and 2462),
# so the fix still hits the right rows if the upstream row order changes.
meta_data.loc[meta_data['category'] == 'Metroidvania', 'category'] = 'Platformer'

In [None]:
# Inspect the row(s) labelled 'Electronic Games'
meta_data.loc[meta_data['category'].eq('Electronic Games')]

In [None]:
# Hardware items: drop them by category value rather than by hard-coded row labels
# (originally 1862 and 2584), so the right rows go even if the upstream row order
# changes. Rows with a missing category are kept (NaN != 'Electronic Games').
meta_data = meta_data[meta_data['category'] != 'Electronic Games']

In [None]:
# Inspect the row(s) labelled 'Exploration / Sandbox' before reassigning them
meta_data.loc[meta_data['category'].eq('Exploration / Sandbox')]

In [None]:
# Select by category value rather than a hard-coded row label (originally 2340),
# so the fix still hits the right row(s) if the upstream row order changes.
meta_data.loc[meta_data['category'] == 'Exploration / Sandbox', 'category'] = 'Adventure'

In [None]:
# Inspect the row(s) labelled 'Co-op Action Adventure' before reassigning them
meta_data.loc[meta_data['category'].eq('Co-op Action Adventure')]

In [None]:
# Select by category value rather than a hard-coded row label (originally 2626),
# so the fix still hits the right row(s) if the upstream row order changes.
meta_data.loc[meta_data['category'] == 'Co-op Action Adventure', 'category'] = 'Adventure'

# Shape after all category fixes
meta_data.shape

In [None]:
# Reset index

# reset_index returns a new frame; without assigning it back the index
# would keep the gaps left by the dropped rows.
meta_data = meta_data.reset_index(drop = True)

# Saving the enriched game meta data

# NOTE: this overwrites the same file the notebook loaded meta_data from.
meta_data.to_csv('../data/meta_cleaned.csv', index = False)