# Dataset Cleaning

### Game Reviews

In [None]:
# Source datasets required to rerun notebooks are not included in the repo due to size constraints.
# Dataset URL: https://amazon-reviews-2023.github.io/ << Video_Games

# Note: Data cleaning notebooks must be run sequentially (1, 2, 3).

In [None]:
# Cleaning 4,624,615 game reviews aggressively to reduce size
# Strict handling of missing data to retain high-quality entries

In [None]:
# Imports

import re
import pandas as pd

In [None]:
# Reset the column/row display limits to the pandas defaults
# (reset_option restores the default value; it does not expand the display to show everything)

pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

### Data Exploration

In [None]:
# Load the raw reviews; lines = True parses the file as JSON Lines (one record per line)
reviews = pd.read_json('../data/video_games_reviews.jsonl', lines = True)

# (rows, columns) of the raw data
reviews.shape

In [None]:
# List the columns available in the raw reviews
reviews.columns

In [None]:
# Spot-check a random row (sample() returns a single row by default)
reviews.sample()

### Data Cleaning

In [None]:
# Retain only the rows whose 'asin' is identical to their 'parent_asin' (main products)

is_main_product = reviews['parent_asin'].eq(reviews['asin'])
reviews = reviews[is_main_product]

reviews.shape

In [None]:
# Keep reviews only for games with available metadata

# Read just the 'parent_asin' column, and read it as text:
#  - usecols avoids loading the rest of the metadata file into memory
#  - dtype = str guards against all-digit ASINs being parsed as numbers,
#    which would make them fail the string comparison in isin() below
asins_meta = pd.read_csv(
    '../data/meta_cleaned.csv',
    usecols = ['parent_asin'],
    dtype = {'parent_asin': str},
)['parent_asin']

reviews = reviews[reviews['asin'].isin(asins_meta)]

reviews.shape

In [None]:
# 'asin' is redundant now that it equals 'parent_asin', so remove the column

reviews = reviews.drop(columns = ['asin'])

reviews.columns

In [None]:
# Spot-check a random row (sample() returns a single row by default)
reviews.sample()

In [None]:
# 'images' appears to hold lists (this notebook compares its entries against []),
# and lists are unhashable, so count the string form of each entry instead
reviews['images'].astype(str).value_counts()

In [None]:
# Number of reviews whose 'images' entry is an empty list

has_no_images = reviews['images'].map(lambda imgs: imgs == [])
empty_images_count = has_no_images.sum()

print('No images provided:', empty_images_count)

In [None]:
# Drop 'images' due to insufficient data
# (reassign instead of inplace = True, matching how 'asin' is dropped in this notebook)

reviews = reviews.drop(columns = ['images'])

reviews.shape

In [None]:
# Spot-check a random row (sample() returns a single row by default)
reviews.sample()

In [None]:
# Remove duplicate game reviews (rows that are identical in every column)
# (reassign instead of inplace = True so the cell produces a new frame like the other cleaning steps)

reviews = reviews.drop_duplicates()

reviews.shape

In [None]:
# Distribution of the 'verified_purchase' flag
reviews['verified_purchase'].value_counts()

In [None]:
# Keep verified purchases only

is_verified = reviews['verified_purchase']
reviews = reviews[is_verified]

reviews.shape

In [None]:
# Filter reviews
#
# Keyword lists used to drop reviews that talk about hardware/accessories
# or that are written in Spanish. Matching is a case-insensitive substring search.
# NOTE(review): substring matching also hits unrelated words, e.g. 'usb' is
# contained in 'husband' and 'cable' in 'applicable' — confirm this is acceptable.

filter_hardware = ['controler', 'controller', 'blu-ray', 'card game', 'stick drift', 'battery', 'batteries', 
                   'slight wear', 'cable', 'usb', 'adapter', 'plastic piece', ' mic ', 'router', 
                   'device', 'great hardware', 'your xbox', 'charger', 'cable tv', 'remote', 
                   'console', 'soldering', 'clearly used', 'stick broke', 'leaks after']

filter_spanish = ['excelente', 'completamente', 'nuevo', 'descripción', 'juego', 'divertido', 
                  'llego', 'tiempo', 'completo', 'totalmente', 'recomendado', 'problema']

# Lower-cased once so the comparison below is case-insensitive.
# Named 'filter_terms' rather than 'filter' so the builtin filter() is not shadowed.
filter_terms = tuple(term.lower() for term in filter_hardware + filter_spanish)

def filter_hw_lang(row):
    """Decide whether a review text should be kept.

    Parameters
    ----------
    row : str
        The review text. Any non-string value (e.g. None or NaN for a
        missing review) is treated as unusable.

    Returns
    -------
    bool
        False when the text is not a string or contains any of the
        hardware / Spanish keywords (ignoring case); True otherwise.
    """
    # Missing text would otherwise raise TypeError on the 'in' test; drop it instead
    if not isinstance(row, str):
        return False
    lowered = row.lower()
    return not any(term in lowered for term in filter_terms)

# Evaluate the keyword filter on every review text, then keep the rows it accepts
keep_review = reviews['text'].apply(filter_hw_lang)
reviews = reviews[keep_review]

reviews.shape

In [None]:
# Markup fragments to blank out: '<br />' (and its variants), '&#34;', '[[videoid', '[[ASIN:'

patterns = r"(<br\s*/?>|&#34;|\[\[videoid|\[\[ASIN:)"

def remove_patterns(text):
    """Return `text` with every match of `patterns` replaced by a single space."""
    return re.compile(patterns).sub(' ', text)

# Build a new frame with the cleaned text instead of assigning into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning
reviews = reviews.assign(text = reviews['text'].apply(remove_patterns))

### Save Game Reviews

In [None]:
# Reset index
# (reset_index returns a new frame, so the result has to be assigned back to take effect)

reviews = reviews.reset_index(drop = True)

# Save cleaned game reviews

reviews.to_csv('../data/reviews_cleaned.csv', index = False)