In [None]:
import pandas as pd
import requests as req
import numpy as np
import time
import json
from tqdm import tqdm

In [None]:
# Dataset locations (paths are relative to the current working directory)

# Input: the Steam reviews dataset file
STEAM_REVIEW_CSV_PATH = './datasets/steam_reviews.csv'
# Categories dataset file (written by get_categories, read back for the merge)
CATEGORIES_CSV_PATH = './datasets/steam_categories.csv'
# Genres dataset file (written by get_genres, read back for the merge)
GENRES_CSV_PATH = './datasets/steam_genres.csv'

In [None]:
# Load the Steam reviews dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer=STEAM_REVIEW_CSV_PATH)

## Cleaning the data

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Show basic information about the dataset.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" to the output).
df.info()

In [None]:
# Discard every row that has at least one missing value, then report the new size
df = df.dropna(axis='index', how='any')
print(df.shape)

In [None]:
# Keep only reviews with more than two runs of whitespace, i.e. more than three words
# (note: leading/trailing whitespace also counts as a run).
# The pattern is a raw string so that `\s` reaches the regex engine unchanged instead of
# being an invalid string escape sequence (a SyntaxWarning on recent Python versions).
df = df[df['review_text'].str.count(r'\s+') > 2]

In [None]:
# Remove reviews that consist exclusively of whitespace characters
df = df.loc[~df['review_text'].str.isspace()]

In [None]:
# Keep only reviews that contain at least one ASCII letter
df = df.loc[df['review_text'].str.contains('[a-zA-Z]', regex=True)]

In [None]:
# Remove reviews whose text contains the phrase 'Early Access Review'
df = df.loc[~df['review_text'].str.contains('Early Access Review', regex=True)]

In [None]:
# Remove reviews with identical text, keeping the first occurrence of each
df = df.drop_duplicates(subset=['review_text'], keep='first')

In [None]:
# Substitute every ♥ character in the review text with *
df['review_text'] = df['review_text'].str.replace(pat='♥', repl='*')

In [None]:
# Renumber the rows from 0 (discarding the old index) and
# report the size of the cleaned dataset
df = df.reset_index(level=None, drop=True)
print(df.shape)

## Enriching the dataset

In [None]:
# Print the number of distinct games (app_id values) in the dataset
print(df.loc[:, 'app_id'].nunique())

In [None]:
# Collect the distinct app_id values present in the dataset
app_ids = df.loc[:, 'app_id'].unique()

### Adding categories column

In [None]:
# Function to get the categories of a game from the Steam API
# Creates a csv file with the app_id and its categories

def get_categories(app_ids, delay=2, timeout=30):
    """Fetch the Steam store categories of every app and save them to a CSV file.

    For each id the Steam `appdetails` endpoint is queried and the category
    descriptions are joined into one ', '-separated string. An app whose
    request fails, whose response is not valid JSON, or whose payload has no
    categories is stored as NaN instead of aborting the whole crawl.

    Parameters
    ----------
    app_ids : sequence of int
        Steam application ids; one output row is written per element.
    delay : float, default 2
        Seconds to wait after each request to avoid getting blocked by the server.
    timeout : float, default 30
        Seconds to wait for the server before giving up on a single request.

    Returns
    -------
    None
        The result is written to CATEGORIES_CSV_PATH with the columns
        `app_id` and `categories`.
    """
    # Make a new dataframe with the given app_ids
    df_app_ids = pd.DataFrame(app_ids, columns=['app_id'])

    # Collect one value per app and assign the column once at the end
    # (avoids a full boolean-mask scan of the frame for every id).
    categories_per_app = []
    for app_id in tqdm(app_ids):
        url = f'https://store.steampowered.com/api/appdetails?appids={app_id}'
        try:
            data = req.get(url, timeout=timeout).json()
        except (req.RequestException, ValueError) as e:
            # Network failure, timeout or a non-JSON body (ValueError covers
            # json.JSONDecodeError): skip this app but keep crawling.
            print(f'Request failed for app_id = {app_id}: {e}')
            data = None

        try:
            categories = data[str(app_id)]['data']['categories']
            categories_per_app.append(
                ', '.join(category['description'] for category in categories)
            )
        except (LookupError, TypeError):
            # Missing keys or an unexpected payload shape (e.g. data is None)
            categories_per_app.append(np.nan)

        # Wait between requests to avoid getting blocked by the server
        time.sleep(delay)

    df_app_ids['categories'] = categories_per_app

    # Export the dataframe to a csv file
    df_app_ids.to_csv(CATEGORIES_CSV_PATH, index=False)

In [None]:
# Get the categories of the games (uncomment the call below to rebuild the categories CSV)
# get_categories(app_ids)

In [None]:
# Read the per-app categories and attach them to the reviews.
# A left join keeps every review even when its app_id is missing from the CSV
# (the default inner join would silently drop those rows); such rows get NaN categories.
df_app_ids = pd.read_csv(CATEGORIES_CSV_PATH)
df = df.merge(df_app_ids, on='app_id', how='left')

In [None]:
# Replace missing categories (NaN) with an empty string
df['categories'] = df['categories'].fillna(value='')

### Adding genres column

In [None]:
# Function to get the genres of a game from the Steam API
# Creates a csv file with the app_id and its genres

def get_genres(app_ids, delay=2, timeout=30):
    """Fetch the Steam store genres of every app and save them to a CSV file.

    For each id the Steam `appdetails` endpoint is queried (with `l=english`)
    and the genre descriptions are joined into one ', '-separated string.
    An app whose request fails, whose response is not valid JSON, or whose
    payload has no genres is stored with an empty string.

    Parameters
    ----------
    app_ids : sequence of int
        Steam application ids; one output row is written per element.
    delay : float, default 2
        Seconds to wait after each request to avoid getting blocked by the server.
    timeout : float, default 30
        Seconds to wait for the server before giving up on a single request.

    Returns
    -------
    None
        The result is written to GENRES_CSV_PATH with the columns
        `app_id` and `genres`.
    """
    # Make a new dataframe with the given app_ids
    df_app_ids = pd.DataFrame(app_ids, columns=['app_id'])

    # Collect one value per app and assign the column once at the end
    # (avoids a full boolean-mask scan of the frame for every id).
    genres_per_app = []
    for app_id in tqdm(app_ids):
        try:
            url = f'https://store.steampowered.com/api/appdetails?appids={app_id}&l=english'
            # Without a timeout a stalled connection would block the crawl forever
            response = req.get(url, timeout=timeout)

            data = response.json()

        except json.JSONDecodeError:
            # Ignore if there is an error in decoding the JSON
            print(f'Json decode error in app_id = {app_id}')
            data = None
        except Exception as e:
            print(f'Unexpected error for app_id = {app_id}: {e}')
            data = None

        try:
            genres = data[str(app_id)]['data']['genres']
            genres_per_app.append(', '.join(genre['description'] for genre in genres))
        except (LookupError, TypeError):
            # Missing keys or an unexpected payload shape (e.g. data is None)
            genres_per_app.append('')

        # Wait between requests to avoid getting blocked by the server
        time.sleep(delay)

    df_app_ids['genres'] = genres_per_app

    # Export the dataframe to a csv file
    df_app_ids.to_csv(GENRES_CSV_PATH, index=False)

In [None]:
# Get the genres of the games (uncomment the call below to rebuild the genres CSV)
# get_genres(app_ids)

In [None]:
# Read the per-app genres and attach them to the reviews.
# A left join keeps every review even when its app_id is missing from the CSV
# (the default inner join would silently drop those rows); such rows get NaN genres.
df_app_ids = pd.read_csv(GENRES_CSV_PATH)
df = df.merge(df_app_ids, on='app_id', how='left')

## Exporting cleaned dataset

In [None]:
# Write the final dataset to a csv file, leaving out the index column
CLEAN_DATA_PATH = 'datasets/cleaned_steam_reviews.csv' # change this to the path of the cleaned dataset
df.to_csv(path_or_buf=CLEAN_DATA_PATH, index=False)