In [92]:
import pandas as pd
import requests as req
import numpy as np
import time
import matplotlib.pyplot as plt

In [93]:
# Constants: file locations used by the rest of the notebook
STEAM_REVIEW_CSV_PATH = './datasets/steam_reviews.csv'  # Input: path to the Steam reviews dataset (CSV)
CATEGORIES_CSV_PATH = './datasets/steam_categories.csv'       # Path of the app_id -> categories CSV (written by get_categories, read back before the merge)

In [94]:
# Load the Steam reviews CSV into a DataFrame
df = pd.read_csv(STEAM_REVIEW_CSV_PATH)

## Cleaning the data

In [95]:
# Preview the first 5 rows to see the available columns and sample values
df.head()

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1


In [96]:
# Show basic information about the dataset (row count, columns, dtypes, memory).
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- doing so appends a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6417106 entries, 0 to 6417105
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   app_id        int64 
 1   app_name      object
 2   review_text   object
 3   review_score  int64 
 4   review_votes  int64 
dtypes: int64(3), object(2)
memory usage: 244.8+ MB
None


In [97]:
# Drop every row that has a missing value in any column, then report the new size
df = df.dropna(axis=0, how='any')
print(df.shape)

(6226728, 5)


In [98]:
# Keep only the reviews made of more than three whitespace-separated words
word_counts = df['review_text'].str.split().str.len()
df = df[word_counts > 3]

In [99]:
# Remove reviews whose text consists solely of whitespace characters
is_blank = df['review_text'].str.isspace()
df = df[~is_blank]

In [100]:
# Keep only the reviews that contain at least one ASCII letter
has_letters = df['review_text'].str.contains('[a-zA-Z]')
df = df[has_letters]

In [101]:
# Remove every review whose text includes the phrase 'Early Access Review'
mentions_early_access = df['review_text'].str.contains('Early Access Review')
df = df[~mentions_early_access]

In [102]:
# Remove reviews with identical text, keeping the first occurrence of each
df = df.drop_duplicates(subset=['review_text'], keep='first')

In [103]:
# Change ♥ to * in the reviews.
# assign() builds a new frame instead of writing a column into a filtered slice
# (which can raise SettingWithCopyWarning), and regex=False makes the literal
# match explicit regardless of the pandas version's default.
df = df.assign(review_text=df['review_text'].str.replace('♥', '*', regex=False))

In [104]:
# Renumber the rows 0..n-1 (the filters above keep the original row labels, leaving gaps;
# drop=True discards the old index instead of adding it as a column),
# then print the (rows, columns) shape of the cleaned dataset
df = df.reset_index(drop=True)
print(df.shape)

(4246269, 5)


## Enriching the dataset

In [105]:
# Print the number of distinct games (unique app_id values) left in the cleaned dataset
print(df['app_id'].nunique())

8560


In [106]:
# Collect the distinct app_ids of the cleaned dataset (the input expected by get_categories)
app_ids = df['app_id'].unique()

### Adding categories column

In [107]:
# Function to get the categories of a game from the Steam API
# Creates a csv file with the app_id and its categories

def get_categories(app_ids, delay=2, timeout=30):
    """Fetch each game's categories from the Steam store API and save them to CSV.

    For every id in ``app_ids`` the Steam ``appdetails`` endpoint is queried and
    the category descriptions are joined into one comma-separated string. The
    result (columns ``app_id`` and ``categories``) is written to
    ``CATEGORIES_CSV_PATH``; nothing is returned.

    Parameters
    ----------
    app_ids : iterable of int
        Steam application ids to look up.
    delay : float, default 2
        Seconds to sleep after each request to avoid getting blocked by the server.
    timeout : float, default 30
        Seconds to wait for a response before giving up on a single request.

    Notes
    -----
    A lookup that fails (network error, HTTP error status, non-JSON body, or a
    payload without category data) is recorded as NaN, so a single bad request
    does not discard the results gathered so far. Only these expected errors
    are caught; KeyboardInterrupt still propagates so the loop can be stopped.
    """
    app_ids = list(app_ids)
    categories_per_app = []

    for app_id in app_ids:
        url = f'https://store.steampowered.com/api/appdetails?appids={app_id}'
        try:
            response = req.get(url, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
            categories = payload[str(app_id)]['data']['categories']
            joined = ', '.join(category['description'] for category in categories)
        except (req.RequestException, ValueError, KeyError, TypeError):
            # Request failed, body was not JSON, or the app has no category data
            joined = np.nan
        categories_per_app.append(joined)

        # Throttle the requests to avoid getting blocked by the server
        time.sleep(delay)

    # Build the frame once (always with a 'categories' column, even for empty
    # input) and export it to a csv file
    df_app_ids = pd.DataFrame({'app_id': app_ids, 'categories': categories_per_app})
    df_app_ids.to_csv(CATEGORIES_CSV_PATH, index=False)

In [108]:
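# Fetch the categories of the games and write them to CATEGORIES_CSV_PATH.
# Slow: one request per app_id with a 2-second pause after each request.
# Uncomment to (re)generate the file that the next cell reads.
# get_categories(app_ids)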

In [109]:
# Read the per-app categories lookup and attach it to every review.
# A left join keeps reviews whose app_id is missing from the lookup (their
# categories become NaN) instead of silently dropping them, and
# validate='many_to_one' raises if the lookup holds duplicate app_ids, which
# would otherwise multiply review rows.
df_app_ids = pd.read_csv(CATEGORIES_CSV_PATH)
df = df.merge(df_app_ids, on='app_id', how='left', validate='many_to_one')

In [110]:
# Replace missing values in the 'categories' column with an empty string
df = df.fillna(value={'categories': ''})