In [12]:
import pandas as pd

## Load data

In [13]:
df = pd.read_csv("../../data/steam_reviews.csv")
df.head()

Unnamed: 0,compensation,date,early_access,found_funny,hours,page,page_order,product_id,products,text,user_id,username
0,,2017-12-17,False,,0.1,1,0,725280,41.0,This would not be acceptable as an entertainme...,,Chaos Syren
1,,2017-12-27,False,,51.1,1,0,328100,769.0,looks like a facebook game,,₮ʜᴇ Wᴀʀᴛᴏɴ
2,Product received for free,2017-10-16,False,2.0,14.6,1,1,328100,2.0,Better than Minecraft,,hello?<
3,,2018-01-04,False,,5.0,1,0,35140,64.0,I love and idolized Batman and this game is Ma...,,Cyderine916
4,,2018-01-04,False,,16.6,1,1,35140,577.0,Still worth playing in 2018.Probably my favori...,7.65612e+16,DarklyThinking


In [14]:
print(df.isnull().sum())

compensation    7598810
date                  1
early_access          0
found_funny     6552914
hours             26349
page                  0
page_order            0
product_id            0
products          14906
text                279
user_id         4582797
username            302
dtype: int64


In [15]:
# Fix missing values

# Remove reviews with missing text
df = df.dropna(subset=['text'])

# Remove the compensation column - too many missing values
df = df.drop(columns=['compensation'])

# Remove the 1 review with a missing date
df = df.dropna(subset=['date'])

# Remove the found_funny column
df = df.drop(columns=['found_funny'])

# If the hours column is missing, assume 0 hours played
df['hours'] = df['hours'].fillna(0)

# The user_id is widely missing, so we will rather use the usernames
# We will remove the user_id column and remove rows with missing usernames
df = df.drop(columns=['user_id'])

# Remove rows with missing usernames
df = df.dropna(subset=['username'])

# The products column is not needed to just remove it
df = df.drop(columns=['products'])

print(df.isnull().sum())

df.head()

date            0
early_access    0
hours           0
page            0
page_order      0
product_id      0
text            0
username        0
dtype: int64


Unnamed: 0,date,early_access,hours,page,page_order,product_id,text,username
0,2017-12-17,False,0.1,1,0,725280,This would not be acceptable as an entertainme...,Chaos Syren
1,2017-12-27,False,51.1,1,0,328100,looks like a facebook game,₮ʜᴇ Wᴀʀᴛᴏɴ
2,2017-10-16,False,14.6,1,1,328100,Better than Minecraft,hello?<
3,2018-01-04,False,5.0,1,0,35140,I love and idolized Batman and this game is Ma...,Cyderine916
4,2018-01-04,False,16.6,1,1,35140,Still worth playing in 2018.Probably my favori...,DarklyThinking


In [16]:
# Filter data

# Users may have reviewed the same game multiple times so we will use the latest review:
df = df.sort_values(by=['username', 'product_id', 'date'], ascending=[True, True, False])
df = df.drop_duplicates(subset=['username', 'product_id'])

duplicates = df.duplicated(subset=['username', 'product_id']).sum()
print(f"Number of duplicate (username, product_id) pairs: {duplicates}")

df_games = pd.read_csv("../../data/final/steam_games_final.csv")

# Total reviews
print(f"Total reviews: {len(df)}")
# Total users
active_users = df['username'].value_counts()
print(f"Total users from reviews: {len(active_users)}")

# Total games
active_games = df['product_id'].value_counts()
print(f"Total games from reviews: {len(active_games)}")

# Total games in the games dataset
print(f"Total games in the games dataset: {len(df_games)}")

# Do data analysis on non filtered data
df_unfiltered = df.copy()

min_reviews = 10
min_games = 100

# To get more meaningful results, let us ensure that the user has at least the minimum number of reviews
# and the game has at least the minimum number of reviews

df = df[ df['username'].isin(active_users[active_users > min_reviews].index) &
                         df['product_id'].isin(active_games[active_games > min_games].index)]

print(f"Total reviews after filtering: {len(df)}")

# Ensure product_id in df is treated as integers
reviewed_game_ids = df['product_id'].astype(int).unique()

# Filter df_games to keep only games that are reviewed
df_games_with_reviews = df_games[df_games['id'].isin(reviewed_game_ids)]

# Now df_games_with_reviews contains only games with at least one review
print(f"Number of games with reviews: {len(df_games_with_reviews)}")

df.head()

Number of duplicate (username, product_id) pairs: 0
Total reviews: 6844013
Total users from reviews: 2554103
Total games from reviews: 15471
Total games in the games dataset: 29245
Total reviews after filtering: 2064103
Number of games with reviews: 4811


Unnamed: 0,date,early_access,hours,page,page_order,product_id,text,username
6858188,2017-08-09,False,40.9,610,7,49520,So yea Borderlands. Umm Borderlands is game th...,I Need Healing
6131897,2017-08-09,False,6.7,24,8,57690,Best city builder in the world ! Awesome sound...,I Need Healing
1661023,2017-08-09,False,9.2,237,9,211420,Just Preapare to Die..,I Need Healing
446434,2014-09-03,False,3.7,345,6,224600,Game is fine but :/ the glyph.... Just f*ck gl...,I Need Healing
7307179,2016-09-22,False,10.8,1235,7,227300,Just tunn on the radio and boom !,I Need Healing


In [None]:
df = df.reset_index(drop=True)
df_unfiltered = df_unfiltered.reset_index(drop=True)

Unnamed: 0,date,early_access,hours,page,page_order,product_id,text,username
0,2017-08-09,False,40.9,610,7,49520,So yea Borderlands. Umm Borderlands is game th...,I Need Healing
1,2017-08-09,False,6.7,24,8,57690,Best city builder in the world ! Awesome sound...,I Need Healing
2,2017-08-09,False,9.2,237,9,211420,Just Preapare to Die..,I Need Healing
3,2014-09-03,False,3.7,345,6,224600,Game is fine but :/ the glyph.... Just f*ck gl...,I Need Healing
4,2016-09-22,False,10.8,1235,7,227300,Just tunn on the radio and boom !,I Need Healing


# Create Ratings

## Use sentiment analysis to create ratings from reviews:
1 - Negative Review

3 - Neutral Review

5 - Positive Review

In [18]:
# Perform Sentiment Analysis on review text, we need to give either positive, neutral or negative sentiment to each review
# We will use TextBlob library for this purpose
from textblob import TextBlob

def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 5 # Positive
    elif analysis.sentiment.polarity == 0:
        return 3 # Neutral
    else:
        return 1 # Negative

# Apply sentiment analysis to each review to create a sudo rating
df['rating_sentiment'] = df['text'].apply(get_sentiment)

df_unfiltered['rating_sentiment'] = df_unfiltered['text'].apply(get_sentiment)


In [19]:
df_unfiltered.to_csv("../../data/adjusted/steam_reviews_unfiltered.csv", index=False)

ValueError: Could not interpret value `rating_sentiment` for `y`. An entry with this name does not appear in `data`.

<Figure size 800x600 with 0 Axes>