In [2]:
import pandas as pd
import Utils as ut

## Sentiment Analysis

In [5]:
# Location of the cleaned reviews CSV.
path = 'data/csv/df_reviews_cl.csv'

# Read the reviews into a DataFrame and preview the first two rows.
df_reviews = pd.read_csv(filepath_or_buffer=path)
df_reviews.head(n=2)

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url
0,No data,"Posted November 5, 2011.",No data,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,No data,"Posted July 15, 2011.",No data,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,http://steamcommunity.com/profiles/76561197970...


In [13]:
# Score every review text with ut.get_sentiment and store the result in a
# new 'sentiment' column.
# NOTE(review): the meaning of each score is defined in Utils; the sample
# shown further down suggests 2 marks positive reviews - confirm there.

df_reviews = df_reviews.assign(
    sentiment=df_reviews['review'].apply(ut.get_sentiment)
)

In [42]:
# Keep only the text and its score for a quick visual check.
filtered = df_reviews.loc[:, ['review', 'sentiment']]

# Show reviews scored 2. The 0:20 slice is label-based, so only rows whose
# original index label falls between 0 and 20 (and survived the filter) appear.
filtered.loc[filtered['sentiment'] == 2].loc[0:20, :]

Unnamed: 0,review,sentiment
0,Simple yet with great replayability. In my opi...,2
1,It's unique and worth a playthrough.,2
2,Great atmosphere. The gunplay can be a bit chu...,2
3,I know what you think when you see this title ...,2
6,A suitably punishing roguelike platformer. Wi...,2
7,"""Run for fun? What the hell kind of fun is that?""",2
8,"Elegant integration of gameplay, story, world ...",2
10,Fun balance of tactics and strategy. Potentia...,2
11,"Fun world builder, with plenty of option of ho...",2
12,This game... is so fun. The fight sequences ha...,2


In [44]:
# The raw text is no longer needed once the sentiment score exists.
df_reviews = df_reviews.drop(columns='review')

In [46]:
# Persist the sentiment-scored reviews alongside the other CSVs: the
# "User Data" section reads this file back from data/csv/df_reviews_st.csv,
# so writing it to the working directory would leave that read pointing at
# a missing (or stale) file after a fresh Restart & Run All.
df_reviews.to_csv('data/csv/df_reviews_st.csv', index=False)

## Functions

Dataset preparation for the API functions.

### User Data

In [3]:
# Games catalogue.
path_g = 'data/csv/df_games_cl.csv'
df_games = pd.read_csv(path_g)

# Per-user item (library) records.
path_i = 'data/csv/df_items_cl.csv'
df_items = pd.read_csv(path_i)

# Reviews with the sentiment score, as saved by the Sentiment Analysis section.
path_r = 'data/csv/df_reviews_st.csv'
df_reviews = pd.read_csv(path_r)

In [7]:
# Summarise df_items before joining: the helper reports total, fully-null and
# duplicated row counts plus per-column type and null statistics.
ut.data_review(df_items)


Total rows:  5094105

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,item_id,[<class 'int'>],100.0,5094105,0.0,0
1,item_name,[<class 'str'>],100.0,5094105,0.0,0
2,playtime_forever,[<class 'int'>],100.0,5094105,0.0,0
3,playtime_2weeks,[<class 'int'>],100.0,5094105,0.0,0
4,user_id,[<class 'str'>],100.0,5094105,0.0,0
5,items_count,[<class 'int'>],100.0,5094105,0.0,0
6,steam_id,[<class 'int'>],100.0,5094105,0.0,0
7,user_url,[<class 'str'>],100.0,5094105,0.0,0


In [4]:
# Attach game attributes to every item record; only items whose item_id
# matches a game id are kept (inner join).
df_joined = pd.merge(
    df_items,
    df_games,
    how='inner',
    left_on='item_id',
    right_on='id',
)

In [8]:
# The items and games data were not collected over the same period, so the
# inner join does not return every record from the items DataFrame
# (4,244,831 joined rows versus 5,094,105 item rows).

ut.data_review(df_joined)


Total rows:  4244831

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,item_id,[<class 'int'>],100.0,4244831,0.0,0
1,item_name,[<class 'str'>],100.0,4244831,0.0,0
2,playtime_forever,[<class 'int'>],100.0,4244831,0.0,0
3,playtime_2weeks,[<class 'int'>],100.0,4244831,0.0,0
4,user_id,[<class 'str'>],100.0,4244831,0.0,0
5,items_count,[<class 'int'>],100.0,4244831,0.0,0
6,steam_id,[<class 'int'>],100.0,4244831,0.0,0
7,user_url,[<class 'str'>],100.0,4244831,0.0,0
8,publisher,[<class 'str'>],100.0,4244831,0.0,0
9,genres,[<class 'str'>],100.0,4244831,0.0,0


In [None]:
# Keep only the columns needed for the per-user spending summary.
# NOTE(review): the original note also listed `recommend`, but it was never
# part of `keep` - confirm whether it is actually needed here.

keep = ['user_id', 'price', 'items_count']

# Select the wanted columns in a single step instead of dropping the others
# one by one in place (each in-place drop rebuilds the multi-million-row
# frame). The boolean mask preserves the existing column order and, like the
# original loop, silently tolerates a missing column.
df_joined = df_joined.loc[:, df_joined.columns.isin(keep)]

In [11]:
# Spot-check a single user: the number of rows returned does not match the
# user's items_count because the inner join dropped the unmatched items.

df_joined.loc[df_joined['user_id'] == '76561197970982479']

Unnamed: 0,user_id,items_count,price
0,76561197970982479,277,9.99
9611,76561197970982479,277,4.99
15879,76561197970982479,277,4.99
19310,76561197970982479,277,4.99
22552,76561197970982479,277,4.99
...,...,...,...
1207039,76561197970982479,277,29.99
1208688,76561197970982479,277,59.99
1210724,76561197970982479,277,1.99
1210991,76561197970982479,277,29.99


In [40]:
# Build one row per user: items_count is averaged within each user and then
# truncated to an integer, and the prices of the user's joined items are
# summed and rounded to two decimals.

u_spentlog = (
    df_joined
    .groupby('user_id')
    .agg({'items_count': 'mean', 'price': 'sum'})
    .reset_index()
    .astype({'items_count': int})
    .round({'price': 2})
    .rename(columns={'items_count': 'total_items', 'price': 'total_spent'})
)

In [41]:
# Preview the first ten users of the summary.
u_spentlog.head(n=10)

Unnamed: 0,user_id,total_items,total_spent
0,--000--,58,402.77
1,--ace--,44,184.61
2,--ionex--,23,118.82
3,-2SV-vuLB-Kg,68,446.39
4,-404PageNotFound-,149,1541.0
5,-AnimeIsMyThing-,127,1243.91
6,-Azsael-,167,2479.47
7,-Beave-,47,81.64
8,-Encore-,24,287.78
9,-GM-Dragon,106,875.85


In [43]:
# Sanity-check the per-user summary: row counts plus per-column type and
# null statistics, as reported by the helper.
ut.data_review(u_spentlog)


Total rows:  68712

Total full null rows:  0

Total duplicated rows: 0


Unnamed: 0,Column,dType,No_Null_%,No_Null_Qty,Null_%,Null_Qty
0,user_id,[<class 'str'>],100.0,68712,0.0,0
1,total_items,[<class 'int'>],100.0,68712,0.0,0
2,total_spent,[<class 'float'>],100.0,68712,0.0,0


In [44]:
# Export the per-user summary as CSV, without the index column.
# NOTE(review): the "_pq" suffix hints at parquet, but the file is written
# as CSV - confirm what the consumer expects.
u_spentlog.to_csv(path_or_buf='userdata_pq.csv', index=False)