In [38]:
#import relevant libraries
import pandas as pd
import json
from glob import glob
import time

In [32]:
#functions to read/manipulate the data
def print_with_sleep(message, sleep_time):
    print(message)
    time.sleep(sleep_time)

def get_details_df(folder_path):
    df = pd.DataFrame()
    #df = pd.concat((pd.read_json(file) for file in glob(folder_path + "*.json")), ignore_index = True)
    for file in glob(folder_path + "*.json"):
        with open(file, "r") as json_file:
            json_data = json.load(json_file)
            for attribute, value in json_data.items():
                df = df.append(value, ignore_index=True)            
    return df

def get_details_df_improved(folder_path):
    dict_list = []
    for file in glob(folder_path + "*.json"):
        with open(file, "r") as json_file:
            df = pd.read_json(json_file)
            dict_list.append(df.transpose())
    result = pd.concat(dict_list)
    return result

def get_details_dict(folder_path):
    details = dict()
    for file in glob(folder_path + "*.json"):
        with open(file, "r") as json_file:
            json_data = json.load(json_file)
    return details        

#Function used to get userscore
def divide(positive, negative):
    return positive / (positive+negative)
    if positive == 0:
        return 0
    return positive / (positive + negative)

def convert_cents_to_dollars(price_column_series):
    column = price_column_series.astype('string')
    column = column.astype(int) / 100
    return column

In [33]:
#Weirdly slow - don't use
#chunks_folder_path = "./app_details_chunks/"
#df = get_details_df(chunks_folder_path)
#df
#df.info()
#df.describe()

In [34]:
#A more effective way to get the data
chunks_folder_path = "./app_details_chunks/"
all_details_df = get_details_df_improved(chunks_folder_path)
all_details_df


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu,languages,genre,tags
10,10,Counter-Strike,Valve,Valve,,184180,4787,0,"10,000,000 .. 20,000,000",11054,339,220,81,999,999,0,13899,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 5370, 'FPS': 4794, 'Multiplayer': 3..."
1000000,1000000,ASCENXION,IndigoBlue Game Studio,PsychoFlux Entertainment,,26,4,0,"0 .. 20,000",0,0,0,0,999,999,0,0,"Korean, English, Simplified Chinese","Action, Adventure, Indie","{'Shoot 'Em Up': 186, 'Metroidvania': 181, 'Bu..."
1000010,1000010,Crown Trick,NEXT Studios,"Team17, NEXT Studios",,3568,526,0,"200,000 .. 500,000",489,0,615,0,799,1999,60,217,"Simplified Chinese, English, Japanese, Traditi...","Adventure, Indie, RPG, Strategy","{'Rogue-like': 260, 'Turn-Based Combat': 244, ..."
1000030,1000030,"Cook, Serve, Delicious! 3?!",Vertigo Gaming Inc.,Vertigo Gaming Inc.,,1347,79,0,"50,000 .. 100,000",310,0,367,0,1999,1999,0,41,English,"Action, Indie, Simulation, Strategy","{'Typing': 217, 'Casual': 208, 'Management': 2..."
1000040,1000040,细胞战争,DoubleC Games,DoubleC Games,,0,1,0,"0 .. 20,000",0,0,0,0,199,199,0,0,"English, Not supported, Simplified Chinese","Action, Casual, Indie, Simulation","{'Action': 22, 'Casual': 22, 'Indie': 21, 'Sim..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369950,1369950,亡命尸潮惊魂夜 Escape Zombies At Night,morescore studio,morescore studio,,5,3,0,"0 .. 20,000",0,0,0,0,99,99,0,0,"English, Simplified Chinese","Action, Adventure, Indie, RPG","{'Post-apocalyptic': 103, 'Survival': 96, 'Adv..."
1369980,1369980,Abyss The Forgotten Past,OWG Studios,OWG Studios,,9,4,0,"0 .. 20,000",0,0,0,0,399,399,0,0,"English, Portuguese - Brazil","Action, Adventure, Indie, RPG, Early Access","{'Adventure': 60, 'Action': 52, 'Action-Advent..."
1370000,1370000,Max and the Book of Chaos,Orenji Games Entertainment,Orenji Games Entertainment,,5,1,0,"0 .. 20,000",0,0,0,0,899,899,0,0,"English, Spanish - Spain","Action, Adventure, Indie","{'Action': 68, 'Adventure': 61, 'Action-Advent..."
1370040,1370040,Underlings,One Man Games,One Man Games,,13,6,0,"0 .. 20,000",0,0,0,0,999,999,0,0,English,"Action, Adventure, Indie, RPG, Simulation","{'Open World Survival Craft': 327, 'Survival':..."


In [35]:
#Clean the data
data_df = all_details_df

#Remove games with too much issing data
#data_df[data_df['price'].isnull() == True]
#remove_these_games = ['530940', '657860', '684740', '846260']
data_df = data_df.drop(data_df.loc[data_df['price'].isnull() == True].index)
#Remove games without any language defined
data_df = data_df.drop(data_df.loc[data_df['languages'].isnull() == True].index)



data_df.appid = data_df.appid.astype(int)
data_df.name = data_df.name.astype('string')
data_df.developer = data_df.developer.astype('string')
data_df.publisher = data_df.publisher.astype('string')
#data_df.score_rank = data_df.score_rank.astype('category')
data_df = data_df.drop('score_rank', 1)
data_df.positive = data_df.positive.astype(int)
data_df.negative = data_df.negative.astype(int)
data_df.userscore = divide(data_df.positive, data_df.negative)
data_df.owners = data_df.owners.astype('category')
data_df.average_forever = data_df.average_forever.astype(int)
data_df.average_2weeks = data_df.average_2weeks.astype(int)
data_df.median_forever = data_df.median_forever.astype(int)
data_df.median_2weeks = data_df.median_2weeks.astype(int)
data_df.price = convert_cents_to_dollars(data_df.price)
data_df.initialprice = convert_cents_to_dollars(data_df.initialprice)
data_df = data_df.drop('discount', 1)
data_df.ccu = data_df.ccu.astype(int)
#Leave languages as is for now
data_df = data_df.drop('genre', 1)
#Leave tags as is for now

data_df.info()
data_df.dtypes
data_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42631 entries, 10 to 1370130
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   appid            42631 non-null  int32   
 1   name             42631 non-null  string  
 2   developer        42631 non-null  string  
 3   publisher        42631 non-null  string  
 4   positive         42631 non-null  int32   
 5   negative         42631 non-null  int32   
 6   userscore        42575 non-null  float64 
 7   owners           42631 non-null  category
 8   average_forever  42631 non-null  int32   
 9   average_2weeks   42631 non-null  int32   
 10  median_forever   42631 non-null  int32   
 11  median_2weeks    42631 non-null  int32   
 12  price            42631 non-null  float64 
 13  initialprice     42631 non-null  float64 
 14  ccu              42631 non-null  int32   
 15  languages        42631 non-null  object  
 16  tags             42631 non-null  obje

Unnamed: 0,appid,name,developer,publisher,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,ccu,languages,tags
10,10,Counter-Strike,Valve,Valve,184180,4787,0.974668,"10,000,000 .. 20,000,000",11054,339,220,81,9.99,9.99,13899,"English, French, German, Italian, Spanish - Sp...","{'Action': 5370, 'FPS': 4794, 'Multiplayer': 3..."
1000000,1000000,ASCENXION,IndigoBlue Game Studio,PsychoFlux Entertainment,26,4,0.866667,"0 .. 20,000",0,0,0,0,9.99,9.99,0,"Korean, English, Simplified Chinese","{'Shoot 'Em Up': 186, 'Metroidvania': 181, 'Bu..."
1000010,1000010,Crown Trick,NEXT Studios,"Team17, NEXT Studios",3568,526,0.871519,"200,000 .. 500,000",489,0,615,0,7.99,19.99,217,"Simplified Chinese, English, Japanese, Traditi...","{'Rogue-like': 260, 'Turn-Based Combat': 244, ..."
1000030,1000030,"Cook, Serve, Delicious! 3?!",Vertigo Gaming Inc.,Vertigo Gaming Inc.,1347,79,0.9446,"50,000 .. 100,000",310,0,367,0,19.99,19.99,41,English,"{'Typing': 217, 'Casual': 208, 'Management': 2..."
1000040,1000040,细胞战争,DoubleC Games,DoubleC Games,0,1,0.0,"0 .. 20,000",0,0,0,0,1.99,1.99,0,"English, Not supported, Simplified Chinese","{'Action': 22, 'Casual': 22, 'Indie': 21, 'Sim..."
1000080,1000080,Zengeon,IndieLeague Studio,2P Games,1004,402,0.714083,"50,000 .. 100,000",8,0,13,0,9.99,19.99,4,"English, Simplified Chinese, Traditional Chine...","{'Action': 91, 'Indie': 86, 'RPG': 82, 'Advent..."
1000110,1000110,Jumping Master(跳跳大咖),重庆环游者网络科技,重庆环游者网络科技,52,33,0.611765,"20,000 .. 50,000",0,0,0,0,0.0,0.0,0,"English, Simplified Chinese, Traditional Chinese","{'Free to Play': 26, 'Massively Multiplayer': ..."
1000130,1000130,Cube Defender,Simon Codrington,Simon Codrington,6,0,1.0,"0 .. 20,000",0,0,0,0,2.99,2.99,0,English,"{'Indie': 31, 'Casual': 31, 'Tower Defense': 1..."
1000280,1000280,Tower of Origin2-Worm's Nest,Villain Role,Villain Role,18,5,0.782609,"0 .. 20,000",0,0,0,0,13.99,13.99,1,"Simplified Chinese, English, Traditional Chinese","{'Turn-Based Strategy': 40, 'Word Game': 38, '..."
1000310,1000310,人气动漫大乱斗,海南众铖互娱网络科技有限公司,海南众铖互娱网络科技有限公司,1,5,0.166667,"0 .. 20,000",0,0,0,0,0.99,0.99,0,"English, Not supported, Simplified Chinese","{'RPG': 32, 'Action': 31, 'Strategy': 29, 'Com..."


In [36]:
#Save cleaned data for analyzing
#use csv because easy
data_df.to_csv("cleaned_data.csv")