In [13]:
# IMPORTANT: Before running this notebook, add the following line to the
# bottom of your .gitignore file: commodities_data.csv
# The generated CSV is too large for standard GitHub and can break your
# commits if it is not ignored before the file is generated.

In [14]:
import pandas as pd
import os
import json

In [16]:
def combine_json_to_dataframe(folder_path):
    """Combine the auction data stored in a folder of JSON files into one DataFrame.

    Every ``*.json`` file directly inside ``folder_path`` is loaded. A file
    whose top-level object contains an ``'auctions'`` key is flattened with
    ``pd.json_normalize`` (one row per auction record) and each of its rows is
    tagged with that file's ``'file_date'`` value in a new ``'datetime'``
    column. JSON files without an ``'auctions'`` key are skipped.

    Args:
        folder_path: Directory containing the JSON files.

    Returns:
        pandas.DataFrame: The rows of all processed files, concatenated in
        sorted filename order with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file in ``folder_path`` yielded auction data.
        KeyError: If a file has an ``'auctions'`` key but no ``'file_date'`` key.
    """
    all_dataframes = []  # One DataFrame per processed JSON file

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)

        # JSON text is UTF-8; do not depend on the platform's default encoding
        # (which would garble or reject non-ASCII item names on some systems).
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if 'auctions' not in data:
            continue

        # One row per auction record; nested objects become dotted columns.
        df = pd.json_normalize(data, 'auctions')

        # Tag every row with the date recorded in its source file.
        df['datetime'] = data['file_date']

        all_dataframes.append(df)

    # pd.concat() on an empty list fails with an opaque "No objects to
    # concatenate"; raise the same exception type with an actionable message.
    if not all_dataframes:
        raise ValueError(
            f"No JSON files with an 'auctions' key were found in {folder_path!r}"
        )

    return pd.concat(all_dataframes, ignore_index=True)


In [17]:
# Path to the folder containing the JSON files. This is a relative path, so it
# is resolved against the kernel's current working directory.
folder_path = 'data'

In [18]:
# Combine all JSON files in folder_path into a single DataFrame
# (one row per auction record, plus a 'datetime' column per source file).
final_dataframe = combine_json_to_dataframe(folder_path)

In [19]:
# Parse the 'datetime' strings (format: YYYY-MM-DD HH:MM) into pandas
# datetime values, overwriting the column in place.
final_dataframe['datetime'] = pd.to_datetime(final_dataframe['datetime'], format='%Y-%m-%d %H:%M')

In [20]:
# Preview the last rows of the combined DataFrame as a sanity check
final_dataframe.tail()

Unnamed: 0,id,quantity,unit_price,time_left,item.id,item.name,item.class,datetime
20807317,1891659518,2,1449800,LONG,200038,Enchant Ring - Devotion of Haste,Unknown,2023-12-24 08:41:00
20807318,1891659519,2,185000,VERY_LONG,7067,Elemental Earth,Trade Goods,2023-12-24 08:41:00
20807319,1891659520,304,139900,LONG,52185,Elementium Ore,Trade Goods,2023-12-24 08:41:00
20807320,1891659524,4,45000,LONG,192847,Sundered Onyx,Trade Goods,2023-12-24 08:41:00
20807321,1891659525,47,3000,VERY_LONG,33470,Frostweave Cloth,Trade Goods,2023-12-24 08:41:00


In [21]:
# Summarise the column dtypes, row count and memory usage of the combined frame
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20807322 entries, 0 to 20807321
Data columns (total 8 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          int64         
 1   quantity    int64         
 2   unit_price  int64         
 3   time_left   object        
 4   item.id     int64         
 5   item.name   object        
 6   item.class  object        
 7   datetime    datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 1.2+ GB


In [22]:
# Quantify rows with missing item metadata. A value counts as missing when it
# is NaN or the literal placeholder string 'Unknown'.
total_items = len(final_dataframe)
missing_name = final_dataframe['item.name'].isna() | (final_dataframe['item.name'] == 'Unknown')
missing_class = final_dataframe['item.class'].isna() | (final_dataframe['item.class'] == 'Unknown')
missing_both = missing_name & missing_class

# Count matching rows by summing the boolean masks (True counts as 1). This
# gives the same numbers as filtering the frame and reading .shape[0], without
# materialising three filtered copies of a very large DataFrame.
num_missing_name = int(missing_name.sum())
num_missing_class = int(missing_class.sum())
num_missing_both = int(missing_both.sum())

# Report
print(f"Total items: {total_items}")
print(f"Items missing name: {num_missing_name}")
print(f"Items missing class: {num_missing_class}")
print(f"Items missing both name and class: {num_missing_both}")


Total items: 20807322
Items missing name: 1639
Items missing class: 2069310
Items missing both name and class: 1639


In [23]:
# Save the combined DataFrame as a CSV file in the current working directory.
# index=False omits the row index, so only the data columns are written.
# NOTE: this is the large file that must be listed in .gitignore.
csv_file_name = 'commodities_data.csv'
final_dataframe.to_csv(csv_file_name, index=False)
print(f"Data saved to {csv_file_name}")


Data saved to commodities_data.csv
