In [1]:
# IMPORTANT: Before running this notebook, add the following line
# to the bottom of your .gitignore file: commodities_data.csv
# The generated file is too large for a standard GitHub repository and can
# break your commits if it is not ignored before the CSV file is generated.

In [2]:
import pandas as pd
import os
import json

In [3]:
def combine_json_to_dataframe(folder_path):
    """Load every ``.json`` file in ``folder_path`` into a single DataFrame.

    Each file is parsed as JSON. A file whose top-level object contains an
    ``'auctions'`` key is flattened with ``pd.json_normalize`` (one row per
    auction record) and every row is tagged with that file's ``'file_date'``
    value in a ``datetime`` column. Files without an ``'auctions'`` key are
    skipped.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.json`` files.

    Returns:
        One DataFrame holding the rows of all files, with a fresh integer
        index. Rows are ordered by file name, then by position in the file.

    Raises:
        ValueError: If no file in ``folder_path`` yields auction data.
        KeyError: If a file has ``'auctions'`` but no ``'file_date'``.
    """
    all_dataframes = []  # One DataFrame per usable JSON file

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)

        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding, which varies between systems.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Only a JSON object can carry the 'auctions' key; anything else
        # (list, string, number) is not an auction dump and is skipped.
        if not isinstance(data, dict) or 'auctions' not in data:
            continue

        df = pd.json_normalize(data, 'auctions')

        # Stamp every row with the date recorded in the file itself.
        df['datetime'] = data['file_date']

        all_dataframes.append(df)

    # pd.concat() on an empty list raises an unhelpful "No objects to
    # concatenate"; fail with the same exception type but say what is wrong.
    if not all_dataframes:
        raise ValueError(
            f"No JSON files with an 'auctions' key found in {folder_path!r}"
        )

    return pd.concat(all_dataframes, ignore_index=True)


In [4]:
# Directory (relative to the working directory) that holds the JSON files
folder_path = "data"

In [5]:
# Read every JSON file under folder_path and stack the results into one DataFrame
final_dataframe = combine_json_to_dataframe(folder_path=folder_path)

In [6]:
# Parse the 'datetime' strings (year-month-day hour:minute) into pandas datetimes
final_dataframe['datetime'] = pd.to_datetime(
    final_dataframe['datetime'],
    format='%Y-%m-%d %H:%M',
)

In [7]:
# Preview the last five rows of the combined DataFrame
final_dataframe.tail(n=5)

Unnamed: 0,id,quantity,unit_price,time_left,item.id,item.name,item.class,datetime
79852825,1910242848,12,25200,VERY_LONG,197755,Lava Beetle,Trade Goods,2023-12-29 00:41:00
79852826,1910242860,3,30600,VERY_LONG,191465,Saxifrage,Trade Goods,2023-12-29 00:41:00
79852827,1910242862,3,1500000,VERY_LONG,193227,Stonecrust Hide,Trade Goods,2023-12-29 00:41:00
79852828,1910242863,2,25200,VERY_LONG,197755,Lava Beetle,Trade Goods,2023-12-29 00:41:00
79852829,1910242868,319,19500,VERY_LONG,194966,Thousandbite Piranha,Trade Goods,2023-12-29 00:41:00


In [8]:
# Summarize the combined frame: row count, column dtypes and memory usage
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79852830 entries, 0 to 79852829
Data columns (total 8 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          int64         
 1   quantity    int64         
 2   unit_price  int64         
 3   time_left   object        
 4   item.id     int64         
 5   item.name   object        
 6   item.class  object        
 7   datetime    datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 4.8+ GB


In [9]:
# Boolean masks, one entry per row: a value is treated as missing when it is
# NaN or the string 'Unknown'.
total_items = len(final_dataframe)
missing_name = final_dataframe['item.name'].isna() | (final_dataframe['item.name'] == 'Unknown')
missing_class = final_dataframe['item.class'].isna() | (final_dataframe['item.class'] == 'Unknown')
missing_both = missing_name & missing_class

# Count the True entries of each mask directly. Summing a boolean Series
# gives the same number as filtering the frame and reading shape[0], but
# does not materialize a filtered copy of the frame for each count.
num_missing_name = int(missing_name.sum())
num_missing_class = int(missing_class.sum())
num_missing_both = int(missing_both.sum())

# Report
print(f"Total items: {total_items}")
print(f"Items missing name: {num_missing_name}")
print(f"Items missing class: {num_missing_class}")
print(f"Items missing both name and class: {num_missing_both}")


Total items: 79852830
Items missing name: 6902
Items missing class: 7323232
Items missing both name and class: 6902


In [10]:
# Write the combined DataFrame to disk as CSV, leaving out the row index
csv_file_name = 'commodities_data.csv'
final_dataframe.to_csv(path_or_buf=csv_file_name, index=False)
print(f"Data saved to {csv_file_name}")


Data saved to commodities_data.csv
