In [1]:
import pandas as pd
import os
import json

In [3]:
def combine_json_to_dataframe(folder_path):
    """Load every ``*.json`` file in a folder into one combined DataFrame.

    Each file is expected to contain a JSON object with an ``'auctions'``
    list and a ``'file_date'`` value. The auction records are flattened with
    ``pd.json_normalize`` and the file's ``'file_date'`` is copied onto every
    row in a ``'datetime'`` column. Files whose top-level object has no
    ``'auctions'`` key are skipped.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to scan. Only direct children are considered (no recursion),
        and only names ending in ``.json``.

    Returns
    -------
    pandas.DataFrame
        All auction rows concatenated with a fresh ``0..n-1`` index, in
        sorted-filename order. An empty DataFrame is returned when no file
        contributed any rows.

    Raises
    ------
    KeyError
        If a file contains ``'auctions'`` but no ``'file_date'``.
    json.JSONDecodeError
        If a ``.json`` file is not valid JSON.
    """
    all_dataframes = []  # One DataFrame per successfully parsed file

    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the combined frame reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, filename)

        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Skip payloads that are not objects or carry no auction list
        if not isinstance(data, dict) or 'auctions' not in data:
            continue

        df = pd.json_normalize(data, 'auctions')

        # Stamp every row with the snapshot time recorded in the file
        df['datetime'] = data['file_date']

        all_dataframes.append(df)

    # pd.concat() raises ValueError on an empty list, so handle that case here
    if not all_dataframes:
        return pd.DataFrame()

    return pd.concat(all_dataframes, ignore_index=True)


In [4]:
# Directory (relative to the working directory) that holds the JSON files to load
folder_path = "data"

In [5]:
# Read every JSON file under folder_path and stack the results into one DataFrame
final_dataframe = combine_json_to_dataframe(folder_path=folder_path)

In [6]:
# Parse the 'datetime' column into pandas datetime values using the
# year-month-day hour:minute layout given by the format string
final_dataframe['datetime'] = pd.to_datetime(
    final_dataframe['datetime'],
    format='%Y-%m-%d %H:%M',
)

In [11]:
# Preview the last five rows of the combined DataFrame
final_dataframe.tail(5)

Unnamed: 0,id,quantity,unit_price,time_left,item.id,item.name,item.class,datetime
6812907,1885539338,3,173100,VERY_LONG,52183,Pyrite Ore,Trade Goods,2023-12-22 22:41:00
6812908,1885539342,1000,33400,LONG,193217,Dense Hide,Trade Goods,2023-12-22 22:41:00
6812909,1885539343,3,57420000,LONG,200054,Enchant Weapon - Sophic Devotion,Unknown,2023-12-22 22:41:00
6812910,1885539346,5,1244800,VERY_LONG,190312,Khaz'gorite Ore,Trade Goods,2023-12-22 22:41:00
6812911,1885539344,1,2867000,LONG,200023,Enchant Bracer - Devotion of Speed,Unknown,2023-12-22 22:41:00


In [9]:
# Print a structural summary of the combined DataFrame: index range,
# column names with their dtypes, and approximate memory usage
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6812912 entries, 0 to 6812911
Data columns (total 8 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          int64         
 1   quantity    int64         
 2   unit_price  int64         
 3   time_left   object        
 4   item.id     int64         
 5   item.name   object        
 6   item.class  object        
 7   datetime    datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 415.8+ MB


In [13]:
total_items = len(final_dataframe)

# Boolean masks: a value counts as "missing" when it is NaN or equals the
# literal placeholder string 'Unknown'
missing_name = final_dataframe['item.name'].isna() | (final_dataframe['item.name'] == 'Unknown')
missing_class = final_dataframe['item.class'].isna() | (final_dataframe['item.class'] == 'Unknown')
missing_both = missing_name & missing_class

# Count the True entries by summing each mask. This yields the same numbers as
# filtering the frame and reading its row count, without building three
# filtered copies of the full DataFrame.
num_missing_name = int(missing_name.sum())
num_missing_class = int(missing_class.sum())
num_missing_both = int(missing_both.sum())

# Report
print(f"Total items: {total_items}")
print(f"Items missing name: {num_missing_name}")
print(f"Items missing class: {num_missing_class}")
print(f"Items missing both name and class: {num_missing_both}")


Total items: 6812912
Items missing name: 0
Items missing class: 721198
Items missing both name and class: 0
