In [1]:
import pandas as pd
import os
import json

In [3]:
def combine_json_to_dataframe(folder_path):
    """Load every ``.json`` file in a folder into one combined DataFrame.

    Each file is parsed with ``json.load``. When the parsed object has an
    ``'auctions'`` key, those records are flattened with
    ``pd.json_normalize`` and the file's ``'file_date'`` value is copied onto
    every row as a ``'datetime'`` column. Files without an ``'auctions'`` key
    are skipped.

    Args:
        folder_path: Directory to scan (non-recursively) for files whose name
            ends in ``.json``.

    Returns:
        pandas.DataFrame: The rows of all files, concatenated in sorted
        filename order, with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file in ``folder_path`` contributed any auctions.
        KeyError: If a file has ``'auctions'`` but no ``'file_date'``.
    """
    all_dataframes = []  # One DataFrame per contributing file

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the combined frame reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if 'auctions' not in data:
            continue

        # Flatten the auction records (nested objects become dotted columns)
        df = pd.json_normalize(data, 'auctions')

        # Tag every row with the timestamp recorded in its source file
        df['datetime'] = data['file_date']

        all_dataframes.append(df)

    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the folder.
    if not all_dataframes:
        raise ValueError(
            f"No JSON files with an 'auctions' key found in {folder_path!r}"
        )

    # Combine all DataFrames into one
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    return combined_df


In [4]:
# Folder that holds the JSON files to load; this is a relative path, so it is
# resolved against the current working directory.
folder_path = 'data'

In [5]:
# Read every .json file in folder_path and stack their 'auctions' records into
# a single DataFrame (each row carries its file's 'file_date' in 'datetime').
final_dataframe = combine_json_to_dataframe(folder_path)

In [6]:
# Parse the 'datetime' column into pandas timestamps; the values are expected
# to match the explicit year-month-day hour:minute format given below.
final_dataframe['datetime'] = pd.to_datetime(
    final_dataframe['datetime'],
    format='%Y-%m-%d %H:%M',
)

In [7]:
# Preview the first rows of the combined DataFrame (head() defaults to 5 rows)
final_dataframe.head()

Unnamed: 0,id,quantity,unit_price,time_left,item.id,item.name,item.class,datetime
0,1879838406,2,2420900000,SHORT,191331,Phial of Charged Isolation,Consumables,2023-12-23 13:41:00
1,1879838449,3,4000,SHORT,120293,Lukewarm Yak Roast Broth,Consumables,2023-12-23 13:41:00
2,1879838573,2,51300,SHORT,171352,Potion of Empowered Exorcisms,Consumables,2023-12-23 13:41:00
3,1879838720,5,10100,SHORT,191460,Hochenblume,Trade Goods,2023-12-23 13:41:00
4,1879838812,2,11500000,SHORT,139315,Grimoire of the Wrathguard,Glyphs,2023-12-23 13:41:00


In [9]:
# Summarize the combined DataFrame: row count, column dtypes and memory usage
final_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6812912 entries, 0 to 6812911
Data columns (total 8 columns):
 #   Column      Dtype         
---  ------      -----         
 0   id          int64         
 1   quantity    int64         
 2   unit_price  int64         
 3   time_left   object        
 4   item.id     int64         
 5   item.name   object        
 6   item.class  object        
 7   datetime    datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 415.8+ MB


In [10]:
# Boolean masks marking the rows whose item name / item class is null
missing_name = final_dataframe['item.name'].isnull()
missing_class = final_dataframe['item.class'].isnull()

# Select the rows where at least one of the two fields is null, then count them
missing_either = final_dataframe.loc[missing_name | missing_class]
num_missing_either = missing_either.shape[0]

print(f"Number of rows missing either item.name or item.class: {num_missing_either}")

Number of rows missing either item.name or item.class: 0
