In [69]:
import json
import os
from copy import deepcopy
from datetime import datetime


In [40]:
def print_dict(dict_obj):
    """Pretty-print ``dict_obj`` to stdout as JSON.

    The object is serialized with a two-space indent and non-ASCII
    characters are emitted verbatim instead of as ``\\uXXXX`` escapes.
    """
    rendered = json.dumps(dict_obj, ensure_ascii=False, indent=2)
    print(rendered)

In [73]:
def drop_uninterested_fields_from_dict(my_dict, interesting_fields: list) -> dict:
    """Remove, in place, every key of ``my_dict`` not listed in ``interesting_fields``.

    Args:
        my_dict: The dict to prune. Anything that is not a dict (``None``,
            a string, a list, a number, ...) is returned untouched, so the
            function can be fed the result of ``some_dict.get(key)`` directly.
        interesting_fields: Keys that must be kept.

    Returns:
        The same ``my_dict`` object that was passed in (mutated when it is a dict).
    """
    # Guard on "is a dict" rather than "is not a str": other truthy non-dict
    # values (lists, ints) have no .keys()/.pop() and used to raise.
    if not isinstance(my_dict, dict):
        return my_dict

    # Collect the keys first; a dict must not change size while it is iterated.
    unwanted_keys = [key for key in my_dict if key not in interesting_fields]
    for key in unwanted_keys:
        del my_dict[key]
    return my_dict

def unbox_one_level_nested_sub_dicts(my_dict) -> dict:
    """Replace, in place, every single-key sub-dict by the value it wraps.

    For example ``{"fuel_cb": {"name": "Diesel"}}`` becomes
    ``{"fuel_cb": "Diesel"}``. Sub-dicts with zero or several keys, and
    values that are not dicts, are left as they are. Only the first nesting
    level is inspected.

    Args:
        my_dict: The dict to flatten. Anything that is not a dict is
            returned untouched.

    Returns:
        The same ``my_dict`` object that was passed in (mutated when it is a dict).
    """
    # Guard on "is a dict" rather than "is not a str": other truthy non-dict
    # values (lists, ints) have no .keys() and used to raise.
    if not isinstance(my_dict, dict):
        return my_dict

    # Snapshot the items so values can be reassigned while looping.
    for key, value in list(my_dict.items()):
        if isinstance(value, dict) and len(value) == 1:
            my_dict[key] = next(iter(value.values()))

    return my_dict

def add_none_for_missing_interesting_fields(my_dict, interesting_fields) -> dict:
    """Ensure, in place, that ``my_dict`` has a key for every interesting field.

    Fields that are absent are added with the value ``None``; fields that
    are already present keep their current value.

    Args:
        my_dict: The dict to complete. Anything that is not a dict is
            returned untouched. An empty dict IS completed, since it is
            precisely the case where every field is missing.
        interesting_fields: Iterable of keys that must exist afterwards.

    Returns:
        The same ``my_dict`` object that was passed in (mutated when it is a dict).
    """
    if not isinstance(my_dict, dict):
        return my_dict

    for interesting_field in interesting_fields:
        # setdefault only writes when the key is missing.
        my_dict.setdefault(interesting_field, None)

    # The function is annotated "-> dict" and its siblings return the dict;
    # return it here too instead of implicitly returning None.
    return my_dict

def fill_vehicle_age(my_dict, now=None):
    """Compute the vehicle age in whole years and store it under ``"vehicle_age"``.

    The age is measured from ``"in_operation_date"`` when that field is set,
    otherwise from ``"manufacturing_date"``. Both are expected to be
    ``"%Y-%m-%d"`` strings. When neither is set (missing, ``None`` or empty),
    ``"vehicle_age"`` is set to ``None``.

    The age is approximated as ``days_elapsed / 365.25`` truncated to an int.

    Args:
        my_dict: The record to update in place.
        now: Reference ``datetime`` to measure the age against. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible results.

    Returns:
        The same ``my_dict`` object that was passed in.

    Raises:
        ValueError: If the selected date string is not in ``%Y-%m-%d`` format.
    """
    vehicle_age_field_name = "vehicle_age"

    # .get() so that a record lacking one of the date keys does not raise KeyError.
    reference_date = my_dict.get("in_operation_date") or my_dict.get("manufacturing_date")

    if not reference_date:
        my_dict[vehicle_age_field_name] = None
        return my_dict

    if now is None:
        now = datetime.now()

    delta = now - datetime.strptime(reference_date, "%Y-%m-%d")
    my_dict[vehicle_age_field_name] = int(delta.days / 365.25)
    return my_dict


In [74]:
# Load all data
#
# Every file in the search-results folder is parsed as JSON and the entries
# of its "results" list are appended to all_results_data.

search_results_dir = "./data/search-results"

# os.listdir returns entries in arbitrary order; sort them so that the order
# of the collected ads is the same on every run.
files = sorted(os.listdir(search_results_dir))

all_results_data = []
for file in files:
    file_path = os.path.join(search_results_dir, file)
    if not os.path.isfile(file_path):
        # Sub-directories cannot be opened as files; skip them.
        continue
    # Read explicitly as UTF-8 instead of relying on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    all_results_data.extend(results["results"])

In [77]:
# Reduce every ad to the fields of interest:
#   1. keep only the top-level keys listed in interesting_fields,
#   2. inside each nested dict keep only its interesting key(s),
#   3. unbox nested dicts that are left with a single key,
#   4. add None for interesting fields that are missing,
#   5. derive "vehicle_age" from the date fields,
#   6. keep only interesting_fields_after_preprocessing (drops the raw dates).
interesting_fields = ["in_operation_date", "manufacturing_date", "id", "category", "manufacturer_cb", "model_cb", "price", "tachometer", "locality", "fuel_cb", "gearbox_cb"]
interesting_fields_after_preprocessing = ["id", "category", "manufacturer_cb", "model_cb", "price", "tachometer", "locality", "fuel_cb", "gearbox_cb", "vehicle_age"]
interesting_fields_locality = ["region"]
interesting_fields_fuel_cb = ["name"]
interesting_fields_gearbox_cb = ["name"]
interesting_fields_manufacturer_cb = ["name"]
interesting_fields_model_cb = ["name"]
interesting_fields_category = ["name"]

# Nested field -> keys to keep inside it.
# NOTE(review): interesting_fields_category was defined but never applied,
# unlike its siblings; it is applied here so "category" is reduced the same
# way. Confirm this matches the intended output.
nested_interesting_fields = {
    "locality": interesting_fields_locality,
    "fuel_cb": interesting_fields_fuel_cb,
    "gearbox_cb": interesting_fields_gearbox_cb,
    "manufacturer_cb": interesting_fields_manufacturer_cb,
    "model_cb": interesting_fields_model_cb,
    "category": interesting_fields_category,
}

# Work on a copy so the loaded data stays intact and this cell can be re-run.
all_results = deepcopy(all_results_data)

for result in all_results:
    drop_uninterested_fields_from_dict(result, interesting_fields)
    for nested_field, nested_keys in nested_interesting_fields.items():
        nested_value = result.get(nested_field)
        # Only dicts can be pruned; leave missing or scalar values alone.
        if isinstance(nested_value, dict):
            drop_uninterested_fields_from_dict(nested_value, nested_keys)
    unbox_one_level_nested_sub_dicts(result)
    add_none_for_missing_interesting_fields(result, interesting_fields)
    fill_vehicle_age(result)
    drop_uninterested_fields_from_dict(result, interesting_fields_after_preprocessing)
    

In [78]:
# Persist the preprocessed ads. The JSON keeps non-ASCII characters verbatim
# (ensure_ascii=False), so the file is written explicitly as UTF-8 instead of
# with the locale-dependent default encoding.
with open("./data/all_ads.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False, sort_keys=True)
