In [15]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import json
from collections import defaultdict
from datetime import datetime

Our datasets have different keywords and different timestamps for the dates on which they were collected.

In [11]:
# Keywords of the datasets handled in this notebook.
keywords = ["renda familiar", "idealista", "extended"]
# Backward-compatible alias: the list was originally defined under this
# misspelled name, so keep it bound for any cell that still refers to it.
kewords = keywords

In [12]:
# Load the "renda familiar 2020" Excel workbook from the temporal landing
# directory into a DataFrame.
renda = pd.read_excel(
    "../temporal_landing/renda familiar 2020.xlsx"
)

In [17]:
# Source directory holding the daily JSON extractions, and target directory
# for the merged per-year files.
temporal_directory = '../temporal_landing/'
persistent_directory = '../persistent_landing/'

# Year string (taken from the filename) -> list of DataFrames loaded for that year
dfs_by_year = defaultdict(list)

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the merged files reproducible between runs.
for filename in sorted(os.listdir(temporal_directory)):
    # Only consider files named like YYYY_MM_dd_idealista.json
    if not filename.endswith('_idealista.json'):
        continue

    try:
        # Extract the year, month, and day from the filename
        parts = filename.split('_')
        if len(parts) < 4:
            print(f"Filename {filename} does not match the expected pattern. Skipping.")
            continue
        year, month, day = parts[0], parts[1], parts[2]

        # Validate the extracted parts; strptime raises ValueError when they
        # do not form a real date.
        datetime.strptime(f"{year}_{month}_{day}", "%Y_%m_%d")

        # Extraction date kept as a string built from the filename parts
        extraction_date = f"{year}-{month}-{day}"

        # Read the JSON file
        file_path = os.path.join(temporal_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # A list becomes one row per element; a single dictionary becomes a
        # one-row DataFrame. Anything else is reported and skipped.
        if isinstance(data, list):
            df = pd.DataFrame(data)
        elif isinstance(data, dict):
            df = pd.DataFrame([data])
        else:
            print(f"Unexpected data format in {filename}. Skipping.")
            continue

        # Tag every row with the date the file was extracted
        df['extraction_timestamp'] = extraction_date

        # Append the DataFrame to the list corresponding to the year
        dfs_by_year[year].append(df)

    except ValueError as ve:
        # Besides invalid dates, this also catches malformed JSON
        # (json.JSONDecodeError is a subclass of ValueError).
        print(f"Error processing filename {filename}: {ve}. Skipping.")
        continue

columns_to_remove = ['thumbnail', 'externalReference', "numPhotos", "showAddress",
                     "url", "distance", "hasVideo", "detailedType", "suggestedTexts",
                     "hasPlan", "has3DTour", "has360", "hasStaging", "parkingSpace",
                     "topNewDevelopment", "newDevelopmentFinished"]

# Make sure the output directory exists so that writing the merged files does
# not fail on a fresh checkout.
os.makedirs(persistent_directory, exist_ok=True)

# Loop through each year, merge the DataFrames, and save them to a new JSON file.
# NOTE(review): no de-duplication is performed here; rows repeated across the
# daily files are kept as they are.
for year, dfs in dfs_by_year.items():
    # Concatenate all DataFrames for the given year
    combined_df = pd.concat(dfs, ignore_index=True)

    # Drop the unwanted columns. errors='ignore' skips names that are absent
    # from this year's data instead of raising KeyError.
    combined_df = combined_df.drop(columns=columns_to_remove, errors='ignore')

    # Save the combined DataFrame to a new JSON file
    output_file = os.path.join(persistent_directory, f'{year}_idealista.json')
    combined_df.to_json(output_file, orient='records', indent=4)

    print(f'Merged file for {year} saved as {output_file}')

Merged file for 2020 saved as ../persistent_landing/2020_idealista.json
Merged file for 2021 saved as ../persistent_landing/2021_idealista.json
