In [4]:
import os
import json
import pandas as pd
import seaborn as sns
import numpy as np

from collections import defaultdict
from datetime import datetime

Our datasets use different keywords and carry different timestamps for the dates on which they were collected.

In [7]:
# Yearly "renda familiar" spreadsheets (2020 and 2021), read from the temporal landing zone.
renda2020 = pd.read_excel(io="../temporal_landing/renda familiar 2020.xlsx")
renda2021 = pd.read_excel(io="../temporal_landing/renda familiar 2021.xlsx")

# Look-up table; later cells join it on its "neighborhood" column to attach identifiers.
look_up_table = pd.read_csv(filepath_or_buffer="../temporal_landing/income_opendatabcn_extended.csv")

In [28]:
# Attach district/neighborhood identifiers from the look-up table to the 2020 income rows.
# A left join keeps every row of the income sheet, even when no neighborhood matches.
renda2020_merged = (
    pd.merge(
        left=renda2020[["Distric", "Barris", "RDLpc (€)"]],
        left_on="Barris",
        right=look_up_table[["neighborhood", "district_id", "neighborhood_id"]],
        right_on="neighborhood",
        how="left",
    )
    # "Barris" is the join key and duplicates "neighborhood" on matched rows, so drop it.
    .drop(columns="Barris")
    # Drop the last row by position instead of by the hard-coded label 73, so the cell
    # keeps working if the number of rows changes.
    # NOTE(review): the last row is presumably a non-neighborhood summary row - confirm.
    .iloc[:-1]
)

Unnamed: 0,Distric,Barris,RDLpc (€),neighborhood,district_id,neighborhood_id
0,Ciutat Vella,el Raval,11.178,el Raval,Q941385,Q1758503
1,Ciutat Vella,el Barri Gòtic,15.990,el Barri Gòtic,Q941385,Q17154
2,Ciutat Vella,la Barceloneta,13.246,la Barceloneta,Q941385,Q377070
3,Ciutat Vella,"Sant Pere, Santa Caterina i la Ribera",16.770,"Sant Pere, Santa Caterina i la Ribera",Q941385,Q2442135
4,l'Eixample,el Fort Pienc,21.045,el Fort Pienc,Q64124,Q2107762
...,...,...,...,...,...,...
68,Sant Marti,Diagonal Mar i el Front Marítim del Poblenou,37.346,Diagonal Mar i el Front Marítim del Poblenou,Q250935,Q2736444
69,Sant Marti,el Besòs i el Maresme,15.070,el Besòs i el Maresme,Q250935,Q2390761
70,Sant Marti,Provençals del Poblenou,20.246,Provençals del Poblenou,Q250935,Q6273541
71,Sant Marti,Sant Martí de Provençals,17.346,Sant Martí de Provençals,Q250935,Q3773169


Unnamed: 0,Distric,RDLpc (€),neighborhood,district_id,neighborhood_id
0,Ciutat Vella,11.178,el Raval,Q941385,Q1758503
1,Ciutat Vella,15.990,el Barri Gòtic,Q941385,Q17154
2,Ciutat Vella,13.246,la Barceloneta,Q941385,Q377070
3,Ciutat Vella,16.770,"Sant Pere, Santa Caterina i la Ribera",Q941385,Q2442135
4,l'Eixample,21.045,el Fort Pienc,Q64124,Q2107762
...,...,...,...,...,...
69,Sant Marti,15.070,el Besòs i el Maresme,Q250935,Q2390761
70,Sant Marti,20.246,Provençals del Poblenou,Q250935,Q6273541
71,Sant Marti,17.346,Sant Martí de Provençals,Q250935,Q3773169
72,Sant Marti,15.070,la Verneda i la Pau,Q250935,Q542473


In [17]:
# Landing-zone directories: the daily JSON snapshots are read from the temporal
# directory; the persistent directory is where per-year merged files are written.
temporal_directory = '../temporal_landing/'
persistent_directory = '../persistent_landing/'

# Maps a year string (taken from the filename, e.g. "2020") to the list of
# DataFrames loaded from that year's snapshot files.
dfs_by_year = defaultdict(list)

# os.listdir() returns entries in arbitrary order. Sorting makes the row order of
# the merged output reproducible (and chronological for zero-padded YYYY_MM_dd names).
for filename in sorted(os.listdir(temporal_directory)):
    # Only files named like YYYY_MM_dd_idealista.json are processed
    if not filename.endswith('_idealista.json'):
        continue

    try:
        # Expected layout after splitting: [YYYY, MM, dd, 'idealista.json']
        parts = filename.split('_')
        if len(parts) < 4:
            print(f"Filename {filename} does not match the expected pattern. Skipping.")
            continue
        year, month, day = parts[0], parts[1], parts[2]

        # Validation only: raises ValueError if the prefix is not a real calendar date
        datetime.strptime(f"{year}_{month}_{day}", "%Y_%m_%d")

        # Extraction date stored as a string, format: YYYY-MM-DD
        extraction_date = f"{year}-{month}-{day}"

        # Read the JSON payload; malformed JSON raises json.JSONDecodeError, which is a
        # ValueError subclass and is therefore handled by the except clause below.
        file_path = os.path.join(temporal_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        if isinstance(data, list):
            # A list of records becomes one row per element
            df = pd.DataFrame(data)
        elif isinstance(data, dict):
            # A single record becomes a one-row DataFrame
            df = pd.DataFrame([data])
        else:
            print(f"Unexpected data format in {filename}. Skipping.")
            continue

        # Tag every row with the date encoded in the filename
        df['extraction_timestamp'] = extraction_date

        # Group the snapshot under its year
        dfs_by_year[year].append(df)

    except ValueError as ve:
        print(f"Error processing filename {filename}: {ve}. Skipping.")
        continue

# Columns removed from every merged yearly file before it is saved
columns_to_remove = ['thumbnail', 'externalReference', "numPhotos", "showAddress",
                     "url", "distance", "hasVideo", "detailedType", "suggestedTexts",
                     "hasPlan", "has3DTour", "has360", "hasStaging", "parkingSpace",
                     "topNewDevelopment", "newDevelopmentFinished"]

# Make sure the output directory exists so that to_json() does not fail on a fresh checkout
os.makedirs(persistent_directory, exist_ok=True)

# For each year: concatenate its snapshots, drop the unwanted columns, and save one JSON file.
# NOTE: rows are concatenated as-is; no de-duplication is performed here.
for year, dfs in dfs_by_year.items():
    # Stack all snapshots of the year into a single frame with a fresh 0..n-1 index
    combined_df = pd.concat(dfs, ignore_index=True)

    # errors="ignore": a listed column may be absent from a given year's snapshots;
    # skip it instead of raising KeyError
    combined_df = combined_df.drop(columns=columns_to_remove, errors="ignore")

    # Write the merged frame as a list of JSON records
    output_file = os.path.join(persistent_directory, f'{year}_idealista.json')
    combined_df.to_json(output_file, orient='records', indent=4)

    print(f'Merged file for {year} saved as {output_file}')

Merged file for 2020 saved as ../persistent_landing/2020_idealista.json
Merged file for 2021 saved as ../persistent_landing/2021_idealista.json
