# MAIN NOTEBOOK

In [35]:
import pandas as pd
import numpy as np


In [36]:
# --- 1. Load the Raw Data ---
# Semicolon-separated, Latin-1 encoded export. The first three lines are
# skipped and no header row is parsed, so columns are addressed by position.
df_raw = pd.read_csv(
    "Ultimos 12 meses_N2024.csv",
    sep=';',
    encoding="latin-1",
    skiprows=3,
    header=None,
)

# --- 2. Define Headers ---
# Two identifier columns followed by 22 period labels, listed from 1ºT2025
# back to 4ºT 2019. The irregular spacing inside the labels is kept exactly
# as written because these strings become the column names.
location_headers = ['COD (NUTS2024)', 'Designação']
time_headers = [
    '1ºT2025',
    '4ºT 2024', '3ºT2024', '2ºT2024', '1ºT2024',
    '4ºT 2023', '3ºT2023', '2ºT2023', '1ºT2023',
    '4ºT 2022', '3ºT 2022', '2ºT2022', '1ºT 2022',
    '4ºT 2021', '3ºT 2021', '2ºT 2021', '1ºT 2021',
    '4ºT 2020', '3ºT 2020', '2ºT 2020', '1ºT 2020',
    '4ºT 2019',
]
full_headers = [*location_headers, *time_headers]

# --- 3. Extract and Build the Median DataFrame ---
# Positions 0-1 hold the identifier columns; positions 25-46 are taken as the
# 22 value columns (presumably the median series -- confirm against the file).
location_block = df_raw.iloc[:, :2]
median_block = df_raw.iloc[:, 25:47]
df_median = pd.concat([location_block, median_block], axis=1)
df_median.columns = full_headers

# --- 4. Clean and Convert Data Types ---
# Each period column is processed as text first: spaces are stripped, the
# '//' placeholder becomes NaN, and anything that still fails to parse is
# coerced to NaN by pd.to_numeric.
for period in time_headers:
    as_text = df_median[period].astype(str)
    without_spaces = as_text.str.replace(' ', '', regex=False)
    with_gaps = without_spaces.replace('//', np.nan, regex=False)
    df_median[period] = pd.to_numeric(with_gaps, errors='coerce')

# --- 5. Impute Missing Values (Targeted Interpolation) ---
print("\n--- Applying targeted interpolation... ---")
# Work on the numeric period columns only. Interpolation runs along each row
# (axis=1), i.e. across the periods of a single row.
numeric_data = (
    df_median[time_headers]
    .interpolate(method='linear', axis=1, limit_direction='both')
    # Safety net for any gap left at either end of a row. DataFrame.ffill /
    # DataFrame.bfill replace fillna(method=...), which is deprecated and
    # emits a FutureWarning on current pandas.
    .ffill(axis=1)
    .bfill(axis=1)
)
# Rows with no valid value in any period have nothing to interpolate or fill
# from, so they remain entirely NaN after this step.

# Put the imputed numeric data back into the main DataFrame
df_median[time_headers] = numeric_data

# --- 6. Final Setup ---
# Drop rows that have no 'Designação' value (the footer rows, per the original
# note) and use that column as the index. The result is reassigned instead of
# mutating with inplace=True, so the step reads as one explicit transformation.
df_median = (
    df_median
    .dropna(subset=['Designação'])
    .set_index('Designação')
)

# --- 7. Verify the Final Result ---
print("\n--- Final Check for Missing Values ---")
# This count is 0 only if every remaining row had at least one valid period
# value. Rows that were entirely NaN cannot be imputed and are still counted
# here; they have to be dropped separately.
print(f"Total remaining NaNs: {df_median.isnull().sum().sum()}")

print("\n--- Cleaned & Imputed DataFrame Head ---")
df_median.head()


--- Applying targeted interpolation... ---

--- Final Check for Missing Values ---
Total remaining NaNs: 66

--- Cleaned & Imputed DataFrame Head ---


  numeric_data.fillna(method='ffill', axis=1, inplace=True)
  numeric_data.fillna(method='bfill', axis=1, inplace=True)


Unnamed: 0_level_0,COD (NUTS2024),1ºT2025,4ºT 2024,3ºT2024,2ºT2024,1ºT2024,4ºT 2023,3ºT2023,2ºT2023,1ºT2023,...,1ºT 2022,4ºT 2021,3ºT 2021,2ºT 2021,1ºT 2021,4ºT 2020,3ºT 2020,2ºT 2020,1ºT 2020,4ºT 2019
Designação,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Portugal,PT,1845.0,1777.0,1709.0,1661.0,1633.0,1611.0,1579.0,1541.0,1509.0,...,1346.0,1297.0,1251.0,1218.0,1197.0,1190.0,1167.0,1146.0,1127.0,1094.0
Continente,1,1846.0,1778.0,1714.0,1667.0,1638.0,1618.0,1587.0,1550.0,1519.0,...,1351.0,1301.0,1257.0,1222.0,1200.0,1195.0,1169.0,1150.0,1131.0,1096.0
Norte,11,1621.0,1558.0,1492.0,1449.0,1417.0,1394.0,1367.0,1331.0,1304.0,...,1169.0,1133.0,1108.0,1079.0,1065.0,1047.0,1019.0,1000.0,977.0,948.0
Alto Minho,111,1313.0,1248.0,1193.0,1158.0,1124.0,1108.0,1091.0,1077.0,1073.0,...,972.0,945.0,925.0,895.0,893.0,891.0,870.0,865.0,853.0,846.0
Arcos de Valdevez,1111601,960.0,958.0,1000.0,998.0,828.0,813.0,766.0,754.0,760.0,...,690.0,635.0,680.0,705.0,740.0,794.0,711.0,726.0,693.0,672.0


In [37]:
# Drop rows where all time columns are NaN (there was nothing to impute from)
df_median_final = df_median.dropna(subset=time_headers, how='all')

# Verify that all NaNs are gone
print("\n--- Final Check after Dropping ---")
print(f"Total remaining NaNs: {df_median_final.isnull().sum().sum()}") # Should now be 0

# Save the final DataFrame. This is done before the preview because only the
# last expression of a cell is rendered; with to_csv last, head() showed nothing.
df_median_final.to_csv('final_median_prices.csv', sep=';', encoding='latin-1', index=True)

print("\n--- Final DataFrame Head ---")
df_median_final.head()


--- Final Check after Dropping ---
Total remaining NaNs: 0

--- Final DataFrame Head ---


In [38]:
# Apply the all-NaN row filter to df_median itself (the same filter used to
# build df_median_final). Reassigning instead of using inplace=True makes the
# rebinding explicit and keeps the cell safe to re-run.
df_median = df_median.dropna(subset=time_headers, how='all')

# Verify that all NaNs are gone
print("\n--- Final Check after Dropping All-NaN Rows ---")
# Expected to be 0, provided the non-period columns contain no NaN either.
print(f"Total remaining NaNs: {df_median.isnull().sum().sum()}")

print("\n--- Final DataFrame Head ---")
df_median.head()


--- Final Check after Dropping All-NaN Rows ---
Total remaining NaNs: 0

--- Final DataFrame Head ---


Unnamed: 0_level_0,COD (NUTS2024),1ºT2025,4ºT 2024,3ºT2024,2ºT2024,1ºT2024,4ºT 2023,3ºT2023,2ºT2023,1ºT2023,...,1ºT 2022,4ºT 2021,3ºT 2021,2ºT 2021,1ºT 2021,4ºT 2020,3ºT 2020,2ºT 2020,1ºT 2020,4ºT 2019
Designação,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Portugal,PT,1845.0,1777.0,1709.0,1661.0,1633.0,1611.0,1579.0,1541.0,1509.0,...,1346.0,1297.0,1251.0,1218.0,1197.0,1190.0,1167.0,1146.0,1127.0,1094.0
Continente,1,1846.0,1778.0,1714.0,1667.0,1638.0,1618.0,1587.0,1550.0,1519.0,...,1351.0,1301.0,1257.0,1222.0,1200.0,1195.0,1169.0,1150.0,1131.0,1096.0
Norte,11,1621.0,1558.0,1492.0,1449.0,1417.0,1394.0,1367.0,1331.0,1304.0,...,1169.0,1133.0,1108.0,1079.0,1065.0,1047.0,1019.0,1000.0,977.0,948.0
Alto Minho,111,1313.0,1248.0,1193.0,1158.0,1124.0,1108.0,1091.0,1077.0,1073.0,...,972.0,945.0,925.0,895.0,893.0,891.0,870.0,865.0,853.0,846.0
Arcos de Valdevez,1111601,960.0,958.0,1000.0,998.0,828.0,813.0,766.0,754.0,760.0,...,690.0,635.0,680.0,705.0,740.0,794.0,711.0,726.0,693.0,672.0
