In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

try:
    df = pd.read_csv("df_final.csv")
except FileNotFoundError:
    exit()

df.head()



Unnamed: 0,cod_municipio,municipio,latitud,longitud,altitud,poblacion,centros_salud_10mil,farmacias_10mil,latitud_google_places,longitud_google_places,...,afiliados_agricultura_y_ganadería,afiliados_construcción,afiliados_minería_industria_y_energía,afiliados_otros_servicios,afiliados_servicios_a_empresas_y_financieros,afiliados_servicios_de_distribución_y_hostelería,afiliados_total,alumnos_total,cultura,IPVA
0,14,Acebeda (La),41.08697,-3.624634,1266.542,68.0,0,0,41.08697,-3.624634,...,0,1,2,8,8,11,30,0,0.0,
1,29,Ajalvir,40.53437,-3.481002,680.1722,4946.0,0,2,40.53437,-3.481002,...,15,110,253,578,521,691,2168,1336,0.0,
2,35,Alameda del Valle,40.9179,-3.843788,1109.934,256.0,0,39,40.9179,-3.843788,...,4,3,4,31,20,27,89,0,0.0,
3,40,Álamo (El),40.22972,-3.992688,606.2238,10413.0,0,2,40.22972,-3.992688,...,22,371,319,1164,773,1122,3771,4244,0.0,
4,88,Aldea del Fresno,40.32399,-4.202217,476.7994,3422.0,0,3,40.32399,-4.202217,...,27,149,70,340,213,283,1082,566,0.0,


In [64]:
if 'IPVA' in df.columns:
    df = df.drop(columns=['IPVA'])
else:
    print("Column 'IPVA' does not exist in the DataFrame.")

# positive variables
positive_vars = [
    "centros_salud_10mil", "farmacias_10mil",
    "n_gym", "gym_weighted_avg_rating",
    "n_restaurant",  "restaurant_weighted_avg_rating",
    "n_pharmacy",  "pharmacy_weighted_avg_rating",
    "n_school",  "school_weighted_avg_rating",
    "n_transport","transport_weighted_avg_rating",
    "afiliados_agricultura_y_ganadería", "afiliados_construcción",
    "afiliados_minería_industria_y_energía", "afiliados_otros_servicios",
    "afiliados_servicios_a_empresas_y_financieros",
    "afiliados_servicios_de_distribución_y_hostelería",
    "afiliados_total", "alumnos_total", "cultura",
]
# negative variables
negative_vars = ["paro_total", "paro_100", "distancia_capital"]

scaler = MinMaxScaler()


In [65]:
def commute_score(x):
    if pd.isna(x):
        return None
    if x < 30:
        return 1
    elif x <= 60:
        return 0.5
    else:
        return 0
    
df['commute_score'] = df['distancia_capital'].apply(commute_score)
positive_vars += ["commute_score"]
df['restaurant_total_reviews'] = np.log1p(df['restaurant_total_reviews']) 
df['gym_total_reviews'] = np.log1p(df['gym_total_reviews']) 
df['school_total_reviews'] = np.log1p(df['school_total_reviews']) 
df['pharmacy_total_reviews'] = np.log1p(df['pharmacy_total_reviews']) 
df['transport_total_reviews'] = np.log1p(df['transport_total_reviews'])


In [66]:
df_scaled = df.copy()

# scale all variables to [0, 1]
df_scaled[positive_vars + negative_vars] = scaler.fit_transform(
    df[positive_vars + negative_vars]
)

# invert negative variables
df_scaled[negative_vars] = 1 - df_scaled[negative_vars]
print(df_scaled)

     cod_municipio              municipio   latitud  longitud    altitud  \
0               14           Acebeda (La)  41.08697 -3.624634  1266.5420   
1               29                Ajalvir  40.53437 -3.481002   680.1722   
2               35      Alameda del Valle  40.91790 -3.843788  1109.9340   
3               40             Álamo (El)  40.22972 -3.992688   606.2238   
4               88       Aldea del Fresno  40.32399 -4.202217   476.7994   
..             ...                    ...       ...       ...        ...   
150           1795        Villar del Olmo  40.33669 -3.235551   680.3748   
151           1809  Villarejo de Salvanés  40.16873 -3.275975   759.0782   
152           1816   Villaviciosa de Odón  40.35710 -3.900590   647.7311   
153           1821  Villavieja del Lozoya  41.00592 -3.671118  1062.4740   
154           1837              Zarzalejo  40.54749 -4.181604  1105.8080   

     poblacion  centros_salud_10mil  farmacias_10mil  latitud_google_places  \
0       

In [76]:
categories = { 
    "salud": ["centros_salud_10mil", 
              "farmacias_10mil", 
              "n_pharmacy", 
              "pharmacy_weighted_avg_rating"], 
    "educacion": ["school_weighted_avg_rating",
                   "n_school", 
                   "alumnos_total"], 
    "cultural": ["cultura",
                 "n_gym", 
                 "gym_weighted_avg_rating",
                 "n_restaurant", 
                 "restaurant_weighted_avg_rating"], 
    "transporte": ["commute_score"], 
    "calidad_vida":["paro_100", 
                    "afiliados_total",] }
category_weights = {
    "salud": {
        "centros_salud_10mil": 0.4,
        "farmacias_10mil": 0.3,
        "n_pharmacy": 0.1,
        "pharmacy_weighted_avg_rating": 0.2
    },
    "educacion": {
        "school_weighted_avg_rating": 0.5,
        "n_school": 0.3,
        "alumnos_total": 0.2
    },
    "cultural": {
        "cultura": 0.3,
        "n_gym": 0.15,
        "gym_weighted_avg_rating": 0.2,
        "n_restaurant": 0.15,
        "restaurant_weighted_avg_rating": 0.2
    },
    "transporte": {
        "commute_score": 1.0
    },
    "calidad_vida": {
        "paro_100": 0.6,
        "afiliados_total": 0.4
    }
}


In [79]:
category_indices = pd.DataFrame(index=df_scaled.index)
for category, vars in categories.items():
    weights = category_weights[category]
    category_indices[category] = (
    pd.concat([df_scaled[var] * weights[var] for var in vars], axis=1)
    .sum(axis=1)
)


In [80]:
category_indices.head()


Unnamed: 0,salud,educacion,cultural,transporte,calidad_vida
0,0.205,0.545,0.190537,0.0,0.450426
1,0.24437,0.766337,0.556492,0.5,0.413636
2,0.377813,0.465,0.302021,0.0,0.434382
3,0.222431,0.741529,0.579033,0.5,0.329784
4,0.184679,0.550632,0.501691,0.5,0.2166
