# 

In [1]:
#SOURCES
dataset = "https://datos.gob.mx/busca/dataset/indicadores-de-pobreza-pobreza-por-ingresos-rezago-social-y-gini-a-nivel-municipal1990-200-2010"
head_description = "https://www.coneval.org.mx/Informes/Pobreza/Datos_abiertos/Indicadores_municipales/Indicadores_municipales_sabana_DIC.txt"

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

In [3]:
# Start by creating a dataframe for the dataset
file = 'Indicadores_municipales_sabana_DA.csv'
df = pd.read_csv(file, encoding='latin-1')
var_Ytarget = df["ic_segsoc"] #target variable
var_Ytarget.describe()

count    2456.000000
mean       78.753363
std        15.354490
min        24.201883
25%        70.657546
50%        82.849633
75%        90.958454
max        99.283028
Name: ic_segsoc, dtype: float64

In [4]:
# See number of empty spaces
null_data = df.isna().sum().sum()
print("null values =>", null_data)

null values => 305


In [5]:
# Cleaning the empty spaces
df.bfill(inplace=True) #backwards
df.ffill(inplace=True) #forwards
null_data = df.isna().sum().sum()
print("null values =>", null_data)

null values => 0


In [6]:
rows = df.shape[0]
columns = df.shape[1]
print(f"rows => {rows} x columns => {columns}")
df

rows => 2456 x columns => 139


Unnamed: 0,ent,nom_ent,mun,clave_mun,nom_mun,pobtot_ajustada,pobreza,pobreza_e,pobreza_m,vul_car,...,pobreza_alim_10,pobreza_cap_90,pobreza_cap_00,pobreza_cap_10,pobreza_patrim_90,pobreza_patrim_00,pobreza_patrim_10,gini_90,gini_00,gini_10
0,1,Aguascalientes,1,1001,Aguascalientes,794304,30.531104,2.264478,28.266627,27.983320,...,11.805700,20.4,12.7,18.474600,43.4,33.7,41.900398,0.473,0.425,0.422628
1,1,Aguascalientes,2,1002,Asientos,48592,67.111172,8.040704,59.070468,22.439389,...,21.993299,39.9,29.0,30.980801,64.2,48.9,59.175800,0.379,0.533,0.343879
2,1,Aguascalientes,3,1003,Calvillo,53104,61.360527,7.241238,54.119289,29.428583,...,19.266800,39.5,33.1,28.259199,63.9,57.9,56.504902,0.414,0.465,0.386781
3,1,Aguascalientes,4,1004,Cosío,14101,52.800458,4.769001,48.031458,27.128568,...,14.303200,35.2,21.0,22.386101,59.7,40.1,51.164501,0.392,0.541,0.344984
4,1,Aguascalientes,5,1005,Jesús María,101379,45.338512,6.084037,39.254475,26.262912,...,15.085100,36.6,22.6,22.139999,60.6,42.2,45.703899,0.391,0.469,0.458083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,32,Zacatecas,54,32054,Villa Hidalgo,21016,74.848837,12.301183,62.547654,19.229856,...,30.055300,51.8,54.8,41.368999,73.5,70.9,70.859596,0.403,0.589,0.342037
2452,32,Zacatecas,55,32055,Villanueva,27385,65.450191,10.203506,55.246687,23.623556,...,13.138800,34.2,25.9,20.563601,57.8,44.1,46.659199,0.422,0.463,0.362527
2453,32,Zacatecas,56,32056,Zacatecas,117528,29.541959,3.535624,26.006335,16.644262,...,7.164800,15.7,20.7,12.115300,36.6,41.8,32.302700,0.528,0.498,0.436339
2454,32,Zacatecas,57,32057,Trancoso,20456,78.374962,14.607016,63.767946,13.750759,...,21.285900,36.2,36.4,30.037100,60.5,54.7,57.394501,0.380,0.483,0.365307


In [7]:
years = ["90","00","05","10","_00", "_05", "_10", "_90"]

# Filtra las columnas que no terminan con los sufijos de años y crea un nuevo DataFrame
new_df = df[[col for col in df.columns if not col.endswith(tuple(years))]]


In [8]:
# Based on the selected target create a new column and fill with 0/1
df['secure_live'] = (var_Ytarget <= 50).astype(int) 
i_secure_live = df.columns.get_loc('secure_live')
print(i_secure_live)

# Deleting the old target column.
df_compare = df[['ic_segsoc', 'secure_live']]
print(df_compare)
df = df.drop(columns='ic_segsoc')
df

139
      ic_segsoc  secure_live
0     41.799885            1
1     78.003570            0
2     80.051980            0
3     65.831374            0
4     52.616992            0
...         ...          ...
2451  76.550988            0
2452  74.542926            0
2453  32.666426            1
2454  83.235286            0
2455  76.211864            0

[2456 rows x 2 columns]


Unnamed: 0,ent,nom_ent,mun,clave_mun,nom_mun,pobtot_ajustada,pobreza,pobreza_e,pobreza_m,vul_car,...,pobreza_cap_90,pobreza_cap_00,pobreza_cap_10,pobreza_patrim_90,pobreza_patrim_00,pobreza_patrim_10,gini_90,gini_00,gini_10,secure_live
0,1,Aguascalientes,1,1001,Aguascalientes,794304,30.531104,2.264478,28.266627,27.983320,...,20.4,12.7,18.474600,43.4,33.7,41.900398,0.473,0.425,0.422628,1
1,1,Aguascalientes,2,1002,Asientos,48592,67.111172,8.040704,59.070468,22.439389,...,39.9,29.0,30.980801,64.2,48.9,59.175800,0.379,0.533,0.343879,0
2,1,Aguascalientes,3,1003,Calvillo,53104,61.360527,7.241238,54.119289,29.428583,...,39.5,33.1,28.259199,63.9,57.9,56.504902,0.414,0.465,0.386781,0
3,1,Aguascalientes,4,1004,Cosío,14101,52.800458,4.769001,48.031458,27.128568,...,35.2,21.0,22.386101,59.7,40.1,51.164501,0.392,0.541,0.344984,0
4,1,Aguascalientes,5,1005,Jesús María,101379,45.338512,6.084037,39.254475,26.262912,...,36.6,22.6,22.139999,60.6,42.2,45.703899,0.391,0.469,0.458083,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,32,Zacatecas,54,32054,Villa Hidalgo,21016,74.848837,12.301183,62.547654,19.229856,...,51.8,54.8,41.368999,73.5,70.9,70.859596,0.403,0.589,0.342037,0
2452,32,Zacatecas,55,32055,Villanueva,27385,65.450191,10.203506,55.246687,23.623556,...,34.2,25.9,20.563601,57.8,44.1,46.659199,0.422,0.463,0.362527,0
2453,32,Zacatecas,56,32056,Zacatecas,117528,29.541959,3.535624,26.006335,16.644262,...,15.7,20.7,12.115300,36.6,41.8,32.302700,0.528,0.498,0.436339,1
2454,32,Zacatecas,57,32057,Trancoso,20456,78.374962,14.607016,63.767946,13.750759,...,36.2,36.4,30.037100,60.5,54.7,57.394501,0.380,0.483,0.365307,0


In [9]:
#Deleting not relevant columns with numerical and string type
df = df.drop(columns=df.columns[:5],axis=1)
df 

Unnamed: 0,pobtot_ajustada,pobreza,pobreza_e,pobreza_m,vul_car,vul_ing,npnv,ic_rezedu,ic_asalud,ic_cv,...,pobreza_cap_90,pobreza_cap_00,pobreza_cap_10,pobreza_patrim_90,pobreza_patrim_00,pobreza_patrim_10,gini_90,gini_00,gini_10,secure_live
0,794304,30.531104,2.264478,28.266627,27.983320,8.419106,33.066469,14.970553,24.034493,4.721187,...,20.4,12.7,18.474600,43.4,33.7,41.900398,0.473,0.425,0.422628,1
1,48592,67.111172,8.040704,59.070468,22.439389,5.557604,4.891835,21.222712,15.514032,11.062247,...,39.9,29.0,30.980801,64.2,48.9,59.175800,0.379,0.533,0.343879,0
2,53104,61.360527,7.241238,54.119289,29.428583,2.921336,6.289554,27.361207,20.812551,9.880071,...,39.5,33.1,28.259199,63.9,57.9,56.504902,0.414,0.465,0.386781,0
3,14101,52.800458,4.769001,48.031458,27.128568,7.709276,12.361698,20.889023,14.071657,11.443449,...,35.2,21.0,22.386101,59.7,40.1,51.164501,0.392,0.541,0.344984,0
4,101379,45.338512,6.084037,39.254475,26.262912,8.279864,20.118712,20.578144,16.567818,12.936542,...,36.6,22.6,22.139999,60.6,42.2,45.703899,0.391,0.469,0.458083,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,21016,74.848837,12.301183,62.547654,19.229856,3.177689,2.743618,27.350040,36.056322,5.679670,...,51.8,54.8,41.368999,73.5,70.9,70.859596,0.403,0.589,0.342037,0
2452,27385,65.450191,10.203506,55.246687,23.623556,5.007426,5.918827,29.914879,53.313420,2.857021,...,34.2,25.9,20.563601,57.8,44.1,46.659199,0.422,0.463,0.362527,0
2453,117528,29.541959,3.535624,26.006335,16.644262,8.828019,44.985759,11.936088,18.316528,2.417258,...,15.7,20.7,12.115300,36.6,41.8,32.302700,0.528,0.498,0.436339,1
2454,20456,78.374962,14.607016,63.767946,13.750759,4.440331,3.433948,26.649950,11.769479,10.705211,...,36.2,36.4,30.037100,60.5,54.7,57.394501,0.380,0.483,0.365307,0


In [10]:
# change string columns into numericals, using doomies function
#df = pd.get_dummies(df,columns=["gdo_rezsoc00","gdo_rezsoc05","gdo_rezsoc10"],dtype="float")
df

Unnamed: 0,pobtot_ajustada,pobreza,pobreza_e,pobreza_m,vul_car,vul_ing,npnv,ic_rezedu,ic_asalud,ic_cv,...,pobreza_cap_90,pobreza_cap_00,pobreza_cap_10,pobreza_patrim_90,pobreza_patrim_00,pobreza_patrim_10,gini_90,gini_00,gini_10,secure_live
0,794304,30.531104,2.264478,28.266627,27.983320,8.419106,33.066469,14.970553,24.034493,4.721187,...,20.4,12.7,18.474600,43.4,33.7,41.900398,0.473,0.425,0.422628,1
1,48592,67.111172,8.040704,59.070468,22.439389,5.557604,4.891835,21.222712,15.514032,11.062247,...,39.9,29.0,30.980801,64.2,48.9,59.175800,0.379,0.533,0.343879,0
2,53104,61.360527,7.241238,54.119289,29.428583,2.921336,6.289554,27.361207,20.812551,9.880071,...,39.5,33.1,28.259199,63.9,57.9,56.504902,0.414,0.465,0.386781,0
3,14101,52.800458,4.769001,48.031458,27.128568,7.709276,12.361698,20.889023,14.071657,11.443449,...,35.2,21.0,22.386101,59.7,40.1,51.164501,0.392,0.541,0.344984,0
4,101379,45.338512,6.084037,39.254475,26.262912,8.279864,20.118712,20.578144,16.567818,12.936542,...,36.6,22.6,22.139999,60.6,42.2,45.703899,0.391,0.469,0.458083,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2451,21016,74.848837,12.301183,62.547654,19.229856,3.177689,2.743618,27.350040,36.056322,5.679670,...,51.8,54.8,41.368999,73.5,70.9,70.859596,0.403,0.589,0.342037,0
2452,27385,65.450191,10.203506,55.246687,23.623556,5.007426,5.918827,29.914879,53.313420,2.857021,...,34.2,25.9,20.563601,57.8,44.1,46.659199,0.422,0.463,0.362527,0
2453,117528,29.541959,3.535624,26.006335,16.644262,8.828019,44.985759,11.936088,18.316528,2.417258,...,15.7,20.7,12.115300,36.6,41.8,32.302700,0.528,0.498,0.436339,1
2454,20456,78.374962,14.607016,63.767946,13.750759,4.440331,3.433948,26.649950,11.769479,10.705211,...,36.2,36.4,30.037100,60.5,54.7,57.394501,0.380,0.483,0.365307,0


In [11]:
#Deviding the dataframe, observations and target
X = df.drop(columns=['secure_live'])
Y = df['secure_live']

#Splitting betwwen train and validation df
X_train, X_val = train_test_split(X,test_size=0.2,random_state=42)
y_train, y_val = train_test_split(Y,test_size=0.2,random_state=42)

In [12]:
columnas_no_numericas = []

for columna in X_val.columns:
    try:
        pd.to_numeric(df[columna])
    except (ValueError, TypeError):
        # Si la conversión falla, la columna no es numérica
        columnas_no_numericas.append(columna)

if columnas_no_numericas:
    print("Se encontraron columnas no numéricas en el DataFrame:")
    print(columnas_no_numericas)
else:
    print("Todas las columnas son numéricas.")


Se encontraron columnas no numéricas en el DataFrame:
['gdo_rezsoc00', 'gdo_rezsoc05', 'gdo_rezsoc10']


In [13]:

def euclidean_distance(x1, x2):
    distance = np.sqrt(np.sum((x1 - x2)**2))
    return distance

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions) #predictions

    def _predict(self, x):
        # compute the distance
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # get the closest k
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # majority voye
        most_common = Counter(k_nearest_labels).most_common()
        return most_common[0][0]

In [14]:
columnas_numericas = df.select_dtypes(include=['int', 'float','object']).columns
df[columnas_numericas] = df[columnas_numericas].astype(float)
col= df.select_dtypes(include=['str']).columns
col

ValueError: could not convert string to float: 'Muy bajo'

In [None]:
# Initialize the KNN classifier with the value K wanted
knn = KNN(k=5)

# Fit the KNN classifier with the data
knn.fit(X_train, y_train)

# Make a prediction for the input data
prediction = knn.predict(X_val)

print(prediction)
# Show the predicted group for the input data
if prediction[0] == 1:
    print("The input data is predicted to belong to the 'Buy' group.")
else:
    print("The input data is predicted to belong to the 'Do Not Buy' group.")

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
#KNN Implementation using scikit-learn
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train.values, y_train.values)
neigh.predict_proba(X_val.values)

AttributeError: 'Flags' object has no attribute 'c_contiguous'