In [31]:
# librerías
import os
import pickle
import pandas as pd
import gower
from sklearn.svm import SVC
from sklearn.model_selection import  GridSearchCV, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.manifold import MDS
import gc




In [32]:
# directorios
dataset_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(''))), 'datasets')
image_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(''))), 'images')


### Dataset A: primer experimento con NMDS

In [33]:
# dataset a (edades categorizadas)

dataset_a= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_v2.xlsx'))

In [34]:
# categorizar edad en dataset a


def categoria_edad (x):
    if (x >= 0) and (x <= 11) :
        return 'Niñez'
    elif (x >= 12) and (x <=18):
        return 'Adolescencia'
    elif (x >= 19) and (x <=30):
        return 'Juventud'
    elif (x>=31) and (x<=65) :
        return 'Vejez'
    elif x>=66:
        return 'Vejez mayor'
    else:
        return 'NS/NC'


dataset_a['victima_edad_cat'] = \
dataset_a.victima_edad.apply(categoria_edad)
dataset_a['llamante_edad_cat'] = \
dataset_a.llamante_edad.apply(categoria_edad)

In [35]:
# drop columnas sin usar

dataset_a.drop(['victima_edad', 'llamante_edad'], axis=1, inplace=True) 

In [36]:
# reemplazar nsnc por na
dataset_a.loc[:, 'victima_convive_agresor'] = dataset_a['victima_convive_agresor'].replace({'NS/NC': pd.NA})


In [37]:
# mapear SI NO a 1 0
dataset_a['victima_convive_agresor'] = dataset_a['victima_convive_agresor'].map({'SI': 1, 'NO':0})

In [38]:
dataset_a.to_excel("/Users/vcolombo/Documents/tp especializacion/linea_137_llamados_vs/datasets/xlsx/llamados_dataset_a_nmds.xlsx", index=False)


In [None]:
dataset_a= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_dataset_a_nmds.xlsx'))

In [39]:
# chequear proporciones previas
print('prop. SI:', len(dataset_a[dataset_a['victima_convive_agresor']==1])/len(dataset_a)*100)
print('prop. NO:',len(dataset_a[dataset_a['victima_convive_agresor']==0])/len(dataset_a)*100)

prop. SI: 14.39690748576503
prop. NO: 64.40474324818472


In [40]:
# separar features de target
X = dataset_a.drop(['victima_convive_agresor'], axis=1)
y_previo = dataset_a['victima_convive_agresor']

In [41]:
# guardar los índices de casos NSNC (vacíos)
nsnc_indices = y_previo[y_previo.isna()].index

In [42]:
# Remove "NSNC" rows from the target
y = y_previo.drop(nsnc_indices)

In [None]:
# gower de X
gower_X = gower.gower_matrix(X)
print("gower para dataset_a hecho")

lista_dimensiones=[2,3,4,5,6,7]
lista_modelos=[]
contador_exp= 0
for i in lista_dimensiones:
    print('n_components: ', i)
    # correr NMDS sobre el total del dataset
    nmds = MDS(n_components=i ,metric=False, dissimilarity='precomputed', max_iter=300, random_state=0, normalized_stress=True) 
    X_nmds = nmds.fit_transform(gower_X)
    print('Fit transform hecho')

    # crear el test final con lo que corresponde a target de X transformado 
    test_final = X_nmds[nsnc_indices.tolist(), :]
    print('Test final hecho')
    test_final_df_a = pd.DataFrame(test_final, columns=[f'Component_{i}' for i in range(test_final.shape[1])])


    print('Stress: ' + str(round(nmds.stress_,2)))
    # quitar el test final 
    X_nmds_clean = pd.DataFrame(X_nmds).drop(nsnc_indices)


    print('entrenando el modelo svm')
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=2)
    for train_index, test_index in sss.split(X_nmds_clean, y):
        X_train, X_test = X_nmds_clean.iloc[train_index], X_nmds_clean.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Parámetros de gridsearch
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto']
    }


    svm = SVC(class_weight='balanced')


    # GridSearchCV
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)

    # Mejor modelo
    best_model = grid_search.best_estimator_

    # Aplicar al test set
    y_pred = best_model.predict(X_test)

  # Evaluate the model
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation f1: ", grid_search.best_score_)
    print("Test set f1: ", f1_score(y_test, y_pred, average='weighted'))
    print("Classification report:\n", classification_report(y_test, y_pred, zero_division=1))
    

    # por cada modelo me guardo: n_components usado, best parameters, best cross validation acc, test set acc, classification report

    contador_exp+=1

    gc.collect()


In [14]:
with open('lista_modelos_a.pkl', 'wb') as fp:
    pickle.dump(lista_modelos, fp)

In [16]:
lista_modelos_a = pd.read_pickle("lista_modelos_a.pkl")  

In [15]:
del X_nmds, X_nmds_clean, dataset_a, X,y

### Elegir el mejor n_components y entrenar el modelo con los mejores params de gridsearch para aplicar a test final

In [23]:
X = dataset_a.drop(['victima_convive_agresor'], axis=1)
y_previo = dataset_a['victima_convive_agresor']

In [24]:
nsnc_indices = y_previo[y_previo.isna()].index

In [25]:
y = y_previo.drop(nsnc_indices)

In [26]:
# gower de X
gower_X = gower.gower_matrix(X)

# correr NMDS sobre el total del dataset
nmds = MDS(n_components=7 ,metric=False, dissimilarity='precomputed', max_iter=300, random_state=0, normalized_stress=True) 
X_nmds = nmds.fit_transform(gower_X)

# crear el test final con lo que corresponde a target de X transformado 
test_final = X_nmds[nsnc_indices.tolist(), :]
print('Test final hecho')
test_final_df_a = pd.DataFrame(test_final, columns=[f'Component_{i}' for i in range(test_final.shape[1])])

# quitar el test final 
X_nmds_clean = pd.DataFrame(X_nmds).drop(nsnc_indices)



Test final hecho


In [27]:
#Create a svm Classifier
svm_a = SVC(C=0.1, kernel='poly', degree=3, gamma='auto', class_weight='balanced',random_state=3)

#Train the model using the training sets
svm_a.fit(X_nmds_clean, y)

#Predict the response for test dataset
na_predictions_a = svm_a.predict(test_final)

In [28]:
# Add the predictions to the na_convive_df DataFrame
test_final_df_a['victima_convive_agresor_pred'] = na_predictions_a

In [29]:
test_final_df_a.victima_convive_agresor_pred.value_counts()

victima_convive_agresor_pred
0.0    4058
Name: count, dtype: int64

In [30]:
with open('bets_svm_a.pkl', 'wb') as fp:
    pickle.dump(svm_a, fp)

### Dataset B: segundo experimento con NMDS

In [3]:
llamados= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_v2.xlsx'))

In [4]:
llamados.drop(['llamante_edad'], axis=1, inplace=True) 

In [5]:
dataset_b = llamados[~llamados['victima_edad'].isnull()]

In [6]:
dataset_b.loc[:, 'victima_convive_agresor'] = dataset_b['victima_convive_agresor'].replace({'NS/NC': pd.NA})


In [7]:
dataset_b['victima_convive_agresor'] = dataset_b['victima_convive_agresor'].map({'SI': 1, 'NO': 0})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_b['victima_convive_agresor'] = dataset_b['victima_convive_agresor'].map({'SI': 1, 'NO': 0})


In [None]:
dataset_b.to_excel("/Users/vcolombo/Documents/tp especializacion/linea_137_llamados_vs/datasets/xlsx/llamados_dataset_b_nmds.xlsx", index=False)


In [9]:
del dataset_b

In [None]:
dataset_b= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_dataset_b_nmds.xlsx'))

In [11]:
# separar features de target
X = dataset_b.drop(['victima_convive_agresor'], axis=1)
y_previo = dataset_b['victima_convive_agresor']

In [12]:
# guardar los índices de casos NSNC (vacíos)
nsnc_indices = y_previo[y_previo.isna()].index

In [13]:
# Remove "NSNC" rows from the target
y = y_previo.drop(nsnc_indices)

In [None]:
# gower de X
gower_X = gower.gower_matrix(X)
print("gower para dataset_a hecho")

lista_dimensiones=[2,3,4,5,6,7]
lista_modelos=[]
contador_exp= 0
for i in lista_dimensiones:
    print('n_components: ', i)
    # correr NMDS sobre el total del dataset
    nmds = MDS(n_components=i ,metric=False, dissimilarity='precomputed', max_iter=300, random_state=0, normalized_stress=True) 
    X_nmds = nmds.fit_transform(gower_X)
    print('Fit transform hecho')

    # crear el test final con lo que corresponde a target de X transformado 
    test_final = X_nmds[nsnc_indices.tolist(), :]
    print('Test final hecho')
    test_final_df_b = pd.DataFrame(test_final, columns=[f'Component_{i}' for i in range(test_final.shape[1])])


    print('Stress: ' + str(round(nmds.stress_,2)))
    # quitar el test final 
    X_nmds_clean = pd.DataFrame(X_nmds).drop(nsnc_indices)


    print('entrenando el modelo svm')
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=2)
    for train_index, test_index in sss.split(X_nmds_clean, y):
        X_train, X_test = X_nmds_clean.iloc[train_index], X_nmds_clean.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Parámetros de gridsearch
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto']
    }


    svm = SVC(class_weight='balanced')


    # GridSearchCV
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)

    # Mejor modelo
    best_model = grid_search.best_estimator_

    # Aplicar al test set
    y_pred = best_model.predict(X_test)
 
  # Evaluate the model
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation f1: ", grid_search.best_score_)
    print("Test set f1: ", f1_score(y_test, y_pred, average='weighted'))
    print("Classification report:\n", classification_report(y_test, y_pred, zero_division=1))

    # por cada modelo me guardo: n_components usado, best parameters, best cross validation acc, test set acc, classification report

    contador_exp+=1
    

    gc.collect()


gower para dataset_a hecho
n_components:  2
Fit transform hecho
Test final hecho
Stress: 0.3
entrenando el modelo svm
Classification report:
               precision    recall  f1-score   support

         0.0       0.82      1.00      0.90      2334
         1.0       1.00      0.00      0.00       524

    accuracy                           0.82      2858
   macro avg       0.91      0.50      0.45      2858
weighted avg       0.85      0.82      0.73      2858

n_components:  3
Fit transform hecho
Test final hecho
Stress: 0.25
entrenando el modelo svm
Classification report:
               precision    recall  f1-score   support

         0.0       0.82      1.00      0.90      2334
         1.0       1.00      0.00      0.00       524

    accuracy                           0.82      2858
   macro avg       0.91      0.50      0.45      2858
weighted avg       0.85      0.82      0.73      2858

n_components:  4
Fit transform hecho
Test final hecho
Stress: 0.21
entrenando el modelo 

In [16]:
with open('lista_modelos_b.pkl', 'wb') as fp:
    pickle.dump(lista_modelos, fp)

In [17]:
# elegir el mejor modelo, entrenar con esos parámetros, y testear sobre test final.

del X_nmds, X_nmds_clean, dataset_b

In [18]:
dataset_b= pd.read_excel(os.path.join(dataset_dir, 'xlsx/llamados_dataset_b.xlsx'))

In [19]:
# separar features de target
X = dataset_b.drop(['victima_convive_agresor'], axis=1)
y_previo = dataset_b['victima_convive_agresor']

In [20]:
# guardar los índices de casos NSNC (vacíos)
nsnc_indices = y_previo[y_previo.isna()].index

In [21]:
# sacar las filas del test final
y = y_previo.drop(nsnc_indices)

In [22]:
# gower de X
gower_X = gower.gower_matrix(X)

# correr NMDS sobre el total del dataset
nmds = MDS(n_components=7 ,metric=False, dissimilarity='precomputed', max_iter=300, random_state=0, normalized_stress=True) 
X_nmds = nmds.fit_transform(gower_X)

# crear el test final con lo que corresponde a target de X transformado 
test_final = X_nmds[nsnc_indices.tolist(), :]
print('Test final hecho')
test_final_df_b= pd.DataFrame(test_final, columns=[f'Component_{i}' for i in range(test_final.shape[1])])

# quitar el test final 
X_nmds_clean = pd.DataFrame(X_nmds).drop(nsnc_indices)



Test final hecho


Mejores parámetros: n_components=7
SVM: 'kernel': 'poly', 'C': 0.1, and 'gamma': 'auto'.


In [24]:
# SVM

#X_train, X_test, y_train, y_test = train_test_split(X_nmds_clean, y, test_size=0.3,random_state=109) # 70% training and 30% test


svm = SVC(C=0.1, kernel='poly', degree=3, gamma='auto', class_weight='balanced',random_state=3)


#Create a svm Classifier
svm_b = SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
svm_b.fit(X_nmds_clean, y)

#Predict the response for test dataset
na_predictions_b = svm_b.predict(test_final)

In [25]:
# Add the predictions to the na_convive_df DataFrame
test_final_df_b['victima_convive_agresor_pred'] = na_predictions_b

In [26]:
test_final_df_b.victima_convive_agresor_pred.value_counts()

victima_convive_agresor_pred
0.0    2977
Name: count, dtype: int64

In [27]:
with open('bets_svm_b.pkl', 'wb') as fp:
    pickle.dump(svm_b, fp)

Si bien hay diferencias en los valores de stress cuando varío los n_components, que mejoran a medida que los n_componenets suben, la variación de n_components no produce mejoras en la performance de los modelos de SVM, de hecho no parecería estar afectando el entrenamiento de ninguna manera ya que las métricas de accuracy y etc. de los mejores modelos dan los mismo variando los n_components.