# SVM

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn import svm

from preprocesamiento import feature_engineering
from preprocesamiento import obtenerDFHoldout, obtenerDFTraining
from preprocesamiento import preprocesar_data_frame

from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Seed handed to the train/test splits and to the SVC parameter grids below
# so that repeated runs use the same partition and estimator seed.
# NOTE(review): the factors 19 and 103785 are not explained anywhere in this
# notebook — confirm their origin.
RANDOM_STATE = 19 * 103785

## Funciones de Preprocesamiento

In [2]:
def preprocesar_df_min_max_scaler(X : pd.DataFrame):
    """One-hot encode ``X`` and min-max scale every resulting column.

    The scaler is fitted on the same data it transforms, using the default
    ``MinMaxScaler`` settings.

    Parameters
    ----------
    X : pd.DataFrame
        Feature table; it is passed through ``pd.get_dummies`` before scaling.

    Returns
    -------
    The scaled data exactly as returned by the ``MinMaxScaler``.
    """
    encoded = pd.get_dummies(X)
    # fit_transform is fit() followed by transform() on the same input.
    return MinMaxScaler().fit_transform(encoded)

def preprocesar_df_pca(df , dim):
    """Build a scaled, PCA-reduced train/test split from a raw data frame.

    Pipeline: ``preprocesar_data_frame`` -> one-hot encoding -> 75/25 split
    -> min-max scaling -> PCA.

    Both the scaler and the PCA are fitted on the training partition only and
    then applied to the test partition, so no statistic computed from the
    test rows leaks into the fitted transformers.

    Parameters
    ----------
    df :
        Data frame accepted by ``preprocesar_data_frame``.
    dim :
        Forwarded to ``PCA`` as its first argument (``n_components``).

    Returns
    -------
    tuple
        ``(x_train_pca, x_test_pca, y_train, y_test)``. The training
        projection is wrapped in a ``pd.DataFrame``; the test projection is
        returned as produced by ``PCA.transform``.
    """
    (X, Y) = preprocesar_data_frame(df)

    # Encode before splitting so both partitions share the same dummy columns.
    X = pd.get_dummies(X)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=RANDOM_STATE)

    # Learn min/max from the training rows only; the test rows are merely
    # transformed with those training statistics.
    scaler = MinMaxScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # Seed the PCA: its automatic solver selection may use a randomized SVD,
    # which would otherwise make the projection differ between runs.
    pca = PCA(dim, random_state=RANDOM_STATE)
    x_train_pca = pd.DataFrame(pca.fit_transform(x_train))
    x_test_pca = pca.transform(x_test)

    return (x_train_pca,x_test_pca,y_train,y_test)


In [3]:
# Load the training data and split it into features and target.
# NOTE(review): the return value of feature_engineering is discarded, so this
# call only matters if the helper mutates df in place — confirm.
df = obtenerDFTraining()
feature_engineering(df)
(X, Y) = preprocesar_data_frame(df)

# Preprocessing: keep a hand-picked subset of columns and let pd.get_dummies
# one-hot encode whichever of them are categorical.
columnas_seleccionadas = [
    'estado_marital',
    'genero',
    'edad',
    'anios_estudiados',
    'ganancia_perdida_declarada_bolsa_argentina',
]
X = pd.get_dummies(X[columnas_seleccionadas])

# Squash the declared gain/loss value into (-1, 1) with tanh, element by element.
columna_bolsa = 'ganancia_perdida_declarada_bolsa_argentina'
X[columna_bolsa] = X[columna_bolsa].apply(lambda valor: np.tanh(valor))

# Remove one dummy column from each of the two encoded variables.
X = X.drop(columns=['genero_mujer', 'estado_marital_divorciado'])

# NOTE(review): these four names are reassigned by the "Kernel POLY" cell
# (via preprocesar_df_pca) before any model reads them — confirm this split
# is still needed.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=RANDOM_STATE
)

## Kernel POLY

In [4]:
# Grid search for a polynomial-kernel SVC: 21 values of C (100..500 step 20)
# crossed with degrees 2..10, i.e. 189 candidates.
parametros_poly = {
    'kernel': ['poly'],
    'C': range(100, 501, 20),
    'degree': range(2, 11),
    'random_state': [RANDOM_STATE],
}

# Candidates are ranked by F1 using 2-fold cross-validation on all cores.
svc = SVC()
clf_kernel_poly = GridSearchCV(
    estimator=svc,
    param_grid=parametros_poly,
    scoring='f1',
    cv=2,
    n_jobs=-1,
    verbose=4,
)

# Fresh copy of the training data, reduced to 48 PCA components.
df = obtenerDFTraining()
x_train, x_test, y_train, y_test = preprocesar_df_pca(df, 48)

clf_kernel_poly.fit(x_train, y_train)

# Report on the held-out 25% and show the winning hyper-parameters.
print(classification_report(y_test, clf_kernel_poly.predict(x_test)))
clf_kernel_poly.best_params_

Fitting 2 folds for each of 189 candidates, totalling 378 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 36.6min
[Parallel(n_jobs=-1)]: Done 378 out of 378 | elapsed: 71.1min finished


              precision    recall  f1-score   support

           0       0.87      0.91      0.89      6196
           1       0.66      0.57      0.61      1936

    accuracy                           0.83      8132
   macro avg       0.77      0.74      0.75      8132
weighted avg       0.82      0.83      0.82      8132



{'C': 140, 'degree': 3, 'kernel': 'poly', 'random_state': 1971915}

## Kernel RBF

In [5]:
# Grid search for an RBF-kernel SVC: 41 values of C (100..500 step 10)
# crossed with both gamma heuristics, i.e. 82 candidates.
parametros_rbf = {
    'kernel': ['rbf'],
    'C': range(100, 501, 10),
    'gamma': ['scale', 'auto'],
    'random_state': [RANDOM_STATE],
}

# Candidates are ranked by F1 using 2-fold cross-validation on all cores.
svc = SVC()
clf_kernel_rbf = GridSearchCV(
    estimator=svc,
    param_grid=parametros_rbf,
    scoring='f1',
    cv=2,
    n_jobs=-1,
    verbose=4,
)

# Fresh copy of the training data, reduced to 48 PCA components.
df = obtenerDFTraining()
x_train, x_test, y_train, y_test = preprocesar_df_pca(df, 48)

clf_kernel_rbf.fit(x_train, y_train)

# Report on the held-out 25% and show the winning hyper-parameters.
print(classification_report(y_test, clf_kernel_rbf.predict(x_test)))
clf_kernel_rbf.best_params_

Fitting 2 folds for each of 82 candidates, totalling 164 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 164 out of 164 | elapsed: 21.9min finished


              precision    recall  f1-score   support

           0       0.87      0.92      0.90      6196
           1       0.70      0.55      0.62      1936

    accuracy                           0.84      8132
   macro avg       0.78      0.74      0.76      8132
weighted avg       0.83      0.84      0.83      8132



{'C': 350, 'gamma': 'auto', 'kernel': 'rbf', 'random_state': 1971915}

In [6]:
# Train one RBF-kernel SVC with fixed hyper-parameters on the 48-component
# PCA projection and report its metrics on the held-out split.
df = obtenerDFTraining()
x_train, x_test, y_train, y_test = preprocesar_df_pca(df, 48)

# NOTE(review): C=270 is hard-coded, while the stored grid-search output in
# this notebook lists C=350 as the best value — confirm 270 is intentional.
# RANDOM_STATE evaluates to 1971915, the seed used by the grid searches.
a = SVC(C=270, kernel='rbf', gamma='auto', random_state=RANDOM_STATE).fit(x_train, y_train)

print(classification_report(y_test, a.predict(x_test)))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      6196
           1       0.70      0.56      0.62      1936

    accuracy                           0.84      8132
   macro avg       0.78      0.74      0.76      8132
weighted avg       0.83      0.84      0.83      8132

