In [1]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from preprocesamiento import feature_engineering
from preprocesamiento import obtenerDFTraining
from preprocesamiento import preprocesar_data_frame

RANDOM_STATE = 19 * 103785

In [2]:
parametros = {'n_estimators': [10000], 'criterion' : ['entropy'], 'max_depth':range(2,5), 
              "max_features": ["auto"],
              'random_state':[RANDOM_STATE]} 

random_forest = RandomForestClassifier()
clf = GridSearchCV(random_forest, parametros, n_jobs = -1, verbose = 4, scoring = "roc_auc")

df = obtenerDFTraining()
(X, y) = preprocesar_data_frame(df)
X1 = pd.get_dummies(X[['ganancia_perdida_declarada_bolsa_argentina','trabajo',
                       'estado_marital','genero',"edad"]])

X1['ganancia_perdida_declarada_bolsa_argentina'] = X['ganancia_perdida_declarada_bolsa_argentina'].apply(lambda x: np.tanh(x) )
X1 = X1.drop(['trabajo_No contesto'],axis = 1)
X1 = X1.drop(['genero_mujer'],axis = 1)
X1 = X1.drop(['estado_marital_divorciado'],axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.25, random_state=RANDOM_STATE)

clf.fit(X_train, y_train)
print(classification_report(y_test,clf.predict(X_test)))
clf.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:  6.7min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  9.2min finished


              precision    recall  f1-score   support

           0       0.80      1.00      0.89      6196
           1       0.99      0.22      0.37      1936

    accuracy                           0.81      8132
   macro avg       0.90      0.61      0.63      8132
weighted avg       0.85      0.81      0.77      8132



{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 10000,
 'random_state': 1971915}

In [7]:
def preprocesar_df_min_max_scaler(X : pd.DataFrame):
    X = pd.get_dummies(X)
    scaler = MinMaxScaler()
    scaler.fit(X)
    return scaler.transform(X)

def preprocesar_df_pca(df , dim):
    #df = feature_engineering(df)
    (X, Y) = preprocesar_data_frame(df)
    X = preprocesar_df_min_max_scaler(X)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=RANDOM_STATE)
    
    pca = PCA(dim)
    x_train_pca = pd.DataFrame(pca.fit_transform(x_train))
    x_test_pca = pca.transform(x_test)
    
    return (x_train_pca,x_test_pca,y_train,y_test)



parametros = {'n_estimators': [10000], 'criterion' : ['entropy'], 'max_depth':range(3,10), 
              "max_features": ["auto"],
              'random_state':[RANDOM_STATE]} 

random_forest = RandomForestClassifier()
clf = GridSearchCV(random_forest, parametros, n_jobs = -1, verbose = 4,scoring = 'recall')

df = obtenerDFTraining()
x_train, x_test, y_train, y_test  = preprocesar_df_pca(df,48)

clf.fit(x_train, y_train)
print(classification_report(y_test,clf.predict(x_test)))
clf.best_params_

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 84.9min
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed: 211.3min finished


              precision    recall  f1-score   support

           0       0.86      0.95      0.90      6196
           1       0.75      0.52      0.61      1936

    accuracy                           0.84      8132
   macro avg       0.81      0.73      0.76      8132
weighted avg       0.84      0.84      0.83      8132



{'criterion': 'entropy',
 'max_depth': 9,
 'max_features': 'auto',
 'n_estimators': 10000,
 'random_state': 1971915}

In [8]:
parametros = {'n_estimators': [10000], 'criterion' : ['entropy'], 'max_depth':range(13,16), 
              "max_features": ["auto"],
              'random_state':[RANDOM_STATE]} 

random_forest = RandomForestClassifier()
clf_1 = GridSearchCV(random_forest, parametros, n_jobs = -1, verbose = 4,scoring = 'recall')


clf_1.fit(x_train, y_train)
print(classification_report(y_test,clf_1.predict(x_test)))
clf_1.best_params_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 95.7min remaining: 23.9min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 122.9min finished


              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6196
           1       0.70      0.58      0.64      1936

    accuracy                           0.84      8132
   macro avg       0.79      0.75      0.77      8132
weighted avg       0.83      0.84      0.84      8132



{'criterion': 'entropy',
 'max_depth': 9,
 'max_features': 'auto',
 'n_estimators': 10000,
 'random_state': 1971915}

In [20]:
clf_1.best_params_


df = obtenerDFTraining()
x_train, x_test, y_train, y_test  = preprocesar_df_pca(df,48)

rf = RandomForestClassifier(n_estimators = 1000,criterion = "entropy", max_depth = 16, n_jobs = -1, verbose = 1 ,random_state = RANDOM_STATE)
rf.fit(x_train, y_train)
print(classification_report(y_test,rf.predict(x_test)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   50.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.4s


              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6196
           1       0.70      0.58      0.63      1936

    accuracy                           0.84      8132
   macro avg       0.79      0.75      0.77      8132
weighted avg       0.83      0.84      0.83      8132



[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.5s finished
