In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score
import pickle
from sklearn.preprocessing import StandardScaler,MinMaxScaler 

In [38]:
df = pd.read_csv("variables.csv")
df

Unnamed: 0,puesto_ord,sex_ Male,sex_ Female,age,rango_edad,hours-per-week,educacion_superior,education-num,marital-status_Casado/a (civil),occ_ord,edad_ajustada,horas_puesto,marital-status_Nunca casado/a,income
0,2,True,False,39,2,40,1,13,False,4,507,160,True,0
1,5,True,False,50,3,13,1,13,True,15,650,195,False,0
2,2,True,False,38,2,40,0,9,False,2,342,80,False,0
3,2,True,False,53,3,40,0,7,True,2,371,80,False,0
4,2,False,True,28,1,40,1,13,True,17,364,680,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,2,False,True,27,1,38,0,12,True,7,324,266,False,0
32557,2,True,False,40,2,40,0,9,True,3,360,120,False,1
32558,2,False,True,58,4,40,0,9,False,4,522,160,False,0
32559,2,True,False,22,0,20,0,9,False,4,198,80,True,0


In [39]:
X = df.drop(columns=["income"])
y = df["income"]

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=21)

In [40]:
modelo_inicio = RandomForestClassifier(max_depth=7,min_samples_split=30)
modelo_inicio.fit(x_train,y_train)

In [41]:
pred = modelo_inicio.predict(x_test)
print(accuracy_score(y_test,pred))
print(precision_score(y_test,pred))
print(recall_score(y_test,pred))

0.8276179752277613
0.7018561484918794
0.5084033613445378


In [42]:
scores = cross_val_score( modelo_inicio,x_train,y_train,cv=10,scoring="accuracy")

In [43]:
print(scores)
scores.mean()

[0.84166667 0.84210526 0.8372093  0.84466871 0.84335235 0.83633172
 0.8490566  0.84203598 0.82272927 0.83896446]


np.float64(0.8398120328244921)

In [None]:
pipe = Pipeline(steps =[("scaler",StandardScaler()),("classifier",RandomForestClassifier())])

RF_params = {
    "scaler":[StandardScaler(),MinMaxScaler(),None],
    "classifier":[RandomForestClassifier()],
    "classifier__max_depth":np.arange(3,12),
    "classifier__min_samples_split":[10,30,50],
    "classifier__min_samples_leaf":[20,30,50],
    "classifier__n_estimators":[100,150,200]
}
LR_params = {
    "scaler":[StandardScaler(),MinMaxScaler(),None],
    "classifier":[LogisticRegression()],
    "classifier__C":[0.1,0.5,1,5]
}


busqueda = [RF_params,LR_params]

gs  = GridSearchCV(estimator=pipe,param_grid=busqueda,cv=5,verbose=2,scoring="accuracy")
gs.fit(x_train,y_train)

In [45]:
print(gs.best_score_)
print(gs.best_estimator_)
print(gs.best_params_)

0.8422691384500866
Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=np.int64(10),
                                        min_samples_leaf=20,
                                        min_samples_split=10,
                                        n_estimators=150))])
{'classifier': RandomForestClassifier(), 'classifier__max_depth': np.int64(10), 'classifier__min_samples_leaf': 20, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 150, 'scaler': MinMaxScaler()}


In [46]:
rf_h = gs.best_estimator_
rf_h.fit(x_train,y_train)
predh = rf_h.predict(x_test)
print(accuracy_score(y_test,predh))
print(precision_score(y_test,predh))
print(recall_score(y_test,predh))


0.8312007370252841
0.7054525014052838
0.5273109243697479


In [52]:
scores2 = cross_val_score( rf_h,x_train,y_train,cv=10,scoring="recall")

In [53]:
print(scores2)
scores2.mean()

[0.53663004 0.53747715 0.54029304 0.55494505 0.53479853 0.53846154
 0.55677656 0.55128205 0.49267399 0.54029304]


np.float64(0.5383630994234285)

In [54]:
filename = "modelo_viernes.pkl"
with open(filename,"wb")as archivo:
    pickle.dump(rf_h,archivo)