
# Predecir si el cliente subscribira un deposito

## Problema

Hay un dataset que contiene informacion de una campaña de mercadeo de una institucion bancaria portuguesa. La base de datos se genero por llamadas telefonicas, que en ocasiones fueron necesarias varias llamadas por cliente. Para poder definir si se haria o no el deposito. 

In [1]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 

# data (as pandas dataframes) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 

# metadata 
print(bank_marketing.metadata) 

# variable information 
print(bank_marketing.variables) 

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [2]:
y

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no
...,...
45206,yes
45207,yes
45208,yes
45209,no


In [3]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,


In [4]:
X.dtypes

age             int64
job            object
marital        object
education      object
default        object
balance         int64
housing        object
loan           object
contact        object
day_of_week     int64
month          object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
dtype: object

In [5]:
## dummy variables
import pandas as pd

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

pd.get_dummies(X[categorical_columns]).astype(int)

## NO guardamos el orden de las columnas
## NO sabemos que pasa con columnas nuevas en test

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
45207,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
45208,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
45209,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
X[categorical_columns]

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,management,married,tertiary,no,yes,no,,may,
1,technician,single,secondary,no,yes,no,,may,
2,entrepreneur,married,secondary,no,yes,yes,,may,
3,blue-collar,married,,no,yes,no,,may,
4,,single,,no,no,no,,may,
...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,
45207,retired,divorced,primary,no,no,no,cellular,nov,
45208,retired,married,secondary,no,no,no,cellular,nov,success
45209,blue-collar,married,secondary,no,no,no,telephone,nov,


In [7]:
### Para evitar estos limitantes usamos Encoders de Sklearn

### Sklearn tiene un encoder que se llama OneHotEncoder: Funciona con columnas categóricas como dummies
### También esta OrdinalEncoder: Funciona con columnas ordinales aquí se usan enteros para representar las categorías y se respeta el orden.
### LabelEncoder: Funciona con columnas objetivo. Como usamos una variable objetivo binaria, no es necesario usarlo.

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

ohencoder = OneHotEncoder(sparse_output=True) ## Instancio el objeto OneHotEncoder
ohencoder.fit(X[categorical_columns])  ## Ajusto el objeto OneHotEncoder a los datos
ohencoder.get_feature_names_out() ## Muestra las columnas que se crearon
ohencoder.transform(X[categorical_columns])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 406899 stored elements and shape (45211, 44)>

In [8]:
ohencoder.get_feature_names_out()

array(['job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student',
       'job_technician', 'job_unemployed', 'job_nan', 'marital_divorced',
       'marital_married', 'marital_single', 'education_primary',
       'education_secondary', 'education_tertiary', 'education_nan',
       'default_no', 'default_yes', 'housing_no', 'housing_yes',
       'loan_no', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'contact_nan', 'month_apr', 'month_aug', 'month_dec', 'month_feb',
       'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'poutcome_failure',
       'poutcome_other', 'poutcome_success', 'poutcome_nan'], dtype=object)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#categoricas
X_train_cat = ohencoder.transform(X_train[categorical_columns])
X_test_cat = ohencoder.transform(X_test[categorical_columns])

#numericas
X_train_num = X_train[numerical_columns].values
X_test_num = X_test[numerical_columns].values

#unir las bases para una completa de testeo y una de entrenamiento
X_train = pd.concat([pd.DataFrame(X_train_cat.toarray()), pd.DataFrame(X_train_num)], axis=1)

X_test = pd.concat([pd.DataFrame(X_test_cat.toarray()), pd.DataFrame(X_test_num)], axis=1)

In [10]:
X_train.columns = ohencoder.get_feature_names_out().tolist() + numerical_columns.tolist()
X_test.columns = ohencoder.get_feature_names_out().tolist() + numerical_columns.tolist()

X_train

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_other,poutcome_success,poutcome_nan,age,balance,day_of_week,duration,campaign,pdays,previous
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,41,849,15,72,1,-1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,49,1415,30,269,2,-1,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,42,3842,31,130,4,-1,0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,37,-119,11,375,11,-1,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,56,3498,15,264,2,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36163,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,44,1059,18,2093,1,-1,0
36164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,23,508,8,210,1,92,1
36165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,34,1317,15,239,1,-1,0
36166,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,33,165,7,111,1,-1,0


In [11]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
y_pred = model.predict(X_test)


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy_score(y_test, y_pred)


0.9000331748313612

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          no       0.92      0.98      0.94      7952
         yes       0.66      0.35      0.46      1091

    accuracy                           0.90      9043
   macro avg       0.79      0.66      0.70      9043
weighted avg       0.89      0.90      0.89      9043



In [14]:
### Armamos un Pipeline para no tener que perder sparse cuando usamos onehotencoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


preprocesador = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
    ('cat', OneHotEncoder(sparse_output=True,handle_unknown='ignore'), categorical_columns)
    ])
preprocesador

In [15]:
X.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


In [16]:
preprocesador.fit(X)

In [17]:
preprocesador.transform(X)

array([[5.800e+01, 2.143e+03, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [4.400e+01, 2.900e+01, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.300e+01, 2.000e+00, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [7.200e+01, 5.715e+03, 1.700e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [5.700e+01, 6.680e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.700e+01, 2.971e+03, 1.700e+01, ..., 1.000e+00, 0.000e+00,
        0.000e+00]])

In [18]:
preprocesador.transform(X)

array([[5.800e+01, 2.143e+03, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [4.400e+01, 2.900e+01, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.300e+01, 2.000e+00, 5.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [7.200e+01, 5.715e+03, 1.700e+01, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [5.700e+01, 6.680e+02, 1.700e+01, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.700e+01, 2.971e+03, 1.700e+01, ..., 1.000e+00, 0.000e+00,
        0.000e+00]])

## Creamos el pipeline




In [19]:
mi_primer_pipeline=Pipeline(steps=[('preprocesador', preprocesador), 
                ('modelo', LogisticRegression(max_iter=1000))])

mi_primer_pipeline

In [20]:
### Entrenamos el modelo con el pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

mi_primer_pipeline.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
X.loc[1]

age                    44
job            technician
marital            single
education       secondary
default                no
balance                29
housing               yes
loan                   no
contact               NaN
day_of_week             5
month                 may
duration              151
campaign                1
pdays                  -1
previous                0
poutcome              NaN
Name: 1, dtype: object

In [22]:
X['education'].unique()

array(['tertiary', 'secondary', nan, 'primary'], dtype=object)

In [24]:
testeo_prueba={'age': [39], 'job': ['technician'], 'marital': ['single'], 'education': ['secondary'], 'default': ['no'], 'balance': [29],'housing': ['yes'],'loan':['no'],'contact':['NaN'],'day_of_week':[5],'month':['may'],'duration':[151],'campaign':[1],'pdays':[-1],'previous':[0],'poutcome':['NaN']}

testeo_prueba=pd.DataFrame(testeo_prueba)
testeo_prueba

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,39,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,


In [25]:
preprocesador.transform(testeo_prueba)

array([[ 39.,  29.,   5., 151.,   1.,  -1.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   1.,
          0.,   1.,   0.,   0.,   1.,   0.,   0.,   1.,   1.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.]])

In [26]:
model.predict(preprocesador.transform(testeo_prueba))



array(['no'], dtype=object)

In [27]:
mi_primer_pipeline.predict_proba(testeo_prueba)

array([[0.89292304, 0.10707696]])

In [28]:
## Ahora con un arbol de decisión

from sklearn.tree import DecisionTreeClassifier

mi_segundo_pipeline=Pipeline(steps=[('preprocesador', preprocesador), 
                ('modelo', DecisionTreeClassifier(max_depth=5))])

mi_segundo_pipeline

In [29]:
X_test

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
3776,40,blue-collar,married,secondary,no,580,yes,no,,16,may,192,1,-1,0,
9928,47,services,single,secondary,no,3644,no,no,,9,jun,83,2,-1,0,
33409,25,student,single,tertiary,no,538,yes,no,cellular,20,apr,226,1,-1,0,
31885,42,management,married,tertiary,no,1773,no,no,cellular,9,apr,311,1,336,1,failure
15738,56,management,married,tertiary,no,217,no,yes,cellular,21,jul,121,2,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9016,46,technician,single,tertiary,no,2800,no,no,,5,jun,47,1,-1,0,
380,38,blue-collar,married,secondary,no,757,yes,no,,6,may,133,1,-1,0,
7713,41,admin.,married,secondary,no,4539,no,no,,30,may,298,3,-1,0,
12188,41,student,married,secondary,no,1309,no,no,,20,jun,28,4,-1,0,


In [30]:
y_test

Unnamed: 0,y
3776,no
9928,no
33409,no
31885,no
15738,no
...,...
9016,no
380,no
7713,no
12188,no


In [31]:
mi_segundo_pipeline.fit(X_train, y_train)

y_pred=mi_segundo_pipeline.predict(X_test)

accuracy_score(y_test, y_pred)

0.901061633736361

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          no       0.92      0.98      0.95     11966
         yes       0.65      0.35      0.45      1598

    accuracy                           0.90     13564
   macro avg       0.78      0.66      0.70     13564
weighted avg       0.89      0.90      0.89     13564



In [33]:
mi_segundo_pipeline.predict_proba(testeo_prueba)

array([[0.97885609, 0.02114391]])

In [34]:
testeo_prueba

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,39,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,


In [35]:
mi_segundo_pipeline.predict(testeo_prueba)

array(['no'], dtype=object)

In [36]:
import joblib

joblib.dump(mi_primer_pipeline, '../Datos/mi_primer_pipeline.pkl')

joblib.dump(mi_segundo_pipeline, '../Datos/mi_segundo_pipeline.pkl')

['../Datos/mi_segundo_pipeline.pkl']