# EDA

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the bank marketing campaign data; fields in the file are separated by ";".
# NOTE(review): the path is absolute and tied to one workspace layout.
data_path = "/workspaces/logistic-regression-project-tutorial-pilarzarco/data/bank-marketing-campaign-data.csv"
bank_df = pd.read_csv(data_path, sep=";")
bank_df.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
bank_df.shape

In [None]:
# Per-column summary of the frame (dtypes and non-null counts).
bank_df.info()

In [None]:
# Remove duplicated rows and renumber the index from 0 in a single call.
bank_df = bank_df.drop_duplicates(ignore_index=True)
bank_df.head()

In [None]:
# Count how many times "nonexistent" appears in the 'poutcome' column to decide
# whether the column is worth keeping.
# The membership guard makes the cell safe to re-run: once the column has been
# dropped, a second run would otherwise fail with a KeyError.
if "poutcome" in bank_df.columns:
    non_existent = (bank_df["poutcome"] == "nonexistent").sum()
    # Only the last expression of a cell is rendered, so the count is printed
    # explicitly instead of being left as a bare expression mid-cell.
    print(f'"nonexistent" values in poutcome: {non_existent} of {len(bank_df)} rows')

    # The column is dropped because it does not provide enough information.
    bank_df = bank_df.drop(columns=["poutcome"])

bank_df.head()

#### - Análisis de variables univariante

In [None]:
# Univariate analysis of the variables: histograms for the first six columns.
import seaborn as sns
import matplotlib.pyplot as plt

fig, axis = plt.subplots(2, 3, figsize=(10, 7))

# One entry per panel, in row-major order:
# (column, bar colour, blank the x tick labels?, remove the y-axis label?)
panels = [
    ("age", "yellow", False, False),
    ("job", "green", True, False),
    ("marital", "pink", False, True),
    ("education", "blue", True, False),
    ("default", "purple", False, True),
    ("housing", "red", False, True),
]

for ax, (column, colour, blank_xticks, drop_ylabel) in zip(axis.flat, panels):
    sns.histplot(ax=ax, data=bank_df, x=column, color=colour)
    if blank_xticks:
        ax.set_xticklabels([])
    if drop_ylabel:
        ax.set(ylabel=None)

plt.tight_layout()
plt.show()

In [None]:
# Histograms for the next six columns, laid out on a 2x3 grid.
fig, axis = plt.subplots(2, 3, figsize=(10, 7))

# One entry per panel, in row-major order:
# (column, bar colour, blank the x tick labels?, remove the y-axis label?)
panels = [
    ("loan", "yellow", False, False),
    ("contact", "green", False, True),
    ("month", "pink", True, False),
    ("day_of_week", "blue", False, False),
    ("duration", "purple", False, True),
    ("campaign", "red", False, True),
]

for ax, (column, colour, blank_xticks, drop_ylabel) in zip(axis.flat, panels):
    sns.histplot(ax=ax, data=bank_df, x=column, color=colour)
    if blank_xticks:
        ax.set_xticklabels([])
    if drop_ylabel:
        ax.set(ylabel=None)

plt.tight_layout()
plt.show()

In [None]:
# Histograms for six more columns (including the target "y") on a 2x3 grid.
fig, axis = plt.subplots(2, 3, figsize=(10, 7))

# One entry per panel, in row-major order: (column, bar colour, remove the y-axis label?)
panels = [
    ("pdays", "yellow", False),
    ("previous", "green", True),
    ("y", "pink", True),
    ("emp.var.rate", "blue", False),
    ("cons.price.idx", "purple", True),
    ("cons.conf.idx", "red", True),
]

for ax, (column, colour, drop_ylabel) in zip(axis.flat, panels):
    sns.histplot(ax=ax, data=bank_df, x=column, color=colour)
    if drop_ylabel:
        ax.set(ylabel=None)

plt.tight_layout()
plt.show()

In [None]:
# Histograms for the last two columns, stacked vertically.
fig, axis = plt.subplots(2, figsize=(10, 7))

for ax, column, colour in zip(axis, ("euribor3m", "nr.employed"), ("yellow", "green")):
    sns.histplot(ax=ax, data=bank_df, x=column, color=colour)

# Only the lower panel has its y-axis label removed.
axis[1].set(ylabel=None)

plt.tight_layout()
plt.show()

#### - Análisis de variables multivariante

In [None]:
# Categorical-categorical analysis: counts of one categorical column split by another.
fig, axis = plt.subplots(2, 2, figsize = (15, 7))

# (x column, hue column) for each panel, in row-major order.
count_pairs = [
    ("housing", "marital"),
    ("loan", "marital"),
    ("default", "marital"),
    ("loan", "housing"),
]

for ax, (x_col, hue_col) in zip(axis.flat, count_pairs):
    sns.countplot(ax = ax, data = bank_df, x = x_col, hue = hue_col)

# Finish the figure like the other plotting cells do: without this the cell ends
# on a countplot call and echoes the returned Axes object as its output.
plt.tight_layout()
plt.show()


In [None]:
# Numeric-numeric analysis: for each pair of columns, a regression plot with the
# pair's correlation matrix drawn directly underneath it.
fig, axis = plt.subplots(4, 2, figsize = (15, 15))

numeric_pairs = [
    ("cons.price.idx", "cons.conf.idx"),
    ("emp.var.rate", "nr.employed"),
    ("euribor3m", "emp.var.rate"),
    ("euribor3m", "nr.employed"),
]

for position, (x_col, y_col) in enumerate(numeric_pairs):
    # Pairs fill the grid two per band; each band spans two rows (plot, then heatmap).
    plot_row = 2 * (position // 2)
    grid_col = position % 2
    sns.regplot(ax = axis[plot_row, grid_col], data = bank_df, x = x_col, y = y_col)
    sns.heatmap(
        bank_df[[x_col, y_col]].corr(),
        annot = True,
        fmt = ".2f",
        ax = axis[plot_row + 1, grid_col],
        cbar = False,
    )

plt.tight_layout()
plt.show()


In [None]:
# Relate several predictors: age by category, split by a second categorical column.
fig, axis = plt.subplots(1, 2, figsize = (10, 5))

# (x column, hue column) for the left and right panels.
bar_specs = (("marital", "housing"), ("loan", "default"))

for ax, (x_col, hue_col) in zip(axis, bar_specs):
    sns.barplot(ax = ax, data = bank_df, x = x_col, y = "age", hue = hue_col)

plt.tight_layout()
plt.show()

In [None]:
# Factorize the categorical variables: each gets an integer-coded "<name>_n" column.
categorical_columns = ["job", "marital", "education", "default", "housing", "loan",
                       "contact", "month", "day_of_week", "y"]

for column in categorical_columns:
    codes, _ = pd.factorize(bank_df[column])
    bank_df[column + "_n"] = codes

# Columns treated as numeric from here on: the encoded predictors, the numeric
# predictors, and the encoded target last.
encoded_predictors = [column + "_n" for column in categorical_columns if column != "y"]
numeric_predictors = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
                      "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
num_variables_n = encoded_predictors + numeric_predictors + ["y_n"]

In [None]:
# Full correlation map
fig, axis = plt.subplots(figsize=(20, 15))

# Reuse num_variables_n instead of repeating the same column list literally, so
# the heatmap cannot drift out of sync with the columns used for scaling later.
# The target Axes is passed explicitly rather than relying on the current axes.
sns.heatmap(bank_df[num_variables_n].corr(), annot=True, cmap="cividis", ax=axis)

plt.show()


* El mapa de calor muestra, en general, correlaciones débiles: hay pocas relaciones fuertes entre variables.
* Podemos decir que hay correlación positiva fuerte entre:
    * La 'tasa de Euribor' y la 'tasa de variación del empleo'.
    * La 'tasa de Euribor' y el 'número de empleados'.
    * La 'tasa de variación del empleo' y 'número de empleados'.
    * El 'Número de contactos realizados durante la campaña anterior al cliente' y el 'Resultado de la campaña de marketing anterior'.
    * La 'tasa de variación del empleo' y el 'Índice de precios al consumidor'.
* Analizamos la correlación de 'Y', tenemos correlaciones débiles y/o generales:
    * Débil con 'Duración del contacto previo en segundos'
    * General con:
        * 'Tipo de comunicación de contacto'
        * 'Último mes en el que se le ha contactado'
        * 'Número de contactos realizados durante la campaña anterior al cliente'
        * 'Resultado de la campaña de marketing anterior'

#### - Análisis de valores atípicos

In [41]:
# Descriptive analysis
# NOTE(review): the saved output of this cell lists a "columna" column that no
# visible cell creates — re-run from a fresh kernel to confirm the output is current.
bank_df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,marital_n,education_n,default_n,housing_n,loan_n,contact_n,month_n,day_of_week_n,y_n,columna
count,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,...,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0
mean,40.0238,258.315815,2.275476,962.46481,0.173013,0.081922,93.57572,-40.502863,3.621293,5167.03487,...,0.510637,3.27737,0.208908,0.571959,0.199825,0.634714,2.249951,1.9796,0.112663,2.567879
std,10.42068,259.305321,1.550606,186.937102,0.494964,1.570883,0.578839,4.62786,1.734437,72.251364,...,0.696794,2.185558,0.406713,0.541216,0.456055,0.481516,2.429157,1.411539,0.316184,2.770318
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0,...,0.0,3.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,2.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,...,1.0,6.0,0.0,1.0,0.0,1.0,3.0,3.0,0.0,3.0
max,98.0,4918.0,6.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1,...,3.0,7.0,2.0,2.0,2.0,1.0,9.0,4.0,1.0,56.0


In [None]:
# Outlier analysis: one boxplot per numeric column on a 4x3 grid.
fig, axis = plt.subplots(4, 3, figsize = (15, 10))

boxplot_columns = ["age", "duration", "campaign", "pdays", "previous", "emp.var.rate",
                   "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]

all_axes = axis.ravel()
for ax, column in zip(all_axes, boxplot_columns):
    sns.boxplot(ax = ax, data = bank_df, y = column)

plt.tight_layout()

# Ten columns on a twelve-panel grid: remove the two panels left empty.
for spare_ax in all_axes[len(boxplot_columns):]:
    fig.delaxes(spare_ax)

plt.show()



In [None]:
# Missing-value analysis: number of null entries per column.
# The count is the last expression of the cell so that it is what gets rendered;
# a trailing `bank_df` would discard it and show the whole frame instead.
bank_df.isnull().sum()

#### - Dividir en Train y Test

In [None]:
from sklearn.model_selection import train_test_split

# Predictors: drop the encoded target (y_n) and also the original target column
# (y). Removing only y_n would leave the answer inside X as a predictor.
X = bank_df.drop(columns = ["y", "y_n"])
y = bank_df["y_n"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 71)

X_train.head()

In [None]:
# First rows of the test predictors.
X_test.head()

#### - Escalado de variables

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column listed in num_variables_n.
# NOTE(review): the scaler is fitted on the full dataset (no train/test
# separation) and the target y_n is scaled together with the predictors.
scaler = MinMaxScaler()
scaler.fit(bank_df[num_variables_n])
scal_features = scaler.transform(bank_df[num_variables_n])

# Wrap the scaled array back into a frame that keeps the original index and column names.
bank_df_scal = pd.DataFrame(data = scal_features, columns = num_variables_n, index = bank_df.index)
bank_df_scal.head()

#### - Selección de características

In [None]:
# Feature selection
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Start from the unscaled columns so that the scaling statistics are learned from
# the training rows only. Scaling the full dataset before splitting lets the test
# rows influence the min/max applied to the training data (data leakage).
X = bank_df[num_variables_n].drop(columns = "y_n")
# The target is a class label and is left unscaled.
y = bank_df["y_n"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

# Fit the min-max scaler on the training split and apply that same transform to both splits.
split_scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(split_scaler.transform(X_train), index = X_train.index, columns = X_train.columns)
X_test = pd.DataFrame(split_scaler.transform(X_test), index = X_test.index, columns = X_test.columns)

# chi2 requires non-negative inputs; it is fitted on the training split only,
# which lies in [0, 1] after scaling.
selection_model = SelectKBest(chi2, k = 5)
selection_model.fit(X_train, y_train)

# Boolean mask of the selected columns, used to label the reduced frames.
ix = selection_model.get_support()
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = X_test.columns.values[ix])

X_train_sel.head()

In [None]:
# First rows of the test set restricted to the selected features.
X_test_sel.head()

In [None]:
# Save the train and test sets: attach the target to each reduced frame, then write it to CSV.
# Labels are attached by position because the reduced frames carry a fresh 0..n-1 index.
exports = (
    (X_train_sel, y_train, "/workspaces/logistic-regression-project-tutorial-pilarzarco/interini/clean_train.csv"),
    (X_test_sel, y_test, "/workspaces/logistic-regression-project-tutorial-pilarzarco/interini/clean_test.csv"),
)

for frame, labels, destination in exports:
    frame["y_n"] = list(labels)
    frame.to_csv(destination, index = False)