<h1> Projeto da disciplina Tópicos Avançados em SI 6 (Ciência dos Dados)</h1>
<h4> Centro de Informática - Universidade Federal de Pernambuco (CIn - UFPE) </h4>
<h4> Professor: Fernando Neto </h4>
<h4> Equipe: Márcio de Aquino, Vanessa Vieira </h4>
<br>
<h2> Base de dados: <a href=https://archive.ics.uci.edu/ml/datasets/Bank+Marketing> Bank Marketing </a> </h2>

## Hipóteses:
- É possível identificar e classificar um grupo mais provável de aceitar o depósito a prazo
- Há uma relação entre os grupos que aceitam o depósito e a situação sócio-econômica do período
- Há uma relação que engloba o intervalo entre ligações e a resposta
- O tempo de duração da ligação está relacionado com a resposta
- O fato do cliente ter algum empréstimo influencia na resposta


## Pré-processamento

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb

from sklearn import preprocessing
from sklearn.cluster import KMeans

%matplotlib inline

In [2]:
phone_calls = pd.read_csv('./bank-additional/bank-additional-full.csv')

In [3]:
phone_calls.isnull().sum()
#verificando se tem algum atribudo ausente

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [4]:
phone_calls.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


<h2> Convertendo atributos categóricos em binários </h2>


In [5]:
#executar SE for necessário converter o atributo de saída para binário
results = {'no': 0, 'yes': 1}
phone_calls.y = [ results[el] for el in phone_calls.y]

In [6]:
#executar SE for necessário converter o atributo de contact para binário
results = {'telephone': 0, 'cellular': 1}
phone_calls.contact = [ results[el] for el in phone_calls.contact]

In [7]:
#executar para converter o atributo categórico job em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('job')), lsuffix="job_", rsuffix="_job")

In [8]:
#executar para converter o atributo categórico marital em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('marital')), rsuffix="_marital" )

In [9]:
#executar para converter o atributo categórico default em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('default')), rsuffix="_default" )

In [10]:
#executar para converter o atributo categórico education em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('education')), rsuffix="_education" )

In [11]:
#executar para converter o atributo categórico housing em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('housing')), rsuffix="_housing" )

In [12]:
#executar para converter o atributo categórico loan em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('loan')), rsuffix="_loan" )

In [13]:
#executar para converter o atributo categórico month em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('month')), rsuffix="_month" )

In [14]:
#executar para converter o atributo categórico days_of_week em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('day_of_week')), rsuffix="_day_of_week" )

In [15]:
#executar para converter o atributo categórico poutcome em binário
phone_calls =  phone_calls.join(pd.get_dummies(phone_calls.pop('poutcome')), lsuffix="poutcome_", rsuffix="_poutcome" )

In [16]:
phone_calls.head()

Unnamed: 0,age,contact,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,oct,sep,fri,mon,thu,tue,wed,failure,nonexistent,success
0,56,0,261,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,1,0
1,57,0,149,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,1,0
2,37,0,226,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,1,0
3,40,0,151,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,1,0
4,56,0,307,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,1,0


<h2> Discretizando os dados em quantil </h2>

In [17]:
quartiles = pd.cut(phone_calls['emp.var.rate'], 4, labels=range(1,5))
phone_calls = phone_calls.assign(emp_var_rate_cat=quartiles.values)
phone_calls['emp_var_rate_cat'].value_counts()

4    23997
2    10592
3     3693
1     2906
Name: emp_var_rate_cat, dtype: int64

In [18]:
quartiles = pd.cut(phone_calls['cons.price.idx'], 4, labels=range(1,5))
phone_calls = phone_calls.assign(cons_price_idx_cat=quartiles.values)
phone_calls['cons_price_idx_cat'].value_counts()

2    18304
3    15363
4     5320
1     2201
Name: cons_price_idx_cat, dtype: int64

In [19]:
quartiles = pd.cut(phone_calls['cons.conf.idx'], 4, labels=range(1,5))
phone_calls = phone_calls.assign(cons_conf_idx_cat=quartiles.values)
phone_calls['cons_conf_idx_cat'].value_counts()

2    16209
3    14262
1     8876
4     1841
Name: cons_conf_idx_cat, dtype: int64

In [20]:
quartiles = pd.cut(phone_calls['euribor3m'], 4, labels=range(1,5))
phone_calls = phone_calls.assign(euribor3m_cat=quartiles.values)
phone_calls['euribor3m_cat'].value_counts()

4    27676
1    13430
2       68
3       14
Name: euribor3m_cat, dtype: int64

In [21]:
quartiles = pd.cut(phone_calls['nr.employed'], 4, labels=range(1,5))
phone_calls = phone_calls.assign(nr_employed_cat=quartiles.values)
phone_calls['nr_employed_cat'].value_counts()

4    27690
3     8534
1     3301
2     1663
Name: nr_employed_cat, dtype: int64

In [22]:
#phone_calls['in_debt'] = (
 #    phone_calls.apply(lambda x: 1 if (x.housing == 'yes' or x.loan  == 'yes' or x.default == 'yes') else 0 , axis=1)
  #   )
#phone_calls['in_debt'].value_counts()

In [23]:
criteria = [phone_calls['age'].between(0, 30), phone_calls['age'].between(31, 50), phone_calls['age'].between(50, 200)]
values = [1, 2, 3]

phone_calls['age_cat'] = np.select(criteria, values, 0)
phone_calls['age_cat'].value_counts()
print("chegou aqui")

chegou aqui


In [24]:
#criteria = [phone_calls['duration'].between(0, 20), phone_calls['duration'].between(21, phone_calls['duration'].mean()), phone_calls['duration'].between(phone_calls['duration'].mean(), phone_calls['duration'].max())]
#values = [1, 2, 3]

#phone_calls['duration_cat'] = np.select(criteria, values, 0)
#phone_calls['duration_cat'].value_counts()
print("aqui tambem")

aqui tambem


In [25]:
#criteria = [phone_calls['campaign'].between(0, phone_calls['campaign'].mean()), phone_calls['campaign'].between(phone_calls['campaign'].mean(), phone_calls['campaign'].max())]
#values = [1, 2]

#phone_calls['campaign_cat'] = np.select(criteria, values, 0)
#phone_calls['campaign_cat'].value_counts()

In [26]:
#pdays_without_999 = np.array([x for x in phone_calls['pdays'] if x != 999])

#criteria = [phone_calls['pdays'].between(999,999), phone_calls['pdays'].between(0, pdays_without_999.mean()), phone_calls['pdays'].between(pdays_without_999.mean(), pdays_without_999.max())]
#values = [1, 2, 3]

#phone_calls['pdays_cat'] = np.select(criteria, values, 0)
#phone_calls['pdays_cat'].value_counts()

In [27]:
#criteria = [phone_calls['previous'].between(0, phone_calls['previous'].mean()), phone_calls['previous'].between(phone_calls['previous'].mean(), phone_calls['previous'].max())]
#values = [1, 2]

#phone_calls['previous_cat'] = np.select(criteria, values, 0)
#phone_calls['previous_cat'].value_counts()

<h2> K-means </h2>

In [28]:
phone_calls.head()

Unnamed: 0,age,contact,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,wed,failure,nonexistent,success,emp_var_rate_cat,cons_price_idx_cat,cons_conf_idx_cat,euribor3m_cat,nr_employed_cat,age_cat
0,56,0,261,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,1,0,4,3,3,4,4,3
1,57,0,149,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,1,0,4,3,3,4,4,3
2,37,0,226,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,1,0,4,3,3,4,4,2
3,40,0,151,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,1,0,4,3,3,4,4,2
4,56,0,307,1,999,0,1.1,93.994,-36.4,4.857,...,0,0,1,0,4,3,3,4,4,3


In [29]:
ARR =  np.array((phone_calls.drop([ 'y' ],  axis=1))) #preparando array para o k-means, retirando todos os valores não númericos

In [30]:
kmeans = KMeans(n_clusters=2, random_state=0) #definindo a quantidade de clusters que queremos e a aleatoriedade

In [31]:
kmeans.fit(ARR) # treinamendo a base

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [32]:
kmeans.labels_ #conferindo as labels que foram criadas 

array([1, 1, 1, ..., 1, 1, 1])

In [33]:
phone_calls['K-classes'] = kmeans.labels_ #adicionando o resultado do k-means à base

In [34]:
phone_calls['K-classes'].value_counts()

1    36037
0     5151
Name: K-classes, dtype: int64

In [35]:
phone_calls['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

In [39]:
acertos = 0 
erros = 0 
c = 0

for x in phone_calls['K-classes']:
    if x  != phone_calls.y[c]: acertos+=1
    else: erros+=1
    c+=1
print("K-means:\tAcertos: ", acertos, "Erros: ", erros)
    

K-means:	Acertos:  35755 Erros:  5433


In [37]:
#sb.pairplot(phone_calls, hue='K-classes') #plotar gráficos de resultado do K-means

In [43]:
phone_calls[['y', 'K-classes']].head(50)

Unnamed: 0,y,K-classes
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
5,0,1
6,0,1
7,0,1
8,0,1
9,0,1
