## <font color='blue'>Spark MLLib - Classificação - Random Forest</font>

## Classificar clientes de acordo com a possibilidade de pagar ou não o crédito

In [1]:
# Imports
import math
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Spark Session - usada quando se trabalha com Dataframes no Spark
spSession = SparkSession.builder.master("local").appName("SparkMLLib").getOrCreate()

In [3]:
# Carregando os dados e gerando um RDD
bankRDD = sc.textFile("data/bank.csv")

In [4]:
bankRDD.cache()

data/bank.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [5]:
bankRDD.count()

542

In [6]:
bankRDD.take(5)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"',
 '35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"yes"',
 '30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"yes"']

In [7]:
# Removendo a primeira linha do arquivo (cabeçalho)
firstLine = bankRDD.first()
bankRDD2 = bankRDD.filter(lambda x: x != firstLine)
bankRDD2.count()

541

In [8]:
bankRDD.take(10)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"',
 '35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"yes"',
 '30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"yes"',
 '59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"',
 '35;"management";"single";"tertiary";"no";747;"no";"no";"cellular";23;"feb";141;2;176;3;"failure";"yes"',
 '36;"self-employed";"married";"tertiary";"no";307;"yes";"no";"cellular";14;"may";341;1;330;2;"other";"yes"',
 '39;"technician";"married";"secondary";"no";147;"yes";"no";"cellular";6;"may";151;2;-1;0;"

In [9]:
# RDD sem o cabeçalho
bankRDD2.take(2)

['30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"']

In [10]:
type(bankRDD2)

pyspark.rdd.PipelinedRDD

###### Verificando os dados Distintos de cada uma das colunas do RDD para realizar a limpeza dos dados

In [11]:
# Transforma o RDD sem cabeçalho em uma Lista
List_Bank = bankRDD2.map(lambda l: l.split(";")).collect()

In [12]:
# Transforma a lista criada em um Dataframe do Pandas, para melhor visualização dos dados
import pandas as pd
dfteste = pd.DataFrame(List_Bank, columns=['age','job','marital','education','default','balance','housing','loan','contact',
                                       'day','month','duration','campaign','pdays','previous','poutcome','y'])

In [13]:
# Retorna um Dictionary dos dados distintos onde a Key é o nome das colunas e o Values é um array de valores distintos
dictn={}
for colname in dfteste:
    dictn[colname]=dfteste[colname].unique()
dictn

{'age': array(['30', '33', '35', '59', '36', '39', '41', '43', '20', '31', '40',
        '56', '37', '25', '38', '42', '44', '26', '55', '67', '53', '68',
        '32', '49', '78', '23', '52', '34', '61', '45', '48', '57', '54',
        '63', '51', '29', '50', '27', '60', '28', '21', '58', '22', '46',
        '24', '77', '75', '47', '70', '65', '64', '62', '66', '19'],
       dtype=object),
 'job': array(['"unemployed"', '"services"', '"management"', '"blue-collar"',
        '"self-employed"', '"technician"', '"entrepreneur"', '"admin."',
        '"student"', '"housemaid"', '"retired"', '"unknown"'], dtype=object),
 'marital': array(['"married"', '"single"', '"divorced"'], dtype=object),
 'education': array(['"primary"', '"secondary"', '"tertiary"', '"unknown"'],
       dtype=object),
 'default': array(['"no"', '"yes"'], dtype=object),
 'balance': array(['1787', '4789', '1350', '1476', '0', '747', '307', '147', '221',
        '-88', '9374', '264', '1109', '502', '360', '194', '4073', '

## Limpeza dos Dados

In [14]:
# Transformando os dados categóricos (Strings) para valores numéricos
# Separando todos dados categóricos em novas colunas, para que possuam o mesmo "peso" durante a análise de correlação futura
def transformToNumeric( inputStr) :
    
    attList = inputStr.replace("\"","").split(";")
    
    age = float(attList[0])    
    
    #Job
    unemployed_job = 1 if attList[1] == "unemployed" else 0
    services_job = 1 if attList[1] == "services" else 0
    management_job = 1 if attList[1] == "management" else 0
    bluecollar_job = 1 if attList[1] == "blue-collar" else 0
    selfemployed_job = 1 if attList[1] == "self-employed" else 0
    technician_job = 1 if attList[1] == "technician" else 0
    entrepreneur_job = 1 if attList[1] == "entrepreneur" else 0
    admin_job = 1 if attList[1] == "admin." else 0
    student_job = 1 if attList[1] == "student" else 0
    housemaid_job = 1 if attList[1] == "housemaid" else 0
    retired_job = 1 if attList[1] == "retired" else 0
    unknown_job = 1 if attList[1] == "unknown" else 0
            
    # Marital
    single = 1.0 if attList[2] == "single" else 0.0
    married = 1.0 if attList[2] == "married" else 0.0
    divorced = 1.0 if attList[2] == "divorced" else 0.0
            
    # Education
    primary = 1.0 if attList[3] == "primary" else 0.0
    secondary = 1.0 if attList[3] == "secondary" else 0.0
    tertiary = 1.0 if attList[3] == "tertiary" else 0.0
    unknown_education = 1.0 if attList[3] == 'unknown' else 0.0
    
    default = 0.0 if attList[4] == "no" else 1.0
    balance = float(attList[5])
    
    housing = 0 if attList[6] == "no" else 1
    
    loan = 0.0 if attList[7] == "no" else 1.0
    
    campaign = int(attList[12])
    previous = int(attList[14])
    
    outcome = 0.0 if attList[16] == "no" else 1.0
    
    # Cria as linhas com os objetos transformados
    linhas = Row(OUTCOME = outcome, 
                 AGE = age, 
                 
                 UNEMPLOYED_JOB = unemployed_job, SERVICE_JOB = services_job, MANAGEMENT_JOB = management_job,
                 BLUECOLLAR_JOB = bluecollar_job, SELFEMPLOYED_JOB = selfemployed_job, TECHNICIAN_JOB = technician_job,
                 ENTREPRENEUR_JOB = entrepreneur_job, ADMIN_JOB = admin_job, STUDENT_JOB = student_job, 
                 HOUSEMAID_JOB = housemaid_job, RETIRED_JOB = retired_job, UNKNOWN_JOB = unknown_job,
                 
                 SINGLE = single, MARRIED = married, DIVORCED = divorced,
                 
                 PRIMARY = primary, SECONDARY = secondary, TERTIARY = tertiary, UNKNOWN_EDU = unknown_education,
                 
                 DEFAULT = default, BALANCE = balance, HOUSING = housing, LOAN = loan, CAMPAING = campaign, PREVIOUS = previous) 
    return linhas

In [15]:
# Aplicando a função de limpeza ao conjunto de dados
bankRDD3 = bankRDD2.map(transformToNumeric)

In [16]:
# [:5] -> Slicing para retornar os 5 primeiros registros
bankRDD3.collect()[:5]

[Row(ADMIN_JOB=0, AGE=30.0, BALANCE=1787.0, BLUECOLLAR_JOB=0, CAMPAING=1, DEFAULT=0.0, DIVORCED=0.0, ENTREPRENEUR_JOB=0, HOUSEMAID_JOB=0, HOUSING=0, LOAN=0.0, MANAGEMENT_JOB=0, MARRIED=1.0, OUTCOME=0.0, PREVIOUS=0, PRIMARY=1.0, RETIRED_JOB=0, SECONDARY=0.0, SELFEMPLOYED_JOB=0, SERVICE_JOB=0, SINGLE=0.0, STUDENT_JOB=0, TECHNICIAN_JOB=0, TERTIARY=0.0, UNEMPLOYED_JOB=1, UNKNOWN_EDU=0.0, UNKNOWN_JOB=0),
 Row(ADMIN_JOB=0, AGE=33.0, BALANCE=4789.0, BLUECOLLAR_JOB=0, CAMPAING=1, DEFAULT=0.0, DIVORCED=0.0, ENTREPRENEUR_JOB=0, HOUSEMAID_JOB=0, HOUSING=1, LOAN=1.0, MANAGEMENT_JOB=0, MARRIED=1.0, OUTCOME=1.0, PREVIOUS=4, PRIMARY=0.0, RETIRED_JOB=0, SECONDARY=1.0, SELFEMPLOYED_JOB=0, SERVICE_JOB=1, SINGLE=0.0, STUDENT_JOB=0, TECHNICIAN_JOB=0, TERTIARY=0.0, UNEMPLOYED_JOB=0, UNKNOWN_EDU=0.0, UNKNOWN_JOB=0),
 Row(ADMIN_JOB=0, AGE=35.0, BALANCE=1350.0, BLUECOLLAR_JOB=0, CAMPAING=1, DEFAULT=0.0, DIVORCED=0.0, ENTREPRENEUR_JOB=0, HOUSEMAID_JOB=0, HOUSING=1, LOAN=0.0, MANAGEMENT_JOB=1, MARRIED=0.0, OUTC

## Análise Exploratória de Dados

In [17]:
# Transforma para Dataframe com a limpeza dos dados realizadas.
bankDF = spSession.createDataFrame(bankRDD3)

In [18]:
# Estatística descritiva
bankDF.describe().show()

+-------+-------------------+------------------+------------------+-------------------+-----------------+--------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+--------------------+------------------+--------------------+-------------------+------------------+--------------------+-------------------+------------------+--------------------+--------------------+--------------------+
|summary|          ADMIN_JOB|               AGE|           BALANCE|     BLUECOLLAR_JOB|         CAMPAING|             DEFAULT|           DIVORCED|   ENTREPRENEUR_JOB|      HOUSEMAID_JOB|           HOUSING|               LOAN|     MANAGEMENT_JOB|           MARRIED|            OUTCOME|          PREVIOUS|           PRIMARY|         RETIRED_JOB|         SECONDARY|    SELFEMPLOYED_JOB|        SERVICE_JOB|            SINGLE|         STUDENT_JOB|     TECHNI

In [19]:
# Correlação entre as variáveis
for i in bankDF.columns:
    if not( isinstance(bankDF.select(i).take(1)[0][0], str)) :
        print( "Correlação da variável OUTCOME com", i, bankDF.stat.corr('OUTCOME',i))

Correlação da variável OUTCOME com ADMIN_JOB 0.06509288927958984
Correlação da variável OUTCOME com AGE -0.1823210432736525
Correlação da variável OUTCOME com BALANCE 0.03657486611997681
Correlação da variável OUTCOME com BLUECOLLAR_JOB -0.09458950823484219
Correlação da variável OUTCOME com CAMPAING -0.06764280984247219
Correlação da variável OUTCOME com DEFAULT -0.04536965206737378
Correlação da variável OUTCOME com DIVORCED -0.07812659940926987
Correlação da variável OUTCOME com ENTREPRENEUR_JOB -0.003231193491671572
Correlação da variável OUTCOME com HOUSEMAID_JOB -0.03670638351707513
Correlação da variável OUTCOME com HOUSING 0.05032284653513401
Correlação da variável OUTCOME com LOAN -0.030420586112717318
Correlação da variável OUTCOME com MANAGEMENT_JOB -0.008182879282413407
Correlação da variável OUTCOME com MARRIED -0.3753241299133561
Correlação da variável OUTCOME com OUTCOME 1.0
Correlação da variável OUTCOME com PREVIOUS 0.41675833565578474
Correlação da variável OUTCOME co

Analisando a correlação entre os dados, podemos observar que informações abaixo NÃO possuem uma Correlação significativa em relação a variável "OUTCOME" (que representa se o cliente pagou o créditou ou não):
- ADMIN_JOB, BLUECOLLAR_JOB, ENTREPRENEUR_JOB, HOUSEMAID_JOB, MANAGEMENT_JOB, RETIRED_JOB, SELFEMPLOYED_JOB, SERVICE_JOB,
- STUDENT_JOB, TECHNICIAN_JOB, UNEMPLOYED_JOB, UNKNOWN_JOB, CAMPAING, DEFAULT  

## Pré-Processamento dos Dados
Apache Spark Requer que os dados estejam em um padrão de dados, em um formato de Vetores, seja denso ou sparso.

In [20]:
# Criando um LabeledPoint (target, Vector[features])
def transformaVar(row) :
    obj = (row["OUTCOME"], Vectors.dense([row["AGE"], row["BALANCE"], row["DIVORCED"], row["MARRIED"], row["SINGLE"],
                                          row["LOAN"], row["HOUSING"], row["PREVIOUS"], 
                                          row["PRIMARY"], row["SECONDARY"], row["TERTIARY"]
                                         ]
                                        )
          )
    return obj

In [21]:
# Transforma o Dataframe em RDD para ser possível aplicar a operação Map utilizando a função criada
bankRDD4 = bankDF.rdd.map(transformaVar)

In [22]:
bankRDD4.collect()

[(0.0,
  DenseVector([30.0, 1787.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0])),
 (1.0,
  DenseVector([33.0, 4789.0, 0.0, 1.0, 0.0, 1.0, 1.0, 4.0, 0.0, 1.0, 0.0])),
 (1.0,
  DenseVector([35.0, 1350.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0])),
 (1.0,
  DenseVector([30.0, 1476.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0, DenseVector([59.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0])),
 (1.0,
  DenseVector([35.0, 747.0, 0.0, 0.0, 1.0, 0.0, 0.0, 3.0, 0.0, 0.0, 1.0])),
 (1.0,
  DenseVector([36.0, 307.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0])),
 (0.0,
  DenseVector([39.0, 147.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0])),
 (0.0,
  DenseVector([41.0, 221.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0,
  DenseVector([43.0, -88.0, 0.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 0.0, 0.0])),
 (0.0,
  DenseVector([39.0, 9374.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0])),
 (0.0,
  DenseVector([43.0, 264.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0])),
 (0

In [23]:
# Transforma de volta o RDD em Dataframe do Spark
bankDF = spSession.createDataFrame(bankRDD4,["label", "features"])
bankDF.select("features", "label").show(10) # bankDF.show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[30.0,1787.0,0.0,...|  0.0|
|[33.0,4789.0,0.0,...|  1.0|
|[35.0,1350.0,0.0,...|  1.0|
|[30.0,1476.0,0.0,...|  1.0|
|[59.0,0.0,0.0,1.0...|  0.0|
|[35.0,747.0,0.0,0...|  1.0|
|[36.0,307.0,0.0,1...|  1.0|
|[39.0,147.0,0.0,1...|  0.0|
|[41.0,221.0,0.0,1...|  0.0|
|[43.0,-88.0,0.0,1...|  1.0|
+--------------------+-----+
only showing top 10 rows



In [24]:
# Aplicando Redução de Dimensionalidade com PCA
# PCA é bastante utilizado quando existe muitas variáveis que dificulta o treinamento do algoritmo.

# Reduzindo as N variáveis para 3 componentes, que possuem um grupo de varíaveis.
bankPCA = PCA(k = 3, inputCol = "features", outputCol = "pcaFeatures")
# Criando o modelo
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label","pcaFeatures")
pcaResult.show(truncate = False) # Opção Truncate não corta o resultado, apresentando a célula inteira

+-----+------------------------------------------------------------+
|label|pcaFeatures                                                 |
+-----+------------------------------------------------------------+
|0.0  |[-1787.0188962546333,28.8599549493223,0.15377467234126085]  |
|1.0  |[-4789.020058802154,29.892705418583432,4.242267949451632]   |
|1.0  |[-1350.022165592906,34.085055842132235,1.286915395184415]   |
|1.0  |[-1476.0189270653755,29.039815776091352,0.19406025913088729]|
|0.0  |[-0.03786531127040594,58.97706863483137,0.31263203714582266]|
|1.0  |[-747.0222684181689,34.473283897278,3.259124618016967]      |
|1.0  |[-307.0229990984655,35.77975048225049,2.213743543645403]    |
|0.0  |[-147.02498820998787,38.88946403100969,0.20468912524276459] |
|0.0  |[-221.026274544068,40.84189033615178,0.24281920951824973]   |
|1.0  |[87.97245667221671,43.04230063637094,2.1883241112601373]    |
|0.0  |[-9374.023076730495,32.962130252532184,0.3809793434578735]  |
|0.0  |[-264.02753330179553,42.812

In [25]:
# Indexação é pré-requisito para Decision Trees com apache Spark
# RandomForest é um algoritmo de Decision Tree

# Utiliza a coluna "label" para criar uma nova coluna Indexada
stringIndexer = StringIndexer(inputCol = "label", outputCol = "indexed")
si_model = stringIndexer.fit(pcaResult)
obj_final = si_model.transform(pcaResult)
obj_final.collect()

[Row(label=0.0, pcaFeatures=DenseVector([-1787.0189, 28.86, 0.1538]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-4789.0201, 29.8927, 4.2423]), indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1350.0222, 34.0851, 1.2869]), indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1476.0189, 29.0398, 0.1941]), indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-0.0379, 58.9771, 0.3126]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-747.0223, 34.4733, 3.2591]), indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-307.023, 35.7798, 2.2137]), indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-147.025, 38.8895, 0.2047]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-221.0263, 40.8419, 0.2428]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([87.9725, 43.0423, 2.1883]), indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-9374.0231, 32.9621, 0.381]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-264.0275, 42.8129, 0.2291])

## Machine Learning

In [32]:
# Dados de Treino e de Teste
(dados_treino, dados_teste) = obj_final.randomSplit([0.7, 0.3])

In [33]:
dados_treino.count()

387

In [34]:
dados_teste.count()

154

In [36]:
# Criando o modelo
# O tipo de dado esperado no paramêtro "featuresCol" é um Vector
rfClassifer = RandomForestClassifier(labelCol = "indexed", featuresCol = "pcaFeatures")
modelo = rfClassifer.fit(dados_treino)

In [39]:
# Previsões com dados de teste
predictions = modelo.transform(dados_teste)

# Seleciona os campos necessários
# prediction: Previsão feita pelo modelo
# indexed e label são os target (dado que o label possui apenas 2 valores [0 e 1])
# pcaFeatures representa os componentes utilizados durante a criação do modelo
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()

[Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-5887.0296, 44.2092, 0.3766])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-3762.0275, 41.5756, 0.3138])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-3571.025, 37.6997, 0.288])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-3222.034, 51.9138, 0.3812])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-3064.0302, 46.0125, 0.3103])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-2980.0264, 40.078, 0.2622])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1981.0227, 34.7218, 0.2099])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1890.0195, 29.7672, 0.216])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1811.0266, 40.8142, 0.2754])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1808.0189, 28.82

In [40]:
# Avaliando a acurácia

# Informa que a Coluna "prediction" representa a previsão feita pelo modelo,
# que a coluna "indexed" representa a coluna target que queriamos prever,
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "indexed", metricName = "accuracy")
# Aplica a classificação criada ao conjunto de dados predictions
evaluator.evaluate(predictions) 

# Existem várias formas de melhorar a acurácia do modelo criado: Ajustes de # dos PCA, Incluir ou remover variáveis... 

0.8441558441558441

In [42]:
# Confusion Matrix
predictions.groupBy("indexed", "prediction").count().show()

+-------+----------+-----+
|indexed|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   44|
|    0.0|       1.0|    6|
|    1.0|       0.0|   18|
|    0.0|       0.0|   86|
+-------+----------+-----+

