##Importar bibliotecas

###  1) Importar bases da entrega 2
###  2) Aplicar modelos
###  3) Metricas de avaliação
###  4) Comparação dos modelos

In [3]:
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.classification import LinearSVC, LinearSVCModel
from pyspark.sql import DataFrame
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import LinearRegressionSummary

#1. Importar bases da entrega 2

In [5]:
display(dbutils.fs.ls('/FileStore/olist/ENTREGA_2/0_bases'))

path,name,size
dbfs:/FileStore/olist/ENTREGA_2/0_bases/ABT_com_nulo.parquet/,ABT_com_nulo.parquet/,0
dbfs:/FileStore/olist/ENTREGA_2/0_bases/ABT_com_nulo_e_unique_id.parquet/,ABT_com_nulo_e_unique_id.parquet/,0
dbfs:/FileStore/olist/ENTREGA_2/0_bases/ABT_nulos_replace_zero_entrega2.parquet/,ABT_nulos_replace_zero_entrega2.parquet/,0
dbfs:/FileStore/olist/ENTREGA_2/0_bases/DataFrame_completo_para_modelo.parquet/,DataFrame_completo_para_modelo.parquet/,0
dbfs:/FileStore/olist/ENTREGA_2/0_bases/teste_entrega2.parquet/,teste_entrega2.parquet/,0
dbfs:/FileStore/olist/ENTREGA_2/0_bases/treino_entrega2.parquet/,treino_entrega2.parquet/,0


In [6]:
df_tabelao = spark.read.parquet('/FileStore/olist/ENTREGA_2/0_bases/ABT_com_nulo.parquet').cache().repartition(2)
teste = spark.read.parquet('/FileStore/olist/ENTREGA_2/0_bases/teste_entrega2.parquet/').cache().repartition(2)
treino = spark.read.parquet('/FileStore/olist/ENTREGA_2/0_bases/treino_entrega2.parquet/').cache().repartition(2)

# 2. Aplicar modelos

* Modelo linear  Support Vector Machine (classifier)
* Modelo Gradient-boosted tree classifier
* Modelo Random forest classifier
*
*

In [8]:
# # Modelo linear  Support Vector Machine (classifier)
svm = LinearSVC(labelCol='target')
modelo_svm = svm.fit(treino)

# # Modelo Gradient-boosted tree classifier
gbt = GBTClassifier(labelCol='target')
modelo_gbt = gbt.fit(treino)

# # Modelo Random forest classifier
randforest = RandomForestClassifier(labelCol='target')
modelo_randforest = randforest.fit(treino)

# # Modelo Logistic Regression
modelo_LogReg_entrega2 = LogisticRegressionModel.load('/FileStore/olist/ENTREGA_2/1_modelos/LogisticRegression_entrega2')

# # Modelo Decision Tree
modelo_DecTree_entrega2 = DecisionTreeClassificationModel.load('/FileStore/olist/ENTREGA_2/1_modelos/DecisionTreeClassifier_entrega2')

# 3. Metricas de avaliação

In [10]:
def metricas(modelo,base):
  """
  Função para retornar as metricas de avaliação do modelo.
  """
  predicao = modelo.transform(base)
  #MulticlassClassificationEvaluator param for metric name in evaluation (supports "f1" (default), "weightedPrecision", "weightedRecall", "accuracy")
  accuracy = MulticlassClassificationEvaluator(labelCol='target',metricName='accuracy')
  Precision = MulticlassClassificationEvaluator(labelCol='target',metricName='weightedPrecision')
  Recall = MulticlassClassificationEvaluator(labelCol='target',metricName='weightedRecall')
  #BinaryClassificationEvaluator param for metric name in evaluation (supports "areaUnderROC" (default), "areaUnderPR")
  UnderROC = BinaryClassificationEvaluator(labelCol='target',metricName='areaUnderROC')
  UnderPR = BinaryClassificationEvaluator(labelCol='target',metricName='areaUnderPR')
  
  acuracia = accuracy.evaluate(predicao)
  Precision = Precision.evaluate(predicao)
  Recall = Recall.evaluate(predicao)
  roc = UnderROC.evaluate(predicao)
  PR = UnderPR.evaluate(predicao)
  
  return acuracia,Precision,Recall,roc,PR

In [11]:
def criar_spark_dataframe_por_dicionario(dicionario):
  colunas = list(dicionario.keys())
  nlin = len(dicionario[colunas[0]])
  lista_nova = []
  linhas = []
  for i in range(nlin):
    for j in colunas:
      lista_nova.append(dicionario[j][i])
    print(lista_nova)
    linhas.append(tuple(lista_nova))
    lista_nova = []
  return spark.createDataFrame(linhas,colunas)

In [12]:
def df_metricas(modelos, bases, nome_bases = ['treino', 'teste']):
  bases = zip(bases,nome_bases)
  dados_modelos_bases = {'base': list(),'modelo': list(), 'acuracia': list(), 
                                'Precision': list(), 'Recall': list(), "underROC": list(), "UnderPR": list()}
  for base in bases:
    for modelo in modelos :
      #calcula as métricas
      acuracia,Precision,Recall,roc,PR = metricas(modelo,base[0])
      #ajusta o nome do modelo para compor o dataframe de resposta
      nome_modelo = str(modelo).split()[0].split("_")[0]
      dados_modelos_bases['base'].append(base[1])
      dados_modelos_bases['modelo'].append(nome_modelo)
      dados_modelos_bases['acuracia'].append(round(acuracia,5))
      dados_modelos_bases['Precision'].append(round(Precision,5))
      dados_modelos_bases['Recall'].append(round(Recall,5))
      dados_modelos_bases['underROC'].append(round(roc,5))
      dados_modelos_bases['UnderPR'].append(round(PR,5))
  return criar_spark_dataframe_por_dicionario(dados_modelos_bases)

#4. Comparação dos modelos

In [14]:
#define os modelos que vão ser avaliads
modelos = [modelo_LogReg_entrega2 ,modelo_DecTree_entrega2,modelo_svm,modelo_randforest,modelo_gbt]
#define as bases que serão avaliadas
bases = [treino,teste]
#se não for base treino e teste devemos passar mais um parâmetro com o nome das bases
nome_bases = ['treino_entrega2', 'teste_entrega2']
# df_metricas_primeiro =  df_metricas(modelos, bases, nome_bases)
df_metricas_treino_teste_inicial =  df_metricas(modelos, bases)
display(df_metricas_treino_teste_inicial)

base,modelo,acuracia,Precision,Recall,underROC,UnderPR
treino,LogisticRegressionModel:,0.82669,0.81916,0.82669,0.76114,0.58005
treino,DecisionTreeClassificationModel,0.83432,0.82242,0.83432,0.31369,0.15458
treino,LinearSVC,0.8153,0.80319,0.8153,0.7207,0.52307
treino,RandomForestClassificationModel,0.82807,0.82422,0.82807,0.75359,0.58806
treino,GBTClassificationModel,0.83713,0.82563,0.83713,0.77956,0.61932
teste,LogisticRegressionModel:,0.82461,0.81563,0.82461,0.76067,0.57653
teste,DecisionTreeClassificationModel,0.82963,0.81609,0.82963,0.31268,0.15485
teste,LinearSVC,0.81345,0.80126,0.81345,0.71321,0.51229
teste,RandomForestClassificationModel,0.8245,0.81908,0.8245,0.7535,0.5794
teste,GBTClassificationModel,0.83171,0.81869,0.83171,0.77077,0.59828


In [15]:
df_metricas_treino_teste_inicial.write.format('csv').option('header',True).mode('overwrite').option('sep',',').save('/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/Comparacao_metricas_treino_teste_inicial.csv')

In [16]:
%fs
ls /FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/

path,name,size
dbfs:/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/Comparacao_metricas_treino_teste.csv/,Comparacao_metricas_treino_teste.csv/,0
dbfs:/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/Comparacao_metricas_treino_teste_inicial.csv/,Comparacao_metricas_treino_teste_inicial.csv/,0
dbfs:/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/GBTClassifier_sem_hiperparametros/,GBTClassifier_sem_hiperparametros/,0
dbfs:/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/RandomForestClassifier_sem_hiperparametros/,RandomForestClassifier_sem_hiperparametros/,0
dbfs:/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/SVM_sem_hiperparametros/,SVM_sem_hiperparametros/,0


In [17]:
df_metricas_treino_teste_inicial = spark.read.csv('/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/Comparacao_metricas_treino_teste_inicial.csv', header=True)
#dbutils.notebook.exit("dbfs:/FileStore/olist/ENTREGA_3/0_modelos_sem_hiperparametros/Comparacao_metricas_treino_teste_inicial.csv")

In [18]:
display(df_metricas_treino_teste_inicial)

base,modelo,acuracia,Precision,Recall,underROC,UnderPR
treino,RandomForestClassificationModel,0.82807,0.82422,0.82807,0.75359,0.58806
treino,GBTClassificationModel,0.83713,0.82563,0.83713,0.77956,0.61932
teste,RandomForestClassificationModel,0.8245,0.81908,0.8245,0.7535,0.5794
teste,GBTClassificationModel,0.83171,0.81869,0.83171,0.77077,0.59828
treino,DecisionTreeClassificationModel,0.83432,0.82242,0.83432,0.31369,0.15458
teste,DecisionTreeClassificationModel,0.82963,0.81609,0.82963,0.31268,0.15485
treino,LogisticRegressionModel:,0.82669,0.81916,0.82669,0.76114,0.58005
teste,LogisticRegressionModel:,0.82461,0.81563,0.82461,0.76067,0.57653
teste,LinearSVC,0.81345,0.80126,0.81345,0.71321,0.51229
treino,LinearSVC,0.8153,0.80319,0.8153,0.7207,0.52307
