# Projeto Python IA: Inteligência Artificial e Previsões

### Case: Score de Crédito dos Clientes

Você foi contratado por um banco para conseguir definir o score de crédito dos clientes. Você precisa analisar todos os clientes do banco e, com base nessa análise, criar um modelo que consiga ler as informações do cliente e dizer automaticamente o score de crédito dele: Ruim, Ok, Bom

Arquivos da aula: https://drive.google.com/drive/folders/1FbDqVq4XLvU85VBlVIMJ73p9oOu6u2-J?usp=drive_link

### Instalando as bibliotecas necessárias

In [108]:
#!pip install scikit-learn

### Importando os dados dos clientes

In [109]:
import pandas as pd

clientes = pd.read_csv('clientes.csv')
clientes

Unnamed: 0,id_cliente,mes,idade,profissao,salario_anual,num_contas,num_cartoes,juros_emprestimo,num_emprestimos,dias_atraso,...,idade_historico_credito,investimento_mensal,comportamento_pagamento,saldo_final_mes,score_credito,emprestimo_carro,emprestimo_casa,emprestimo_pessoal,emprestimo_credito,emprestimo_estudantil
0,3392,1,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,3.0,...,265.0,21.465380,alto_gasto_pagamento_baixos,312.494089,Good,1,1,1,1,0
1,3392,2,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,3.0,...,266.0,21.465380,baixo_gasto_pagamento_alto,284.629162,Good,1,1,1,1,0
2,3392,3,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,3.0,...,267.0,21.465380,baixo_gasto_pagamento_medio,331.209863,Good,1,1,1,1,0
3,3392,4,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,5.0,...,268.0,21.465380,baixo_gasto_pagamento_baixo,223.451310,Good,1,1,1,1,0
4,3392,5,23.0,cientista,19114.12,3.0,4.0,3.0,4.0,6.0,...,269.0,21.465380,alto_gasto_pagamento_medio,341.489231,Good,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,37932,4,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,23.0,...,378.0,24.028477,alto_gasto_pagamento_alto,479.866228,Poor,1,0,0,0,1
99996,37932,5,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,18.0,...,379.0,24.028477,alto_gasto_pagamento_medio,496.651610,Poor,1,0,0,0,1
99997,37932,6,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,27.0,...,380.0,24.028477,alto_gasto_pagamento_alto,516.809083,Poor,1,0,0,0,1
99998,37932,7,25.0,mecanico,39628.99,4.0,6.0,7.0,2.0,20.0,...,381.0,24.028477,baixo_gasto_pagamento_alto,319.164979,Standard,1,0,0,0,1


In [110]:
# Coletando informações da tabela e verificando se há dados faltantes
clientes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id_cliente                100000 non-null  int64  
 1   mes                       100000 non-null  int64  
 2   idade                     100000 non-null  float64
 3   profissao                 100000 non-null  object 
 4   salario_anual             100000 non-null  float64
 5   num_contas                100000 non-null  float64
 6   num_cartoes               100000 non-null  float64
 7   juros_emprestimo          100000 non-null  float64
 8   num_emprestimos           100000 non-null  float64
 9   dias_atraso               100000 non-null  float64
 10  num_pagamentos_atrasados  100000 non-null  float64
 11  num_verificacoes_credito  100000 non-null  float64
 12  mix_credito               100000 non-null  object 
 13  divida_total              100000 non-null  fl

In [111]:
# Utilizando o label encoder para transformar os valores categóricos em númericos
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
clientes['score_credito_cod'] = encoder.fit_transform(clientes['score_credito'])
clientes['profissao'] = encoder.fit_transform(clientes['profissao'])
clientes['mix_credito'] = encoder.fit_transform(clientes['mix_credito'])
clientes['comportamento_pagamento'] = encoder.fit_transform(clientes['comportamento_pagamento'])

display(clientes['score_credito'].value_counts())
display(clientes['score_credito_cod'].value_counts())

Standard    53174
Poor        28998
Good        17828
Name: score_credito, dtype: int64

2    53174
1    28998
0    17828
Name: score_credito_cod, dtype: int64

In [112]:
# Correlação entre as colunas
matriz_correlacao = clientes.drop(columns = ['score_credito','id_cliente'], axis = 1).corr().round(2)

display(matriz_correlacao)

Unnamed: 0,mes,idade,profissao,salario_anual,num_contas,num_cartoes,juros_emprestimo,num_emprestimos,dias_atraso,num_pagamentos_atrasados,...,idade_historico_credito,investimento_mensal,comportamento_pagamento,saldo_final_mes,emprestimo_carro,emprestimo_casa,emprestimo_pessoal,emprestimo_credito,emprestimo_estudantil,score_credito_cod
mes,1.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.04
idade,0.02,1.0,-0.01,0.09,-0.19,-0.15,-0.22,-0.21,-0.17,-0.18,...,0.23,0.07,-0.03,0.12,-0.08,-0.11,-0.1,-0.08,-0.08,-0.06
profissao,0.0,-0.01,1.0,-0.01,0.01,0.01,0.01,0.0,0.01,-0.01,...,-0.01,-0.01,0.01,-0.01,-0.0,0.01,-0.01,-0.01,0.0,-0.0
salario_anual,0.0,0.09,-0.01,1.0,-0.28,-0.22,-0.3,-0.26,-0.25,-0.29,...,0.27,0.81,-0.25,0.63,-0.1,-0.12,-0.1,-0.12,-0.1,-0.08
num_contas,0.0,-0.19,0.01,-0.28,1.0,0.44,0.58,0.47,0.56,0.6,...,-0.49,-0.23,0.08,-0.29,0.19,0.24,0.19,0.19,0.16,0.18
num_cartoes,0.0,-0.15,0.01,-0.22,0.44,1.0,0.5,0.42,0.48,0.42,...,-0.42,-0.17,0.06,-0.23,0.16,0.2,0.16,0.17,0.16,0.11
juros_emprestimo,0.0,-0.22,0.01,-0.3,0.58,0.5,1.0,0.56,0.59,0.57,...,-0.58,-0.24,0.09,-0.32,0.23,0.28,0.21,0.23,0.21,0.12
num_emprestimos,0.0,-0.21,0.0,-0.26,0.47,0.42,0.56,1.0,0.5,0.47,...,-0.61,-0.21,0.07,-0.43,0.39,0.51,0.39,0.41,0.38,0.06
dias_atraso,0.0,-0.17,0.01,-0.25,0.56,0.48,0.59,0.5,1.0,0.54,...,-0.49,-0.2,0.07,-0.27,0.19,0.24,0.2,0.19,0.19,0.1
num_pagamentos_atrasados,0.0,-0.18,-0.01,-0.29,0.6,0.42,0.57,0.47,0.54,1.0,...,-0.48,-0.23,0.08,-0.29,0.19,0.23,0.18,0.2,0.18,0.2


In [113]:
# Analisando as correlações existentes que são maiores do que 20%, em especial os valores com a coluna score_credito_cod(coluna alvo)
# Isso pode nos ajudar a formular hipóteses e também a escolher posteriormente as melhores features para o modelo

taxa_correlacao_analisar = 0.2
lista_correlacoes = []
for coluna in matriz_correlacao:

  for coluna2, valor_correlacao in matriz_correlacao[coluna].items():
    if coluna != coluna2:
      if valor_correlacao >= taxa_correlacao_analisar:
        lista_correlacoes.append((coluna, valor_correlacao, coluna2))

def correlacao(tupla):
  return tupla[1]


lista_correlacoes.sort(key = correlacao,reverse = True)
tabela_correlacoes = pd.DataFrame(lista_correlacoes, columns = ['coluna1', 'correlacao', 'coluna2'])
display(tabela_correlacoes)

Unnamed: 0,coluna1,correlacao,coluna2
0,salario_anual,0.81,investimento_mensal
1,investimento_mensal,0.81,salario_anual
2,num_pagamentos_atrasados,0.76,mix_credito
3,mix_credito,0.76,num_pagamentos_atrasados
4,juros_emprestimo,0.75,mix_credito
...,...,...,...
149,num_pagamentos_atrasados,0.20,score_credito_cod
150,emprestimo_casa,0.20,num_cartoes
151,emprestimo_pessoal,0.20,dias_atraso
152,emprestimo_credito,0.20,num_pagamentos_atrasados


In [134]:
# Divisão dos dados em variável resposta Y e as features X

y = clientes['score_credito_cod']

x = clientes.drop(columns = ['score_credito_cod','id_cliente','score_credito'], axis = 1)

In [152]:
# Utilizando o train test split para realizar as divisões dos dados entre dados de treino e dados de teste
from sklearn.model_selection import train_test_split, cross_val_score

x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size = 0.25)

In [153]:
# Criar um modelo de IA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Criar
arvore = RandomForestClassifier()
knn = KNeighborsClassifier(n_neighbors = 5)

#Treinar
knn = knn.fit(x_treino, y_treino)
arvore = arvore.fit(x_treino, y_treino)

In [154]:
# Escolher o melhor modelo
from sklearn.metrics import accuracy_score

arvore_scores = cross_val_score(arvore, x_treino, y_treino, cv=5, )
knn_scores = cross_val_score(knn, x_treino, y_treino, cv=5)

print('Acurácia média Random Forest (validação cruzada):', arvore_scores.mean())
print('Acurácia média KNN (validação cruzada):', knn_scores.mean())

# Fazer previsões
previsao_arvore = arvore.predict(x_teste)
previsao_knn = knn.predict(x_teste)

print('Previsão do Random Forest para a acurácia:', accuracy_score(y_teste, previsao_arvore))
print('Previsão do KNN para a acurácia:', accuracy_score(y_teste, previsao_knn))

Acurácia média Random Forest (validação cruzada): 0.8183866666666667
Acurácia média KNN (validação cruzada): 0.7204666666666666
Previsão do Random Forest para a acurácia: 0.82704
Previsão do KNN para a acurácia: 0.73868


In [155]:
# Previsão de dados dos novos clientes
novos_clientes = pd.read_csv("novos_clientes.csv")
print(novos_clientes.info())

# Tratar a nova base de dados com o tratamento que foi aplicado no treino
novos_clientes['profissao'] = encoder.fit_transform(novos_clientes['profissao'])
novos_clientes['mix_credito'] = encoder.fit_transform(novos_clientes['mix_credito'])
novos_clientes['comportamento_pagamento'] = encoder.fit_transform(novos_clientes['comportamento_pagamento'])

# display(novos_clientes)
previsao = arvore.predict(novos_clientes)

def mudanca_categoria(dados):
  if dados == 0:
    return 'Good'
  elif dados == 1:
    return 'Poor'
  else:
    return 'Standard'


novos_clientes['score_credito'] = previsao
novos_clientes['score_credito'] = novos_clientes['score_credito'].apply(mudanca_categoria)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   mes                       3 non-null      int64  
 1   idade                     3 non-null      float64
 2   profissao                 3 non-null      object 
 3   salario_anual             3 non-null      float64
 4   num_contas                3 non-null      float64
 5   num_cartoes               3 non-null      float64
 6   juros_emprestimo          3 non-null      float64
 7   num_emprestimos           3 non-null      float64
 8   dias_atraso               3 non-null      float64
 9   num_pagamentos_atrasados  3 non-null      float64
 10  num_verificacoes_credito  3 non-null      float64
 11  mix_credito               3 non-null      object 
 12  divida_total              3 non-null      float64
 13  taxa_uso_credito          3 non-null      float64
 14  idade_historic

In [156]:
novos_clientes

Unnamed: 0,mes,idade,profissao,salario_anual,num_contas,num_cartoes,juros_emprestimo,num_emprestimos,dias_atraso,num_pagamentos_atrasados,...,idade_historico_credito,investimento_mensal,comportamento_pagamento,saldo_final_mes,emprestimo_carro,emprestimo_casa,emprestimo_pessoal,emprestimo_credito,emprestimo_estudantil,score_credito
0,1,31.0,1,19300.34,6.0,7.0,17.0,5.0,52.0,19.0,...,218.0,44.50951,1,312.487689,1,1,0,0,0,Poor
1,4,32.0,0,12600.445,5.0,5.0,10.0,3.0,25.0,18.0,...,12.0,0.0,2,300.994163,0,0,0,0,1,Good
2,2,48.0,1,20787.69,8.0,6.0,14.0,7.0,24.0,14.0,...,215.0,0.0,0,345.081577,0,1,0,1,0,Good


In [157]:
previsao

array([1, 0, 0])