# **Código de Escoragem**

## **Bibliotecas necessárias para o projeto**

In [None]:
# Bibliotecas necessárias para o projeto
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
import pickle
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive at /content/drive so the Drive paths used below can be opened.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Leitura dos dados a serem escorados**

In [None]:
# Read the data set to be scored from the mounted Drive folder.
# Plain string literal: the path contains no placeholders, so no f-prefix is needed.
datapath = '/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/diabetes.csv'

df00 = pd.read_csv(datapath, sep=',')

# Show (rows, columns) as a quick sanity check of the load.
df00.shape

(768, 9)

## **Separar 70% dos dados para treino e 30% para validação**

In [None]:
# Keep 70% of the rows for training and hold out 30% for validation.
# The fixed random_state makes the split reproducible between runs.
train, test = train_test_split(
    df00,
    test_size=0.3,
    random_state=42,
)
(train.shape, test.shape)

((537, 9), (231, 9))

In [None]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
334,1,95,60,18,58,23.9,0.26,22,0
139,5,105,72,29,325,36.9,0.159,28,0
485,0,135,68,42,250,42.3,0.365,24,1
547,4,131,68,21,166,33.1,0.16,28,0
18,1,103,30,38,83,43.3,0.183,33,0


In [None]:
# Preview the first five rows of the validation split.
test.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
668,6,98,58,33,190,34.0,0.43,43,0
324,2,112,75,32,0,35.7,0.148,21,0
624,2,108,64,0,0,30.8,0.158,21,0
690,8,107,80,0,0,24.6,0.856,34,0
473,7,136,90,0,0,29.9,0.21,50,0


In [None]:
# Work on independent copies so the original train/test splits stay untouched.
df_train_01, df_test_01 = train.copy(), test.copy()

In [None]:
# Show (rows, columns) of the training copy.
df_train_01.shape

(537, 9)

In [None]:
# Remove the target column before scoring: it is not needed to score and
# will not be available in production. Only 'Outcome' is dropped here; the
# frames previewed above carry no separate ID column.
# The frames are derived from `train`/`test` instead of being reassigned from
# themselves, so re-running this cell does not fail with a KeyError.
df_train_01 = train.drop(columns=['Outcome'])
df_test_01 = test.drop(columns=['Outcome'])

df_train_01.shape, df_test_01.shape

((537, 8), (231, 8))

In [None]:
# Preview the training features after the target was removed.
df_train_01.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
334,1,95,60,18,58,23.9,0.26,22
139,5,105,72,29,325,36.9,0.159,28
485,0,135,68,42,250,42.3,0.365,24
547,4,131,68,21,166,33.1,0.16,28
18,1,103,30,38,83,43.3,0.183,33


## **Carregar os encoders e a lista de colunas**

In [None]:
# Load the saved label encoders together with the list of columns they apply to.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/01 Data Prep/prd_labelenc_a014.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

loaded_encoders = loaded_data['encoders']
loaded_columns = loaded_data['columns']

# Encode the training frame first, then the test frame, with the same loaded encoders.
for frame in (df_train_01, df_test_01):
    for col in loaded_columns:
        # Columns without a saved encoder are left as they are.
        if col not in loaded_encoders:
            continue
        frame[col] = loaded_encoders[col].transform(frame[col])

In [None]:
def _apply_onehot(frame, encoder, columns):
    """Return `frame` with `columns` replaced by their one-hot encoded columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the columns to encode.
    encoder : object
        Already loaded encoder exposing `transform` and `get_feature_names_out`.
    columns : list
        Names of the columns handed to the encoder.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of `frame` followed by the encoded columns,
        with the row index of `frame` preserved.
    """
    encoded = pd.DataFrame(
        encoder.transform(frame[columns]),
        columns=encoder.get_feature_names_out(columns),
        index=frame.index,
    )
    return pd.concat([frame.drop(columns, axis=1), encoded], axis=1)


# Load the saved one-hot encoder and the list of columns it applies to.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/01 Data Prep/prd_onehotenc_a014.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

loaded_encoder = loaded_data['encoder']
loaded_columns = loaded_data['columns']

# The same helper handles both bases, so the training intermediates are no
# longer stored in variables named "*_test".
df_train_03 = _apply_onehot(df_train_01, loaded_encoder, loaded_columns)
df_test_03 = _apply_onehot(df_test_01, loaded_encoder, loaded_columns)

In [None]:
# Preview the training frame after the one-hot encoding step.
df_train_03.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
334,1,95,60,18,58,23.9,0.26,22
139,5,105,72,29,325,36.9,0.159,28
485,0,135,68,42,250,42.3,0.365,24
547,4,131,68,21,166,33.1,0.16,28
18,1,103,30,38,83,43.3,0.183,33


In [None]:
# Plain rebinding (no copy): the *_04 names refer to the same objects as the *_03 frames.
df_train_04, df_test_04 = df_train_03, df_test_03

## **Carregar a padronização**

In [None]:
# Load the saved scaler.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
scaler_path = '/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/01 Data Prep/prd_scaler_a014.pkl'
with open(scaler_path, 'rb') as f:
    loaded_scaler = pickle.load(f)

# Scale the training base and rebuild a DataFrame with the original labels.
df_train_04_scaled = loaded_scaler.transform(df_train_04)
df_train_05 = pd.DataFrame(
    df_train_04_scaled,
    columns=df_train_04.columns,
    index=df_train_04.index,
)

# Same treatment for the test base.
df_test_04s = loaded_scaler.transform(df_test_04)
df_test_05 = pd.DataFrame(
    df_test_04s,
    columns=df_test_04.columns,
    index=df_test_04.index,
)
df_test_05.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
668,0.697483,-0.707199,-0.646399,0.812079,0.957202,0.26576,-0.116804,0.850192
324,-0.529539,-0.273888,0.293996,0.747464,-0.693688,0.488933,-0.941923,-1.034268
624,-0.529539,-0.397691,-0.314495,-1.320215,-0.693688,-0.154332,-0.912664,-1.034268
690,1.310994,-0.428642,0.570582,-1.320215,-0.693688,-0.968258,1.129653,0.079277
473,1.004239,0.46893,1.123756,-1.320215,-0.693688,-0.272482,-0.760514,1.449793


## **Carregar lista de variáveis que passaram pelo Feature Selection (utilizadas no treinamento do modelo)**

In [None]:
# Load the list of variables that passed Feature Selection (the ones used to train the model).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
features_path = '/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/02 Feature Selection/prd_list_features_a014.pkl'
with open(features_path, 'rb') as f:
    loaded_features = pickle.load(f)

# Display the loaded feature names.
loaded_features

['Glucose', 'BMI', 'DiabetesPedigreeFunction', 'Age']

In [None]:
# Keep only the columns listed in loaded_features.
# .copy() makes each result an independent frame: new columns are added to
# these frames later, and assigning to a bare column-subset slice triggers
# pandas' SettingWithCopyWarning.
abt_train = df_train_05[loaded_features].copy()
abt_test = df_test_05[loaded_features].copy()

In [None]:
# Preview the training base restricted to the selected features.
abt_train.head(n=5)

Unnamed: 0,Glucose,BMI,DiabetesPedigreeFunction,Age
334,-0.800051,-1.060153,-0.614216,-0.94861
139,-0.490543,0.646467,-0.909738,-0.434667
485,0.437979,1.355371,-0.306991,-0.777296
547,0.314176,0.147609,-0.906812,-0.434667
18,-0.552445,1.48665,-0.839515,-0.00638


## **Carregando modelo campeão**

In [None]:
# Load the champion model.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
model_path = '/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/03 Modelos/best_model_rf.pkl'
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Display the loaded estimator.
loaded_model

## **Escorando base de treino e teste**

In [None]:
# Score the training base.
# score_1 is the second column of predict_proba (presumably the probability of
# class 1 - confirm against loaded_model.classes_); class is the predicted label.
predprob = loaded_model.predict_proba(abt_train)
predict = loaded_model.predict(abt_train)
# assign() returns a new DataFrame instead of writing into a sliced frame,
# which avoids pandas' SettingWithCopyWarning. 'class' is a Python keyword,
# so it is passed through dict unpacking.
abt_train = abt_train.assign(score_1=predprob[:, 1], **{'class': predict})

# Score the test base in the same way.
predprob = loaded_model.predict_proba(abt_test)
predict = loaded_model.predict(abt_test)
abt_test = abt_test.assign(score_1=predprob[:, 1], **{'class': predict})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abt_train['score_1']=predprob[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abt_train['class']=predict
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abt_test['score_1']=predprob[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [None]:
# Attach the score and predicted class to the original (unscaled) rows,
# matching on the row index of both frames.
score_columns = ['score_1', 'class']
abt_train = train.merge(abt_train[score_columns], left_index=True, right_index=True)
abt_test = test.merge(abt_test[score_columns], left_index=True, right_index=True)

In [None]:
# Preview the scored training base.
abt_train.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,score_1,class
334,1,95,60,18,58,23.9,0.26,22,0,0.026492,0
139,5,105,72,29,325,36.9,0.159,28,0,0.199819,0
485,0,135,68,42,250,42.3,0.365,24,1,0.477085,0
547,4,131,68,21,166,33.1,0.16,28,0,0.285119,0
18,1,103,30,38,83,43.3,0.183,33,0,0.351661,0


In [None]:
# Preview the scored test base.
abt_test.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,score_1,class
668,6,98,58,33,190,34.0,0.43,43,0,0.334777,0
324,2,112,75,32,0,35.7,0.148,21,0,0.159352,0
624,2,108,64,0,0,30.8,0.158,21,0,0.129842,0
690,8,107,80,0,0,24.6,0.856,34,0,0.222774,0
473,7,136,90,0,0,29.9,0.21,50,0,0.40854,0


## **Salvando como arquivo csv**

In [None]:
# Save the scored training base as a CSV file.
# NOTE(review): only abt_train is written (abt_test is not saved) and
# index=False drops the row index from the file - confirm both are intended.
output_path = '/content/drive/MyDrive/2. Study  Work/Pod Academy/Tarefas/Ciência de Dados/Projetos/Predição de Diabetes/abt_scored_001.csv'
abt_train.to_csv(output_path, index=False)

## **Conclusão**

Ao finalizar o projeto, o modelo desenvolvido revela a capacidade de prever a probabilidade de um indivíduo desenvolver diabetes. Essa informação permite a implementação de medidas preventivas antecipadas para mitigar os impactos da doença ou adiar sua manifestação.

O modelo demonstra uma sólida capacidade de predição, sendo eficaz na identificação de indivíduos mais ou menos propensos à condição. Nesse sentido, é recomendado que aqueles com pontuações intermediárias busquem uma avaliação mais detalhada por profissionais de saúde. Assim, o modelo serve como uma ferramenta valiosa para auxiliar na tomada de decisão desses profissionais.

## **Próximos Passos**

O desenvolvimento subsequente do projeto envolverá a implementação em produção, possibilitando a criação de um aplicativo para entrada de informações dos pacientes e obtenção de resultados em tempo real. Isso permitirá identificar rapidamente se uma pessoa é propensa, não é propensa ou requer uma análise mais detalhada.