# Installing the libraries

In [3]:
#!pip3 install pandas
!pip3 install impyute

# Importing needed libraries

In [4]:
import pandas as pd
import numpy as np

import sys
sys.setrecursionlimit(100000) #Increase the recursion limit of the OS

from impyute.imputation.cs import fast_knn

# Exploratory Data Analysis (EDA)

In [5]:
data = pd.read_csv('diabetes_dataset.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
4,5,116.0,74.0,,,25.6,0.201,30,0


In [6]:
data.dtypes

Pregnancies                   int64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

### Taking a look at the **missing** data

In [7]:
data.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

Here we can see that there is a lot of missing data at the *Glucose, BloodPressure, SkinThickness, Insulin, BMI*. And they all are **numeric** values.

# Data Imputation:

## Imputation Using k-NN:

In [33]:


# start the KNN training
imputed_training = fast_knn(data.values, k=30)

In [29]:
imputed_data = pd.DataFrame(imputed_training, columns=data.columns)

# Checking again if there is any missing value.
imputed_data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

## Imputation Using Multivariate Imputation by Chained Equation (MICE)

In [37]:
from impyute.imputation.cs import mice

# Reading dataset with missing data again
data = pd.read_csv('diabetes_dataset.csv')

# start the MICE training
imputed_training = mice(data.values)

imputed_data = pd.DataFrame(imputed_training, columns=data.columns)

# Checking again if there is any missing value.
imputed_data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
from sklearn import preprocessing
data_normalized = preprocessing.normalize(impute, norm='l2')
data_normalized

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Imputation Using Random Forest Imputation (MissForest)

In [49]:
# Reading dataset with missing data again
data = pd.read_csv('diabetes_dataset.csv')

# Let data be an array containing missing values
from missingpy import MissForest
imputer = MissForest()
imputed_training = imputer.fit_transform(data)

imputed_data = pd.DataFrame(imputed_training, columns=data.columns)

# Checking again if there is any missing value.
imputed_data.isna().sum()

Iteration: 0
Iteration: 1
Iteration: 2


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Write 'diabetes_dataset_franca.csv'

In [51]:
imputed_data.to_csv('diabetes_dataset_franca.csv')

# Running Aydano's script to send to server:

In [58]:
!python3 diabetes_csv.py


 - Lendo o arquivo com o dataset sobre diabetes
 - Criando X e y para o algoritmo de aprendizagem a partir do arquivo diabetes_dataset
 - Criando modelo preditivo
 - Aplicando modelo e enviando para o servidor
 - Resposta do servidor:
 {"status":"success","dev_key":"Vov\u00f3Learn","accuracy":0.7346938775510204,"old_accuracy":0.74489795918367} 

