# Binary Classification of Diabetic Patients

# 1. Load .csv file using Python Standard Library

In [2]:
import csv
import numpy
filename = 'pima-indians-diabetes.csv'
raw_data = open(filename , 'rt')
reader = csv.reader(raw_data, delimiter=' ', quoting=csv.QUOTE_NONE)
x = list(reader)
data=numpy.array(x)
print(data.shape)

(768, 1)


# 2. Load .csv file using NumPy

In [3]:
import csv
import numpy
filename = 'pima-indians-diabetes.csv'
raw_data = open(filename, 'rt')
reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
x = list(reader)
data = numpy.array(x)
print(data.shape)

(768, 9)


# 3. Load .csv file using Pandas

In [4]:
import pandas
filename = 'pima-indians-diabetes.csv'
names = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']
data = pandas.read_csv(filename, names=names)
print(data.shape)

(768, 9)


# Describing the data

### top 5 rows

In [5]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Bottom 5 rows

In [6]:
data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [8]:
data.shape

(768, 9)

In [9]:
data.size

6912

In [11]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [12]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


# Remove Insignificant Columns


In [27]:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(12).reshape(3, 4),columns=['A', 'B', 'C', 'D'])
df


Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


Suppose we have to remove column C so let's check below

In [28]:
df.drop(columns=['C'])

Unnamed: 0,A,B,D
0,0,1,3
1,4,5,7
2,8,9,11


# Marking the Missing Values 

In [21]:
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
print(dataset.describe())

                0           1           2           3           4           5  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   
75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

                6           7           8  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476951  
min      0.078000   21.000000    0.000000  
25%      0.243750   24.000000    0.000000  
50%   

Let's take example of 50 rows of data and analyze it.

In [12]:
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
print(dataset.head(50))

     0    1    2   3    4     5      6   7  8
0    6  148   72  35    0  33.6  0.627  50  1
1    1   85   66  29    0  26.6  0.351  31  0
2    8  183   64   0    0  23.3  0.672  32  1
3    1   89   66  23   94  28.1  0.167  21  0
4    0  137   40  35  168  43.1  2.288  33  1
5    5  116   74   0    0  25.6  0.201  30  0
6    3   78   50  32   88  31.0  0.248  26  1
7   10  115    0   0    0  35.3  0.134  29  0
8    2  197   70  45  543  30.5  0.158  53  1
9    8  125   96   0    0   0.0  0.232  54  1
10   4  110   92   0    0  37.6  0.191  30  0
11  10  168   74   0    0  38.0  0.537  34  1
12  10  139   80   0    0  27.1  1.441  57  0
13   1  189   60  23  846  30.1  0.398  59  1
14   5  166   72  19  175  25.8  0.587  51  1
15   7  100    0   0    0  30.0  0.484  32  1
16   0  118   84  47  230  45.8  0.551  31  1
17   7  107   74   0    0  29.6  0.254  31  1
18   1  103   30  38   83  43.3  0.183  33  0
19   1  115   70  30   96  34.6  0.529  32  1
20   3  126   88  41  235  39.3  0

Now, let's count how many values are missing column-wise. 

In [22]:
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
num_missing = (dataset[[1,2,3,4,5,6,7]] == 0).sum()
print(num_missing)

1      5
2     35
3    227
4    374
5     11
6      0
7      0
dtype: int64


We can see that 
column 1 have 5 missing values,
column 2 have 35 missing values,
column 3 have 227 missing values,
column 4 have 374 missing values,
column 5 have 11 missing values,
column 6 have 0 missing values,
column 7 have 0 missing values.

# Removing Rows having missing values

In [23]:
from numpy import nan
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
print(dataset.isnull().sum())

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64


In [24]:
from numpy import nan
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
print(dataset.shape)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
dataset.dropna(inplace=True)
print(dataset.shape)

(768, 9)
(392, 9)


So, here we can see the difference that earlier the no. of rows were 768, and after removing the rows having missing values the number of rows are now 392. 

# Formatting the Data

Formatting: The data you have selected may not be in a format that is suitable for you to work with. The data may be in a relational database and you would like it in a flat file, or the data may be in a proprietary file format and you would like it in a relational database or a text file.


So to make the dataset compatible we have converted the dataset into .csv file.