Data loading, pre-processing and representation
---



In [1]:
#adding libraries
from sklearn import linear_model  
from sklearn.linear_model import LinearRegression  
import csv          
import requests     
import numpy as np  
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn import metrics
import pandas

In [2]:
# Read the diabetes records from the CSV file into a DataFrame
my_dataset = pd.read_csv('diabetes.csv')
# Show the dtype pandas inferred for each column
column_dtypes = my_dataset.dtypes
print(column_dtypes)

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


### Datatype matching

In [3]:
# Every column is already numeric (int64 or float64), so no dtype conversion is done here
dtype_overview = my_dataset.dtypes
print(dtype_overview)

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


### Representing and analysing the dataset

In [4]:

# Summary statistics (count, mean, std, min, quartiles, max) for every column
print(my_dataset.describe())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line
my_dataset.info()

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

The summary above shows that no column contains null values (every count is 768). However, Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a minimum of 0, which is not a plausible measurement and most likely marks a missing value. In the next step the zeros in BloodPressure, SkinThickness and BMI are replaced by the mean value of the respective attribute.


### Replacing missing values

In [8]:
# No object in the dataset is missing a feature, so no blanks need to be filled.
# The zeros in the three columns below are treated as invalid entries (outliers)
# and are replaced with the column mean.
# NOTE(review): the mean is taken over the whole column, zeros included,
# which pulls the replacement value down — consider excluding the zeros.
for column in ('BloodPressure', 'SkinThickness', 'BMI'):
    # Assign the result back instead of calling replace(..., inplace=True) on
    # my_dataset[column]: the in-place call acts on an intermediate Series and
    # does not update my_dataset when pandas Copy-on-Write is active.
    my_dataset[column] = my_dataset[column].replace(0, my_dataset[column].mean())
# info() prints its own report and returns None, so it is not wrapped in print()
my_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    float64
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 54.1 KB
None


In [6]:
# Check the categorical target: list how often each distinct 'Outcome' value occurs
outcome_counts = my_dataset['Outcome'].value_counts()
print(outcome_counts)

0    500
1    268
Name: Outcome, dtype: int64


### Data normalization

In [9]:
# Min-max normalisation: rescale each of the eight feature columns to the [0, 1] range
# to make the data more consistent. 'Outcome' is left untouched.
columns_to_scale = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
for column in columns_to_scale:
    # Read both bounds before the column is overwritten
    lowest = my_dataset[column].min()
    highest = my_dataset[column].max()
    # (x - min) / (max - min)
    my_dataset[column] = (my_dataset[column] - lowest) / (highest - lowest)
print(my_dataset.head(10))
print(my_dataset.describe())

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.352941  0.743719       0.456522       0.296703  0.000000  0.314928   
1     0.058824  0.427136       0.391304       0.230769  0.000000  0.171779   
2     0.470588  0.919598       0.369565       0.137763  0.000000  0.104294   
3     0.058824  0.447236       0.391304       0.164835  0.111111  0.202454   
4     0.000000  0.688442       0.108696       0.296703  0.198582  0.509202   
5     0.294118  0.582915       0.478261       0.137763  0.000000  0.151329   
6     0.176471  0.391960       0.217391       0.263736  0.104019  0.261759   
7     0.588235  0.577889       0.425059       0.137763  0.000000  0.349693   
8     0.117647  0.989950       0.434783       0.406593  0.641844  0.251534   
9     0.470588  0.628141       0.717391       0.137763  0.000000  0.282057   

   DiabetesPedigreeFunction       Age  Outcome  
0                  0.234415  0.483333        1  
1                  0.116567  0.166667      