# MODULE 5: PREPROCESSING YOUR DATA

    *************************************************************
    Author:  Adeyemi Adedoyin Simeon
    Program: MSc, Computer Science, University of Ibadan
    Course:  Machine Learning
    Date:    26th May, 2019
    Version: 1.2
    E-mail:  adeyemi.sa1@gmail.com
    *************************************************************
    
    *Note: Please reference the author whenever and wherever you use all/portion of this code*

## Importing Libraries

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

## Loading the Diabetes Dataset and Describing Its Features

In [4]:
# Load the diabetes dataset that ships with scikit-learn (see the
# `load_diabetes` import in the first cell).
diabetes = load_diabetes()

In [5]:
# List the fields exposed by the loaded dataset object.
diabetes.keys()

dict_keys(['data', 'target', 'DESCR', 'feature_names'])

In [6]:
# Print the description text bundled with the dataset (its DESCR field).
print(diabetes.DESCR)

Diabetes dataset

Notes
-----

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

Data Set Characteristics:

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attributes:
    :Age:
    :Sex:
    :Body mass index:
    :Average blood pressure:
    :S1:
    :S2:
    :S3:
    :S4:
    :S5:
    :S6:

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani

In [7]:
# Show the names of the predictor columns.
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [11]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the
# dataset's feature names.
diabetes_df = pd.DataFrame(
    diabetes.data,
    columns=diabetes.feature_names,
)

In [12]:
# Append the response values as a 'target' column so that features and
# response live in one frame.
diabetes_df['target'] = diabetes.target

In [15]:
# Preview the first rows of the combined frame (features plus target).
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [16]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
age       442 non-null float64
sex       442 non-null float64
bmi       442 non-null float64
bp        442 non-null float64
s1        442 non-null float64
s2        442 non-null float64
s3        442 non-null float64
s4        442 non-null float64
s5        442 non-null float64
s6        442 non-null float64
target    442 non-null float64
dtypes: float64(11)
memory usage: 38.1 KB


# Preprocessing: Separating the independent variables (X) from the dependent variable (y)

In [19]:
# Response vector: the 'target' column on its own.
y = diabetes_df['target']
# Predictor matrix: every remaining column once 'target' is removed.
X = diabetes_df.drop(labels=['target'], axis='columns')

# Data Preprocessing: Splitting into a training set (80%) and a testing set (20%)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [22]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
17,0.070769,0.05068,0.012117,0.056301,0.034206,0.049416,-0.039719,0.034309,0.027368,-0.001078
66,-0.009147,0.05068,-0.018062,-0.033214,-0.020832,0.012152,-0.072854,0.07121,0.000271,0.019633
137,0.005383,-0.044642,0.04984,0.097616,-0.015328,-0.016345,-0.006584,-0.002592,0.017037,-0.013504
245,-0.02731,-0.044642,-0.035307,-0.029771,-0.056607,-0.05862,0.030232,-0.039493,-0.049868,-0.129483
31,-0.023677,-0.044642,-0.065486,-0.081414,-0.03872,-0.05361,0.059685,-0.076395,-0.037128,-0.042499


In [23]:
# Preview the first training targets.
y_train.head()

17     144.0
66     150.0
137    280.0
245    125.0
31      59.0
Name: target, dtype: float64

In [24]:
# Preview the first rows of the testing features.
X_test.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
287,0.045341,-0.044642,-0.006206,-0.015999,0.125019,0.125198,0.019187,0.034309,0.032433,-0.00522
211,0.092564,-0.044642,0.036907,0.021872,-0.02496,-0.016658,0.000779,-0.039493,-0.022512,-0.021788
72,0.063504,0.05068,-0.00405,-0.012556,0.103003,0.04879,0.056003,-0.002592,0.084495,-0.017646
321,0.096197,-0.044642,0.051996,0.079254,0.054845,0.036577,-0.076536,0.141322,0.098646,0.061054
73,0.012648,0.05068,-0.020218,-0.002228,0.038334,0.053174,-0.006584,0.034309,-0.005145,-0.009362


In [25]:
# Preview the first testing targets.
y_test.head()

287    219.0
211     70.0
72     202.0
321    230.0
73     111.0
Name: target, dtype: float64

In [27]:
# Sanity-check the split: report each piece's row count, then its full
# shape, then how many distinct target values each target split holds.
split_parts = (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
)

for split_name, split_data in split_parts:
    print('The length of {}:'.format(split_name), len(split_data))
print('\n')

for split_name, split_data in split_parts:
    print('The shape of {}:'.format(split_name), split_data.shape)
print('\n')

for split_name, split_data in (('y_train', y_train), ('y_test', y_test)):
    print('The length of Unique {}:'.format(split_name), split_data.nunique())

The length of X_train: 353
The length of y_train: 353
The length of X_test: 89
The length of y_test: 89


The shape of X_train: (353, 10)
The shape of y_train: (353,)
The shape of X_test: (89, 10)
The shape of y_test: (89,)


The length of Unique y_train: 199
The length of Unique y_test: 75
