# load and summarize the diabetes dataset

In [None]:
# load the dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [None]:
# summarize the shape of the dataset
print(dataset.shape)

In [None]:
# summarize each variable
print(dataset.describe())

In [None]:
# histograms of the variables
# hist() hands back an array of axes (one per column); keep the labels tiny
fig = dataset.hist(xlabelsize=4, ylabelsize=4)
# shrink each subplot title to match; a plain loop avoids building (and
# echoing) a throwaway list of None values
for ax in fig.ravel():
    ax.title.set_size(4)

In [None]:
# show the plot
from matplotlib import pyplot
pyplot.show()

# evaluate knn on the raw diabetes dataset

In [None]:
# load the dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
data = dataset.values

In [None]:
# separate into input and output columns
X, y = data[:, :-1], data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define and configure the model
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [None]:
# evaluate the model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report model performance
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# visualize a minmax scaler transform of the diabetes dataset

In [None]:
# load the dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [None]:
# retrieve just the numeric input values
data = dataset.values[:, :-1]

In [None]:
# perform a min-max scaler transform of the dataset
from sklearn.preprocessing import MinMaxScaler
trans = MinMaxScaler()
data = trans.fit_transform(data)

In [None]:
# convert the array back to a dataframe
from pandas import DataFrame
dataset = DataFrame(data)

In [None]:
# summarize
print(dataset.describe())

In [None]:
# histograms of the variables
# hist() hands back an array of axes (one per column); keep the labels tiny
fig = dataset.hist(xlabelsize=4, ylabelsize=4)
# shrink each subplot title to match; a plain loop avoids building (and
# echoing) a throwaway list of None values
for ax in fig.ravel():
    ax.title.set_size(4)

In [None]:
# show the plot
from matplotlib import pyplot
pyplot.show()

# evaluate knn on the diabetes dataset with minmax scaler transform

In [None]:
# load the dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
data = dataset.values

In [None]:
# separate into input and output columns
X, y = data[:, :-1], data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define the pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
trans = MinMaxScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])

In [None]:
# evaluate the pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report pipeline performance
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# visualize a standard scaler transform of the diabetes dataset

In [None]:
# load the dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [None]:
# retrieve just the numeric input values
data = dataset.values[:, :-1]

In [None]:
# perform a standard scaler transform of the dataset
from sklearn.preprocessing import StandardScaler
trans = StandardScaler()
data = trans.fit_transform(data)

In [None]:
# convert the array back to a dataframe
from pandas import DataFrame
dataset = DataFrame(data)

In [None]:
# summarize
print(dataset.describe())

In [None]:
# histograms of the variables
# hist() hands back an array of axes (one per column); keep the labels tiny
fig = dataset.hist(xlabelsize=4, ylabelsize=4)
# shrink each subplot title to match; a plain loop avoids building (and
# echoing) a throwaway list of None values
for ax in fig.ravel():
    ax.title.set_size(4)

In [None]:
# show the plot
from matplotlib import pyplot
pyplot.show()

# evaluate knn on the diabetes dataset with standard scaler transform

In [None]:
# load the dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
data = dataset.values

In [None]:
# separate into input and output columns
X, y = data[:, :-1], data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define the pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
trans = StandardScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])

In [None]:
# evaluate the pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report pipeline performance
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# load and summarize the diabetes dataset

In [None]:
# load dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [None]:
# summarize the shape of the dataset
print(dataset.shape)

In [None]:
# summarize each variable
print(dataset.describe())

In [None]:
# histograms of the variables
# hist() hands back an array of axes (one per column); keep the labels tiny
fig = dataset.hist(xlabelsize=4, ylabelsize=4)
# shrink each subplot title to match; a plain loop avoids building (and
# echoing) a throwaway list of None values
for ax in fig.ravel():
    ax.title.set_size(4)

In [None]:
# show the plot
from matplotlib import pyplot
pyplot.show()

# evaluate knn on the raw diabetes dataset

In [None]:
# load dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
data = dataset.values

In [None]:
# separate into input and output columns
X, y = data[:, :-1], data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define and configure the model
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [None]:
# evaluate the model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report model performance
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# visualize a robust scaler transform of the diabetes dataset

In [None]:
# load dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [None]:
# retrieve just the numeric input values
data = dataset.values[:, :-1]

In [None]:
# perform a robust scaler transform of the dataset
from sklearn.preprocessing import RobustScaler
trans = RobustScaler()
data = trans.fit_transform(data)

In [None]:
# convert the array back to a dataframe
from pandas import DataFrame
dataset = DataFrame(data)

In [None]:
# summarize
print(dataset.describe())

In [None]:
# histograms of the variables
# hist() hands back an array of axes (one per column); keep the labels tiny
fig = dataset.hist(xlabelsize=4, ylabelsize=4)
# shrink each subplot title to match; a plain loop avoids building (and
# echoing) a throwaway list of None values
for ax in fig.ravel():
    ax.title.set_size(4)

In [None]:
# show the plot
from matplotlib import pyplot
pyplot.show()

# evaluate knn on the diabetes dataset with robust scaler transform

In [None]:
# load dataset
from pandas import read_csv
dataset = read_csv('pima-indians-diabetes.csv', header=None)
data = dataset.values

In [None]:
# separate into input and output columns
X, y = data[:, :-1], data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define the pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
trans = RobustScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])

In [None]:
# evaluate the pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report pipeline performance
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# explore the scaling range of the robust scaler transform

In [None]:
# silence all warnings raised from here on, unless warning options were
# explicitly supplied when the interpreter was started (sys.warnoptions)
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
# get the dataset
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
def get_dataset(filename='pima-indians-diabetes.csv'):
    """Load the diabetes CSV and split it into model inputs and labels.

    Args:
        filename: Path to a header-less CSV whose last column is the class
            label and whose remaining columns are the numeric inputs.

    Returns:
        Tuple ``(X, y)``: ``X`` holds every column but the last, cast to
        float32; ``y`` is the last column encoded as integer class labels.
    """
    # the file has no header row, so the first line must be read as data
    dataset = read_csv(filename, header=None)
    # work on the array loaded here rather than on a global left by another cell
    data = dataset.values
    # separate into input and output columns
    X, y = data[:, :-1], data[:, -1]
    # ensure inputs are floats and output is an integer label
    X = X.astype('float32')
    y = LabelEncoder().fit_transform(y.astype('str'))
    return X, y

In [None]:
# get a list of models to evaluate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
def get_models():
    """Build one scaler+KNN pipeline per candidate quantile range.

    Returns:
        Dict mapping the lower percentile (as a string) to a Pipeline that
        applies RobustScaler with quantile_range=(p, 100 - p) followed by a
        default KNeighborsClassifier.
    """
    lower_percentiles = (1, 5, 10, 15, 20, 25, 30)
    return {
        str(lower): Pipeline(steps=[
            ('t', RobustScaler(quantile_range=(lower, 100 - lower))),
            ('m', KNeighborsClassifier()),
        ])
        for lower in lower_percentiles
    }

In [None]:
# evaluate a given model using cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
def evaluate_model(model, X, y):
    """Score ``model`` on ``(X, y)`` with repeated stratified k-fold CV.

    Uses 10 splits repeated 3 times with a fixed random_state of 1, accuracy
    as the metric, and n_jobs=-1.

    Returns:
        The per-fold accuracy scores as returned by cross_val_score.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y, scoring='accuracy', cv=folds, n_jobs=-1
    )

In [None]:
# define dataset
X, y = get_dataset()

In [None]:
# get the models to evaluate
models = get_models()

In [None]:
# evaluate the models and store results
from numpy import mean
from numpy import std
# parallel lists: results[i] holds the cross-validation scores of the
# pipeline whose label is names[i] (consumed by the box plot below)
results, names = list(), list()
for name, model in models.items():
    # accuracy scores from every fold of every repeat
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # report the mean and standard deviation of the scores for this setting
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
# plot model performance for comparison
from matplotlib import pyplot
# one box per entry in results, labelled by names, with the mean marked
# NOTE(review): recent Matplotlib releases rename boxplot's `labels` keyword
# to `tick_labels` — confirm against the installed version
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()