# demonstrate the types of features created

In [None]:
# build a small 3x2 integer array in which every row is [2, 3]
from numpy import asarray
data = asarray([[2, 3] for _ in range(3)])
print(data)

In [None]:
# perform a polynomial features transform of the dataset
# degree=2 requests polynomial terms up to the second power; fit_transform
# learns the feature layout from `data` and returns the expanded array, which
# then replaces the original `data`
from sklearn.preprocessing import PolynomialFeatures
trans = PolynomialFeatures(degree=2)
data = trans.fit_transform(data)
print(data)

# evaluate knn on the raw sonar dataset

In [None]:
# load dataset
# header=None: the first row of the file is read as data, not as column names
# 'sonar.csv' is a relative path, so it is resolved against the working directory
from pandas import read_csv
dataset = read_csv('sonar.csv', header=None)
# continue with the underlying array rather than the DataFrame
data = dataset.values

In [None]:
# split column-wise: every column except the last is an input,
# the last column is the output
X = data[:, :-1]
y = data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
# cast the labels to str first so the encoder receives a uniform string dtype
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define and configure the model
# no arguments are passed, so the classifier runs with the library defaults
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [None]:
# evaluate the model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# 10 stratified folds repeated 3 times; random_state=1 makes the splits repeatable
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# one accuracy value per split; n_jobs=-1 asks for all available cores
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report model performance
# printed as: mean accuracy, then the standard deviation in parentheses
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# visualize a polynomial features transform of the sonar dataset

In [None]:
# load dataset
# header=None: the first row of the file is read as data, not as column names
from pandas import read_csv
dataset = read_csv('sonar.csv', header=None)

In [None]:
# retrieve just the numeric input values
# the last column is dropped here, so only the input columns are transformed
data = dataset.values[:, :-1]

In [None]:
# perform a polynomial features transform of the dataset
# degree=3 requests polynomial terms up to the third power; the expanded
# array replaces `data`
from sklearn.preprocessing import PolynomialFeatures
trans = PolynomialFeatures(degree=3)
data = trans.fit_transform(data)

In [None]:
# convert the array back to a dataframe
# `dataset` is rebound: from here on it holds the transformed features
from pandas import DataFrame
dataset = DataFrame(data)

In [None]:
# summarize: print the (rows, columns) shape of the transformed dataframe
print(dataset.shape)

# evaluate knn on the sonar dataset with polynomial features transform

In [None]:
# load dataset
# header=None: the first row of the file is read as data, not as column names
# 'sonar.csv' is a relative path, so it is resolved against the working directory
from pandas import read_csv
dataset = read_csv('sonar.csv', header=None)
# continue with the underlying array rather than the DataFrame
data = dataset.values

In [None]:
# split column-wise: every column except the last is an input,
# the last column is the output
X = data[:, :-1]
y = data[:, -1]

In [None]:
# ensure inputs are floats and output is an integer label
from sklearn.preprocessing import LabelEncoder
X = X.astype('float32')
# cast the labels to str first so the encoder receives a uniform string dtype
y = LabelEncoder().fit_transform(y.astype('str'))

In [None]:
# define the pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
trans = PolynomialFeatures(degree=3)
model = KNeighborsClassifier()
# step 't' (transform) runs before step 'm' (model); evaluating the pipeline
# as a whole keeps the transform inside each cross-validation split
pipeline = Pipeline(steps=[('t', trans), ('m', model)])

In [None]:
# evaluate the pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# 10 stratified folds repeated 3 times; random_state=1 makes the splits repeatable
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# one accuracy value per split; n_jobs=-1 asks for all available cores
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# report pipeline performance
# printed as: mean accuracy, then the standard deviation in parentheses
from numpy import mean
from numpy import std
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# compare the effect of the degree on the number of created features

In [None]:
# get the dataset
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
def get_dataset(url="https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"):
	"""Load the sonar CSV and return it as model-ready arrays.

	Args:
		url: Location of the headerless CSV, passed straight to ``read_csv``
			(so a local path works as well as a URL). Defaults to the hosted
			copy of sonar.csv, so existing ``get_dataset()`` calls behave
			exactly as before.

	Returns:
		A tuple ``(X, y)``: ``X`` holds every column except the last, cast to
		float32; ``y`` holds the last column encoded as integer labels.
	"""
	# header=None: the first row of the file is data, not column names
	data = read_csv(url, header=None).values
	# separate into input and output columns
	X, y = data[:, :-1], data[:, -1]
	# ensure inputs are floats and output is an integer label
	X = X.astype('float32')
	# cast to str first so the encoder receives a uniform string dtype
	y = LabelEncoder().fit_transform(y.astype('str'))
	return X, y

In [None]:
# define dataset: float inputs X and integer-encoded labels y from get_dataset()
X, y = get_dataset()

In [None]:
# calculate change in number of features
from sklearn.preprocessing import PolynomialFeatures
num_features = []
# NOTE: the misspelled name `degress` is kept because the plotting cell reads it
degress = list(range(1, 6))
for d in degress:
	# fit only: the fitted transform already knows how many output columns it
	# will produce, so the expanded array (which can be very large at high
	# degrees) never has to be materialised just to count its columns
	trans = PolynomialFeatures(degree=d).fit(X)
	# record number of features
	num_features.append(trans.n_output_features_)
	# summarize
	print('Degree: %d, Features: %d' % (d, num_features[-1]))

In [None]:
# plot degree vs number of features
# x axis: polynomial degree, y axis: feature count recorded for that degree
from matplotlib import pyplot
pyplot.plot(degress, num_features)
pyplot.show()

# explore the effect of degree on accuracy for the polynomial features transform

In [None]:
# get the dataset
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
def get_dataset(url="https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"):
	"""Load the sonar CSV and return it as model-ready arrays.

	Args:
		url: Location of the headerless CSV, passed straight to ``read_csv``
			(so a local path works as well as a URL). Defaults to the hosted
			copy of sonar.csv, so existing ``get_dataset()`` calls behave
			exactly as before.

	Returns:
		A tuple ``(X, y)``: ``X`` holds every column except the last, cast to
		float32; ``y`` holds the last column encoded as integer labels.
	"""
	# header=None: the first row of the file is data, not column names
	data = read_csv(url, header=None).values
	# separate into input and output columns
	X, y = data[:, :-1], data[:, -1]
	# ensure inputs are floats and output is an integer label
	X = X.astype('float32')
	# cast to str first so the encoder receives a uniform string dtype
	y = LabelEncoder().fit_transform(y.astype('str'))
	return X, y

In [None]:
# get a list of models to evaluate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def get_models(degrees=range(1, 5)):
	"""Build one KNN pipeline per polynomial degree.

	Args:
		degrees: Iterable of integer degrees to build pipelines for. Defaults
			to 1 through 4, the range that was previously hard-coded, so
			existing ``get_models()`` calls are unaffected.

	Returns:
		A dict mapping ``str(degree)`` to a Pipeline made of a
		PolynomialFeatures transform (step 't') followed by a
		KNeighborsClassifier (step 'm'), in the order the degrees were given.
	"""
	return {
		str(d): Pipeline(steps=[
			('t', PolynomialFeatures(degree=d)),
			('m', KNeighborsClassifier()),
		])
		for d in degrees
	}

In [None]:
# evaluate a given model using cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
def evaluate_model(model, X, y):
	"""Score `model` on (X, y) with repeated stratified k-fold cross-validation.

	Uses 10 splits repeated 3 times with a fixed random_state of 1 and returns
	the accuracy scores produced by cross_val_score.
	"""
	splitter = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
	return cross_val_score(
		model,
		X,
		y,
		cv=splitter,
		scoring='accuracy',
		n_jobs=-1,
		# raise on a failing fit instead of recording a placeholder score
		error_score='raise',
	)

In [None]:
# define dataset: float inputs X and integer-encoded labels y from get_dataset()
X, y = get_dataset()

In [None]:
# get the models to evaluate: a dict of pipelines keyed by degree (as a string)
models = get_models()

In [None]:
# score every pipeline, keeping the per-split scores and the matching
# names in two parallel lists for the comparison plot
from numpy import mean
from numpy import std
results, names = [], []
for name, model in models.items():
	scores = evaluate_model(model, X, y)
	names.append(name)
	results.append(scores)
	# progress line: name, mean accuracy, standard deviation
	print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
# plot model performance for comparison
# one box per entry in `results`, labelled with the matching entry in `names`;
# showmeans=True also marks the mean of each score distribution
# NOTE(review): newer Matplotlib releases (3.9+) rename boxplot's `labels`
# argument to `tick_labels` and deprecate the old name -- confirm against the
# Matplotlib version in use before changing it
from matplotlib import pyplot
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()