# test classification dataset

In [None]:
# generate a synthetic classification dataset:
# 1000 samples, 20 features (15 informative, 5 redundant)
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,  # fixed seed so every run generates the same data
)

In [None]:
# summarize the dataset: print the shape of the feature matrix X
# followed by the shape of the label vector y
print(X.shape, y.shape)

# naive approach: normalize the entire dataset before splitting it into train and test sets and evaluating the model

In [None]:
# generate a synthetic classification dataset:
# 1000 samples, 20 features (15 informative, 5 redundant)
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,  # fixed seed so every run generates the same data
)

In [None]:
# normalize the dataset with min-max scaling
# (MinMaxScaler normalizes; it does not standardize)
# NOTE: the scaler is fit on the full dataset here, before the train/test
# split, so rows that later end up in the test set influence the scaling.
# This is the deliberately naive step of this section.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
# hold out 33% of the rows as a test set; the seed fixes the split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# train a logistic regression model (default settings) on the training split
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# make predictions for the held-out test rows with the fitted model
yhat = model.predict(X_test)

In [None]:
# compare the predictions with the true test labels and
# report the accuracy as a percentage
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.3f' % (accuracy * 100))

# correct approach: split the data first, then fit the scaler on the training set only and apply it to both sets before evaluating the model

In [None]:
# generate a synthetic classification dataset:
# 1000 samples, 20 features (15 informative, 5 redundant)
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,  # fixed seed so every run generates the same data
)

In [None]:
# split first, on the raw (unscaled) data: hold out 33% of the rows as a
# test set; the seed fixes the split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# create the min-max scaler; it is only constructed here and
# has not been fit to any data yet
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# fit the scaler on the training split only, so the test rows
# play no part in determining the scaling
scaler.fit(X_train)

In [None]:
# apply the already-fitted scaler to the training split and then to the
# test split; both are transformed with the same fitted scaler
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# train a logistic regression model (default settings) on the scaled
# training split
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# make predictions for the held-out test rows with the fitted model
yhat = model.predict(X_test)

In [None]:
# compare the predictions with the true test labels and
# report the accuracy as a percentage
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.3f' % (accuracy * 100))

# naive data preparation for model evaluation with k-fold cross-validation

In [None]:
# generate a synthetic classification dataset:
# 1000 samples, 20 features (15 informative, 5 redundant)
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,  # fixed seed so every run generates the same data
)

In [None]:
# normalize the dataset with min-max scaling
# (MinMaxScaler normalizes; it does not standardize)
# NOTE: the scaler is fit on the full dataset here, before cross-validation,
# so the rows held out in each fold have already influenced the scaling.
# This is the deliberately naive step of this section.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
# define the model: logistic regression with default settings; it is only
# constructed here and is handed to cross_val_score for fitting later
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [None]:
# evaluation procedure: stratified 10-fold cross-validation repeated
# 3 times, with a fixed seed so the folds are reproducible
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [None]:
# score the model with cross-validation on the pre-scaled data,
# collecting one accuracy value per split
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # run the splits in parallel on all available processors
)

In [None]:
# report the mean and standard deviation of the per-split accuracies,
# both expressed as percentages
from numpy import mean, std

print('Accuracy: %.3f (%.3f)' % (mean(scores) * 100, std(scores) * 100))

# correct data preparation for model evaluation with k-fold cross-validation

In [None]:
# generate a synthetic classification dataset:
# 1000 samples, 20 features (15 informative, 5 redundant)
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,  # fixed seed so every run generates the same data
)

In [None]:
# define the pipeline: min-max scaling followed by logistic regression.
# Bundling the scaler with the model lets the two be fit together as a
# single estimator.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# the (name, estimator) steps run in the order listed; written as a list
# literal rather than list() followed by repeated append calls
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=steps)

In [None]:
# evaluation procedure: stratified 10-fold cross-validation repeated
# 3 times, with a fixed seed so the folds are reproducible
from sklearn.model_selection import RepeatedStratifiedKFold

cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [None]:
# score the whole pipeline (scaler + model) with cross-validation on the
# raw data; because the pipeline is the estimator, the scaler is fit
# within each split rather than on the full dataset
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,  # run the splits in parallel on all available processors
)

In [None]:
# report the mean and standard deviation of the per-split accuracies,
# both expressed as percentages
from numpy import mean, std

print('Accuracy: %.3f (%.3f)' % (mean(scores) * 100, std(scores) * 100))