# Naive approach: normalizing the data before splitting it and evaluating the model

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## define dataset

In [2]:
# Preview only: the returned tuple of two arrays is displayed but not bound to any name.
make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)

(array([[ 0.2929949 , -4.21223056, -1.288332  , ..., -4.43170633,
         -2.82646737,  0.44916808],
        [-0.06839901,  5.51884147, 11.2389773 , ..., -3.08994781,
          1.19029898,  1.62025622],
        [ 0.73161622, -0.68468633, -0.98174194, ...,  5.65429655,
         -0.64659866, -3.15652999],
        ...,
        [ 0.81230832,  0.29333773,  3.55727154, ...,  7.52278375,
         -4.50067701, -1.92525878],
        [ 2.62760166, -1.9607565 , -7.1050466 , ...,  0.02433393,
         -0.77573778,  4.04660465],
        [-0.97292653,  0.76166769,  3.98307684, ...,  0.85864477,
          2.406057  ,  2.33338943]]),
 array([1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
        0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
   

In [3]:
# Generate the synthetic classification dataset: 1000 samples with 20 features
# (15 informative, 5 redundant). The fixed seed makes the data repeatable.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

In [4]:
# Inspect the raw feature row at index 999 (the last of the 1000 samples) before any scaling.
X[999]

array([-0.97292653,  0.76166769,  3.98307684,  2.88730396, -0.18573542,
        1.14466856, -0.3314352 , -2.83137486,  1.68227882,  1.84135798,
        0.08549488, -1.33610226,  0.81205043,  1.85272268,  0.44333436,
       -0.99440135, -2.91979512,  0.85864477,  2.406057  ,  2.33338943])

In [5]:
# Display the label array produced alongside X.
y

array([1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,

## normalize the dataset

In [6]:
# Naive step: the scaler learns its per-feature range from the FULL dataset,
# i.e. before any train/test split has been made, and X is then replaced by
# its scaled version.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [7]:
# Inspect the same row (index 999) again, now that X has been replaced by the scaler's output.
X[999]

array([0.38291588, 0.4668585 , 0.57016463, 0.76355631, 0.59782353,
       0.56896985, 0.48785869, 0.42733993, 0.62324747, 0.66733958,
       0.47264676, 0.38000391, 0.5246097 , 0.53736985, 0.48444447,
       0.45088788, 0.34607813, 0.46162311, 0.71607991, 0.65700275])

## split into train and test sets

In [8]:
# Split the already-scaled data, holding out 33% for testing; the fixed seed
# keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [9]:
# Sanity check: dimensions of the training feature matrix.
X_train.shape

(670, 20)

In [10]:
# Sanity check: dimensions of the test feature matrix.
X_test.shape

(330, 20)

## fit the model

In [11]:
# Train a logistic-regression classifier with default settings on the training split.
# The bare .fit(...) call is kept as the final expression so the fitted estimator is displayed.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## make predictions on the test set

In [12]:
# Predict class labels for the held-out test features.
yhat = model.predict(X_test)

In [13]:
# Sanity check: dimensions of the prediction array.
yhat.shape

(330,)

## evaluate predictions

In [14]:
# Score the predictions against the held-out labels and report the result as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Accuracy: %.3f' % (100 * accuracy))

Accuracy: 84.848


# Correct approach: normalizing the data after it is split and before the model is evaluated

## define dataset

In [15]:
# Regenerate the dataset with the same parameters and seed as the first section,
# bound to separate names so the two approaches do not share state.
X2, y2 = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

## split into train and test sets

In [16]:
# Split the raw (still unscaled) data first, holding out 33% for testing;
# the fixed seed keeps the partition repeatable.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.33,
    random_state=1,
)

## define the scaler

In [17]:
# Create a new, unfitted scaler; this rebinds the name `scaler` used in the first section.
scaler = MinMaxScaler()

## fit on the training dataset

In [18]:
# Fit the scaler on the training split only, so the test split plays no part in the fit.
scaler.fit(X2_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

## scale the training dataset

In [19]:
# Replace the training features with their scaled version, using the scaler fitted on the training split.
X2_train = scaler.transform(X2_train)

## scale the test dataset

In [20]:
# Apply the same training-fitted scaler to the test features (transform only; the scaler is not refitted).
X2_test = scaler.transform(X2_test)

## fit the model

In [21]:
# Train a fresh logistic-regression classifier with default settings on the scaled training split.
# This rebinds `model`; the bare .fit(...) call stays last so the fitted estimator is displayed.
model = LogisticRegression()
model.fit(X=X2_train, y=y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## make predictions on the test set

In [22]:
# Predict class labels for the scaled held-out test features.
yhat2 = model.predict(X2_test)

## evaluate predictions

In [23]:
# Score the predictions against the held-out labels and report the result as a percentage.
accuracy2 = accuracy_score(y_true=y2_test, y_pred=yhat2)
print('Accuracy: %.3f' % (100 * accuracy2))

Accuracy: 85.455
