## Scaling (Dataset - Image segmentation)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,accuracy_score

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [3]:
# Encoder used further down to turn the `Class` column into integer codes.
lbl = LabelEncoder()

In [4]:
# Load the dataset; the relative path is resolved against the working directory.
image = pd.read_csv('Image_Segmention.csv')

In [5]:
# Overwrite the `Class` column with the integer codes learned by the encoder.
encoded_class = lbl.fit_transform(image['Class'])
image['Class'] = encoded_class

In [6]:
# Separate the target column from the predictor columns.
y = image['Class']
X = image.drop(columns='Class')

In [7]:
# Stratified 70/30 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=23,
)

In [8]:
# Five shuffled, stratified folds with a fixed seed for cross-validation.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=23,
)

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_scl_train = scaler.transform(X_train)
X_scl_test = scaler.transform(X_test)

In [11]:
# Nearest-neighbour classifier that looks at a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1)

In [12]:
# Fit the classifier on the scaled training features.
knn.fit(X_scl_train,y_train)

In [13]:
# Predicted class labels for the scaled test split.
y_pred = knn.predict(X_scl_test)

In [14]:
# Share of test samples whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.8253968253968254


In [15]:
# Class-membership probabilities for the scaled test split; the columns follow
# the order of knn.classes_.
y_pred_prob = knn.predict_proba(X_scl_test)

# Pass the classifier's class order explicitly so the probability columns are
# matched to labels by construction instead of being inferred from y_test.
# NOTE: knn was built with n_neighbors=1, so each predicted probability is
# 0 or 1 and this log loss essentially restates the misclassification rate.
print(log_loss(y_test, y_pred_prob, labels=knn.classes_))

6.29333630603633


Here, accuracy improves and log loss decreases, which is good for model building. However, scaling does not need to be applied to every algorithm; it is recommended for algorithms that are sensitive to feature magnitudes, such as deep neural networks and support vector machines, as well as distance-based methods like the k-nearest-neighbours classifier used here.

# Same as above, but using a pipeline

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
# Fresh, unfitted estimators that will become the two pipeline steps.
knn = KNeighborsClassifier(n_neighbors=1)
scaler = StandardScaler()

In [18]:
# Chain scaling and classification; the step names 'SCL' and 'KNN' are the
# prefixes used when addressing step parameters.
pipe = Pipeline(
    steps=[
        ('SCL', scaler),
        ('KNN', knn),
    ]
)

In [19]:
# The pipeline scales internally, so the raw (unscaled) splits are passed in.
pipe.fit(X_train, y_train)

# Predict on the raw test split and display the resulting accuracy.
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.8253968253968254

In [20]:
# Class-membership probabilities from the fitted pipeline for the raw test split.
y_pred_prob = pipe.predict_proba(X_test)

# Pass the pipeline's class order explicitly so the probability columns are
# matched to labels by construction instead of being inferred from y_test.
log_loss(y_test, y_pred_prob, labels=pipe.classes_)

6.29333630603633

# grid search with pipeline

In [21]:
# Unfitted pipeline with a default-configured classifier; n_neighbors is left
# for the grid search to choose.
knn = KNeighborsClassifier()
scaler = StandardScaler()
pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)])

# Five shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

In [22]:
# List the pipeline's parameters; step parameters appear as
# '<step name>__<parameter>', the key format used by the search grid.
pipe.get_params()

{'memory': None,
 'steps': [('SCL', StandardScaler()), ('KNN', KNeighborsClassifier())],
 'verbose': False,
 'SCL': StandardScaler(),
 'KNN': KNeighborsClassifier(),
 'SCL__copy': True,
 'SCL__with_mean': True,
 'SCL__with_std': True,
 'KNN__algorithm': 'auto',
 'KNN__leaf_size': 30,
 'KNN__metric': 'minkowski',
 'KNN__metric_params': None,
 'KNN__n_jobs': None,
 'KNN__n_neighbors': 5,
 'KNN__p': 2,
 'KNN__weights': 'uniform'}

In [23]:
# Candidate neighbour counts 1 through 10 for the 'KNN' pipeline step.
params = dict(KNN__n_neighbors=np.arange(1, 11))

In [24]:
# Exhaustive search over the grid, scored by negated log loss on each fold.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)

In [25]:
# Run the search on the full dataset; the pipeline is fitted inside each fold.
gcv.fit(X,y)

In [26]:
# Parameter setting that achieved the best mean cross-validated score.
gcv.best_params_

{'KNN__n_neighbors': 10}

In [27]:
# Best mean cross-validated score; it is a negated log loss, so values closer
# to zero are better.
gcv.best_score_

-0.8809357429167448

# Grid search with pipeline and min-max scaler

In [28]:
from sklearn.preprocessing import MinMaxScaler

In [29]:
# Unfitted pipeline for the second search. The StandardScaler placed in the
# 'SCL' slot is only a starting value: the grid swaps that step between the
# candidate scalers.
knn = KNeighborsClassifier()
scaler = StandardScaler()
pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)])

# Five shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

In [30]:
# Search jointly over the neighbour count (1 through 10) and over which scaler
# occupies the 'SCL' step.
params = {
    'KNN__n_neighbors': np.arange(1, 11),
    'SCL': [StandardScaler(), MinMaxScaler()],
}

In [32]:
# Exhaustive search over the grid, scored by negated log loss on each fold,
# run on the full dataset.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

In [33]:
# Best combination of scaler and neighbour count found by the search.
gcv.best_params_

{'KNN__n_neighbors': 10, 'SCL': StandardScaler()}

In [34]:
# Best mean cross-validated score; it is a negated log loss, so values closer
# to zero are better.
gcv.best_score_

-0.8809357429167448