In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

After going through feature engineering and making the data delicious and ready to feed to an algorithm, how we divide
the data into train and test sets also matters a lot. This directly affects the training process, and a poor split will hamper the predictive
results. In this notebook, I have tested three types of splitting techniques to divide the dataset for training and testing.

In [2]:
# Location of the raw CSV, kept under one name so the path is easy to find and change.
DATA_PATH = 'D:/datasets/cancer_dataset.csv'

# Read the file into a DataFrame and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# (rows, columns) of the frame as loaded, before any column is dropped.
data.shape

(569, 33)

In [4]:
# Per-column count of missing values.
data.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [5]:
# Remove the 'Unnamed: 32' and 'id' columns.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

Dropping the 'Unnamed: 32' column because all of its values are missing, and the 'id' column because it carries no predictive significance for now.

In [7]:
# Shape again, now that 'Unnamed: 32' and 'id' have been dropped.
data.shape

(569, 31)

Dividing the data into features (X) and target (y).

In [8]:
# Features: every column except the 'diagnosis' target.
# Selecting by name rather than by position keeps X correct even if the
# column order of `data` changes.
X = data.drop(columns='diagnosis')
X.head(3)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [9]:
# Target: the 'diagnosis' column, picked by name so the selection does not
# depend on it being the first column of `data`.
y = data['diagnosis']
y.head()

0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: object

In [10]:
# Count how many rows carry each diagnosis label.
y.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

As the target column is in categorical form, I have converted it into the numerical form using LabelEncoder.

In [11]:
# Encode the string labels as integers. LabelEncoder numbers the classes in
# sorted order (inspect le.classes_ to see the mapping).
le = LabelEncoder().fit(y)
y_transformed = le.transform(y)

# Peek at a slice of the encoded target.
y_transformed[20:30]

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1])

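All the features were on different scales, so I used StandardScaler. I did this without much research for now :D Note that the scaler here is fitted on the full dataset before splitting, which lets statistics from the test rows leak into training; fitting it on the training portion only (for example inside a Pipeline) is the safer practice.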

In [12]:
# Standardise each feature column with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split, so rows later used for testing contribute to the
# scaling statistics.
ss = StandardScaler().fit(X)
X_transformed = ss.transform(X)

# First scaled row, as a quick sanity check.
X_transformed[0]

array([ 1.09706398, -2.07333501,  1.26993369,  0.9843749 ,  1.56846633,
        3.28351467,  2.65287398,  2.53247522,  2.21751501,  2.25574689,
        2.48973393, -0.56526506,  2.83303087,  2.48757756, -0.21400165,
        1.31686157,  0.72402616,  0.66081994,  1.14875667,  0.90708308,
        1.88668963, -1.35929347,  2.30360062,  2.00123749,  1.30768627,
        2.61666502,  2.10952635,  2.29607613,  2.75062224,  1.93701461])

In [13]:
# Holdout validation (train_test_split): one random split, with 10% of the
# rows set aside for testing and a fixed seed so the split is repeatable.
HOLDOUT_FRACTION = 0.1
HOLDOUT_SEED = 2

xtrain, xtest, ytrain, ytest = train_test_split(
    X_transformed,
    y_transformed,
    test_size=HOLDOUT_FRACTION,
    random_state=HOLDOUT_SEED,
)
print(xtrain.shape, xtest.shape)

(512, 30) (57, 30)


10% of the dataset is separated out as the test set; after training the model on the remaining 90%, we check how well the model has
learned from the training data.
Disadvantages of train_test_split are: <br>
1) If we have a small dataset, we won't have enough data to train the model. <br>
2) It has only one iteration, so we will not know exactly how the model will perform on unseen data. <br>
3) If we change the random_state value in each run, the accuracy/performance will be different, which makes the analysis unstable.

In [14]:
# Fit logistic regression on the training part of the holdout split.
lr_model = LogisticRegression().fit(xtrain, ytrain)

# Score the predictions for the held-out rows.
pred = lr_model.predict(xtest)
acc_test = accuracy_score(y_true=ytest, y_pred=pred)
print(f'accuracy using train_test_split:{acc_test}')

accuracy using train_test_split:0.9649122807017544


In [15]:
# 10-fold cross-validation with plain KFold.
# The scaler is placed in a pipeline with the classifier so that, in every
# fold, it is fitted on that fold's training rows only; passing the
# pre-scaled X_transformed would let held-out rows influence the scaling.
# Because scaling now happens per fold, scores can differ slightly from a
# run that scaled the whole dataset up front.
k_val = KFold(n_splits=10)
cv_model = make_pipeline(StandardScaler(), LogisticRegression())
cross_val = cross_val_score(cv_model, X, y_transformed, cv=k_val)
cross_val

array([0.98245614, 0.96491228, 0.98245614, 0.94736842, 0.96491228,
       0.98245614, 0.98245614, 0.98245614, 1.        , 0.98214286])

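KFold cross-validation gives us multiple train/test splits instead of just one. <br>
1) This helps when we have only a small amount of data, because every row is used for both training and testing. <br>
2) We get a range of accuracies across the folds (from the lowest to the highest), which shows how much the model's performance varies on this dataset. <br>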

In [16]:
# Average of the per-fold scores returned by cross_val_score above.
cross_val.mean()

0.9771616541353383

StratifiedKFold cross-validation works best when the dataset is imbalanced.
StratifiedKFold addresses one issue with plain KFold cross-validation. With KFold, cross_val_score trains and tests the model k times, dividing the dataset into a different train and test set for each iteration. Because those divisions ignore the class labels, a fold can end up with class proportions very different from those of the full dataset. In the extreme case, the train set may not contain a single sample of a class that is present in the test set, which can result in wrong predictions. This is solved by dividing the data so that every fold keeps roughly the same class ratio/proportion as the whole dataset, which is what the StratifiedKFold validation technique does.<br>
https://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation <br>
https://stats.stackexchange.com/questions/49540/understanding-stratified-cross-validation#:~:text=A%20quick%20and,fold%20Cross%20Validation%3A

Though the dataset is only mildly imbalanced (357 B vs. 212 M),
I used this just to demonstrate how to use it.

In [17]:
# 10-fold stratified cross-validation.
# The model is a scaler + classifier pipeline evaluated on the unscaled X,
# so the scaler is re-fitted on each fold's training rows only; using the
# pre-scaled X_transformed would let held-out rows influence the scaling.
# Because scaling now happens per fold, scores can differ slightly from a
# run that scaled the whole dataset up front.
skfold = StratifiedKFold(n_splits=10)
model = make_pipeline(StandardScaler(), LogisticRegression())
c_v = cross_val_score(model, X, y_transformed, cv=skfold)
c_v


array([0.98245614, 0.98245614, 0.98245614, 0.96491228, 0.98245614,
       0.98245614, 0.94736842, 1.        , 1.        , 0.98214286])

In [18]:
# Average of the per-fold scores from the stratified run above.
c_v.mean()

0.9806704260651629