# Train Test Split

Libraries:

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split

Data Import:

In [38]:
# Read the raw data set from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="data.csv")
data.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


## Data Wrangling

Tidy *Drug* column.

In [39]:
# Strip the "Drug"/"drug" prefix so only the drug letter remains (e.g. "DrugY" -> "Y").
data["Drug"] = data["Drug"].replace(to_replace=r"[Dd]rug", value="", regex=True)

In [40]:
# Display the frame to confirm the Drug column now holds only the drug letter.
data

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,Y
1,47,M,LOW,HIGH,13.093,C
2,47,M,LOW,HIGH,10.114,C
3,28,F,NORMAL,HIGH,7.798,X
4,61,F,LOW,HIGH,18.043,Y
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,C
196,16,M,LOW,HIGH,12.006,C
197,52,M,NORMAL,HIGH,9.894,X
198,23,M,NORMAL,NORMAL,14.020,X


### Map ordinal features

The features *BP* and *Cholesterol* are ordered categorical variables. For our algorithms to work, these must be encoded into integers.

In [41]:
# Ordinal codes follow the position of each level in its ordered tuple.
# BP codes start at 0, Cholesterol codes start at 1.
bp_mapping = {level: code for code, level in enumerate(("LOW", "NORMAL", "HIGH"))}
cholesterol_mapping = {
    level: code for code, level in enumerate(("NORMAL", "HIGH"), start=1)
}

In [42]:
# Build a new frame with BP and Cholesterol replaced by their integer codes;
# `data` itself is left untouched. Values missing from a mapping become NaN.
data_encoded = data.assign(
    BP=data["BP"].map(bp_mapping),
    Cholesterol=data["Cholesterol"].map(cholesterol_mapping),
)

In [43]:
# Display the frame to confirm BP and Cholesterol are now integer codes.
data_encoded

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,2,2,25.355,Y
1,47,M,0,2,13.093,C
2,47,M,0,2,10.114,C
3,28,F,1,2,7.798,X
4,61,F,0,2,18.043,Y
...,...,...,...,...,...,...
195,56,F,0,2,11.567,C
196,16,M,0,2,12.006,C
197,52,M,1,2,9.894,X
198,23,M,1,1,14.020,X


### One-hot encoding of nominal features

As logistic regression can't take multiple target variables, I will look at two cases. The first case doesn't encode the target, whereas the second does.

In [44]:
# One-hot encode Sex; dropping the first level leaves a single Sex_M indicator.
sex_dummied = pd.get_dummies(data_encoded, columns=["Sex"], drop_first=True)

# Keep the model features plus the (still un-encoded) Drug target.
# NOTE(review): BP was ordinal-encoded above but is not selected here, so it is
# dropped from every downstream frame and export — confirm this is intended.
feature_and_target_cols = ["Age", "Sex_M", "Cholesterol", "Na_to_K", "Drug"]
data = sex_dummied[feature_and_target_cols]
data

Unnamed: 0,Age,Sex_M,Cholesterol,Na_to_K,Drug
0,23,0,2,25.355,Y
1,47,1,2,13.093,C
2,47,1,2,10.114,C
3,28,0,2,7.798,X
4,61,0,2,18.043,Y
...,...,...,...,...,...
195,56,0,2,11.567,C
196,16,1,2,12.006,C
197,52,1,2,9.894,X
198,23,1,1,14.020,X


The features *Sex* and *Drug* are nominal features (they can't be ordered), thus they are one-hot encoded. For *Sex*, the first level is dropped to avoid a redundant, perfectly correlated column: the remaining column Sex_M is 1 if the patient is male and 0 if the patient is female. The target *Drug* is encoded below without dropping a level, so each drug (A, B, C, X, Y) gets its own indicator column and exactly one of them is 1 in every row.

In [45]:
# One-hot encode the remaining categorical column (the Drug target).
# No level is dropped, so every drug gets its own Drug_* indicator column.
data_encoded = pd.get_dummies(data)

In [46]:
# Display the frame with the one-hot encoded Drug_* target columns.
data_encoded

Unnamed: 0,Age,Sex_M,Cholesterol,Na_to_K,Drug_A,Drug_B,Drug_C,Drug_X,Drug_Y
0,23,0,2,25.355,0,0,0,0,1
1,47,1,2,13.093,0,0,1,0,0
2,47,1,2,10.114,0,0,1,0,0
3,28,0,2,7.798,0,0,0,1,0
4,61,0,2,18.043,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
195,56,0,2,11.567,0,0,1,0,0
196,16,1,2,12.006,0,0,1,0,0
197,52,1,2,9.894,0,0,0,1,0
198,23,1,1,14.020,0,0,0,1,0


## Train test split

Before we start, check the installed version of scikit-learn, so that the seed yields the same results. Here I am using version 1.2.2.

In [47]:
!conda list scikit-learn

# packages in environment at /Users/Alexander/opt/anaconda3:
#
# Name                    Version                   Build  Channel
scikit-learn              1.2.2                    pypi_0    pypi


In [48]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /Users/Alexander/opt/anaconda3/lib/python3.9/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: 


### Not encoded target

Perform train test split with sklearn.

In [55]:
# Stratify on the Drug target so each drug keeps its share in both splits.
# The column is referenced directly because no `y` variable is defined in this
# notebook; relying on one fails on a fresh kernel (Restart & Run All).
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=10,
    stratify=data["Drug"],
)

Check if it worked.

In [56]:
# Display the training split (target not encoded).
train

Unnamed: 0,Age,Sex_M,Cholesterol,Na_to_K,Drug
57,40,1,2,27.826,Y
67,17,1,1,10.832,X
69,18,0,1,24.276,Y
35,46,1,1,7.285,X
8,60,1,2,15.171,Y
...,...,...,...,...,...
54,68,0,1,10.189,B
106,22,1,2,11.953,X
154,37,1,1,16.724,Y
159,34,0,1,12.923,X


In [57]:
# Display the test split (target not encoded).
test

Unnamed: 0,Age,Sex_M,Cholesterol,Na_to_K,Drug
66,29,1,2,12.856,A
48,23,1,2,31.686,Y
30,18,0,1,8.75,X
62,67,1,1,20.693,Y
39,15,1,2,9.084,X
108,72,1,1,9.677,B
28,39,0,1,22.697,Y
68,54,1,2,24.658,Y
171,45,1,1,10.017,X
127,35,1,1,7.845,X


### Encoded target

Perform train test split with sklearn.

In [58]:
# `data_encoded` was built from `data` with get_dummies, so both frames hold the
# same rows in the same order. The un-encoded Drug labels can therefore be used
# to stratify; no `y` variable is defined in this notebook.
train_encoded, test_encoded = train_test_split(
    data_encoded,
    test_size=0.2,
    random_state=10,
    stratify=data["Drug"],
)

Check if it worked.

In [59]:
# Display the training split with the one-hot encoded target.
train_encoded

Unnamed: 0,Age,Sex_M,Cholesterol,Na_to_K,Drug_A,Drug_B,Drug_C,Drug_X,Drug_Y
57,40,1,2,27.826,0,0,0,0,1
67,17,1,1,10.832,0,0,0,1,0
69,18,0,1,24.276,0,0,0,0,1
35,46,1,1,7.285,0,0,0,1,0
8,60,1,2,15.171,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
54,68,0,1,10.189,0,1,0,0,0
106,22,1,2,11.953,0,0,0,1,0
154,37,1,1,16.724,0,0,0,0,1
159,34,0,1,12.923,0,0,0,1,0


In [60]:
# Display the test split with the one-hot encoded target.
test_encoded

Unnamed: 0,Age,Sex_M,Cholesterol,Na_to_K,Drug_A,Drug_B,Drug_C,Drug_X,Drug_Y
66,29,1,2,12.856,1,0,0,0,0
48,23,1,2,31.686,0,0,0,0,1
30,18,0,1,8.75,0,0,0,1,0
62,67,1,1,20.693,0,0,0,0,1
39,15,1,2,9.084,0,0,0,1,0
108,72,1,1,9.677,0,1,0,0,0
28,39,0,1,22.697,0,0,0,0,1
68,54,1,2,24.658,0,0,0,0,1
171,45,1,1,10.017,0,0,0,1,0
127,35,1,1,7.845,0,0,0,1,0


## Export

Export data to csv.

In [61]:
# Write the un-encoded splits to CSV; the row index is not written.
for split_frame, csv_name in ((train, "train.csv"), (test, "test.csv")):
    split_frame.to_csv(csv_name, index=False)

In [62]:
# Write the target-encoded splits to CSV; the row index is not written.
encoded_exports = (
    (train_encoded, "train_encoded.csv"),
    (test_encoded, "test_encoded.csv"),
)
for split_frame, csv_name in encoded_exports:
    split_frame.to_csv(csv_name, index=False)