# **Intelligent Systems for Bioinformatics**
### Exercise 6: `train_test_split` vs `stratified_train_test_split` on the `iris.csv` dataset

In [8]:
import numpy as np
import pandas as pd
from si.io.csv_file import read_csv
from si.model_selection.split import train_test_split, stratified_train_test_split

# Load the iris CSV into a dataset object; the class labels are read back
# from `dataset.y` below.
dataset = read_csv(
    "../datasets/iris/iris.csv",
    sep=",",
    features=True,
    label=True,
)

# Count the samples of every class in the full dataset. This is the baseline
# the train/test partitions are compared against.
labels, counts = np.unique(dataset.y, return_counts=True)
n_total = len(dataset.y)

print("--- Original distribution ---")
for class_name, n_class in zip(labels, counts):
    print(f"Class {class_name}: {n_class} samples ({n_class / n_total:.2%})")

--- Original distribution ---
Class Iris-setosa: 50 samples (33.33%)
Class Iris-versicolor: 50 samples (33.33%)
Class Iris-virginica: 50 samples (33.33%)


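**train_test_split**

A purely random split: the class proportions in the train and test partitions are close to, but not exactly, those of the original dataset (see the output below).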

In [9]:
# Random hold-out split with test_size=0.2 (80% train / 20% test);
# random_state is fixed so the partition is reproducible.
train_random, test_random = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

print(f"Train shape: {train_random.shape()}")
print(f"Test shape:  {test_random.shape()}")

# Class balance of the training partition
print("\n--- Train distribution ---")
labels_r_train, counts_r_train = np.unique(train_random.y, return_counts=True)
n_train_random = len(train_random.y)
for class_name, n_class in zip(labels_r_train, counts_r_train):
    print(f"Class {class_name}: {n_class} samples ({n_class / n_train_random:.2%})")

# Class balance of the test partition
print("\n--- Test distribution ---")
labels_r_test, counts_r_test = np.unique(test_random.y, return_counts=True)
n_test_random = len(test_random.y)
for class_name, n_class in zip(labels_r_test, counts_r_test):
    print(f"Class {class_name}: {n_class} samples ({n_class / n_test_random:.2%})")

Train shape: (120, 4)
Test shape:  (30, 4)

--- Train distribution ---
Class Iris-setosa: 40 samples (33.33%)
Class Iris-versicolor: 41 samples (34.17%)
Class Iris-virginica: 39 samples (32.50%)

--- Test distribution ---
Class Iris-setosa: 10 samples (33.33%)
Class Iris-versicolor: 9 samples (30.00%)
Class Iris-virginica: 11 samples (36.67%)


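**stratified_train_test_split**

A stratified split: each class keeps its original proportion in both the train and test partitions (see the output below).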

In [10]:
# Stratified hold-out split with test_size=0.2 (80% train / 20% test);
# random_state is fixed so the partition is reproducible.
train_strat, test_strat = stratified_train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

print(f"Train shape: {train_strat.shape()}")
print(f"Test shape:  {test_strat.shape()}")

# Class balance of the training partition
print("\n--- Train distribution ---")
labels_s_train, counts_s_train = np.unique(train_strat.y, return_counts=True)
n_train_strat = len(train_strat.y)
for class_name, n_class in zip(labels_s_train, counts_s_train):
    print(f"Class {class_name}: {n_class} samples ({n_class / n_train_strat:.2%})")

# Class balance of the test partition
print("\n--- Test distribution ---")
labels_s_test, counts_s_test = np.unique(test_strat.y, return_counts=True)
n_test_strat = len(test_strat.y)
for class_name, n_class in zip(labels_s_test, counts_s_test):
    print(f"Class {class_name}: {n_class} samples ({n_class / n_test_strat:.2%})")

Train shape: (120, 4)
Test shape:  (30, 4)

--- Train distribution ---
Class Iris-setosa: 40 samples (33.33%)
Class Iris-versicolor: 40 samples (33.33%)
Class Iris-virginica: 40 samples (33.33%)

--- Test distribution ---
Class Iris-setosa: 10 samples (33.33%)
Class Iris-versicolor: 10 samples (33.33%)
Class Iris-virginica: 10 samples (33.33%)
