### Sampling visualization

In [2]:
import pandas as pd
import numpy as np


In [5]:
# Fix NumPy's global RNG so the synthetic dataset is identical on every run of this cell.
np.random.seed(42)

n_rows = 1000
category_labels = ['A', 'B', 'C']
category_probs = [0.7, 0.2, 0.1]  # deliberately imbalanced classes

# Draw order matters for reproducibility: Feature1 is drawn first, then Category.
feature_values = np.random.randn(n_rows)
category_values = np.random.choice(category_labels, size=n_rows, p=category_probs)

# Assemble the dataset: a 1-based ID, one numerical feature and a class label per row.
data = pd.DataFrame({
    'ID': range(1, n_rows + 1),
    'Feature1': feature_values,
    'Category': category_values,
})

# Absolute class counts, to confirm the imbalance of the full dataset.
print(data['Category'].value_counts())

Category
A    699
B    209
C     92
Name: count, dtype: int64


In [6]:
# Simple random sampling: draw a fixed number of rows from `data`,
# with a fixed random_state so the same rows are selected on every run.
sample_size = 100
random_sample = data.sample(n=sample_size, random_state=42)

# Share of each category within the sample (proportions rather than counts).
random_category_shares = random_sample['Category'].value_counts(normalize=True)
print(random_category_shares)

Category
A    0.75
B    0.15
C    0.10
Name: proportion, dtype: float64


In [7]:
# Stratified sampling: reuse scikit-learn's train/test splitter as a sampler.

from sklearn.model_selection import train_test_split

# Stratifying on Category preserves the class distribution of `data` in the split.
# Only the 10% "test" part is kept as the sample; the remaining 90% is discarded.
sample_fraction = 0.1
_, stratified_sample = train_test_split(
    data,
    stratify=data['Category'],
    test_size=sample_fraction,
    random_state=42,
)

# Share of each category within the stratified sample.
stratified_category_shares = stratified_sample['Category'].value_counts(normalize=True)
print(stratified_category_shares)


Category
A    0.70
B    0.21
C    0.09
Name: proportion, dtype: float64


In [8]:
# Systematic sampling: choose a random starting row, then take every k-th row after it.

k = 10  # sampling interval: keep every 10th entry

# The starting offset is drawn from NumPy's global RNG, so its value depends on the
# seed and on every draw made before this cell runs. The upper bound is exclusive,
# so the offset lies in 0..k-1.
start = np.random.randint(low=0, high=k)

# Positional slice over the rows: start, start + k, start + 2k, ...
every_kth_row = slice(start, None, k)
systematic_sample = data.iloc[every_kth_row]

# Share of each category within the systematic sample.
systematic_category_shares = systematic_sample['Category'].value_counts(normalize=True)
print(systematic_category_shares)

Category
A    0.69
B    0.23
C    0.08
Name: proportion, dtype: float64
