In [8]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.cluster import KMeans

# Sampling Algorithms

In [9]:
cancer = load_breast_cancer()
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data["Target"] = cancer.target
data.loc[data["Target"] == 0, "Target"] = "malignant" # ממאיר
data.loc[data["Target"] == 1, "Target"] = "benign" # שפיר
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [10]:
data["Target"].value_counts()

benign       357
malignant    212
Name: Target, dtype: int64

In [11]:
data["Target"].value_counts(normalize=True)

benign       0.627417
malignant    0.372583
Name: Target, dtype: float64

* **Sampling** is a *process used in statistical analysis in which a predetermined number of observations are taken from a larger population.*

---

## 1. Simple Random Sampling
* **Simple random sampling** is the *basic sampling technique where we select a group of subjects (a sample) for study from a larger group (a population).* Each individual is chosen entirely by chance and each member of the population has an equal chance of being included in the sample. Every possible sample of a given size has the same chance of selection. 

![](https://research-methodology.net/wp-content/uploads/2015/04/Simple-random-sampling2.png)

In [14]:
data.sample(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
124,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,...,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628,benign
193,12.34,26.86,81.15,477.4,0.1034,0.1353,0.1085,0.04562,0.1943,0.06937,...,39.34,101.7,768.9,0.1785,0.4706,0.4425,0.1459,0.3215,0.1205,malignant
428,11.13,16.62,70.47,381.1,0.08151,0.03834,0.01369,0.0137,0.1511,0.06148,...,20.29,74.35,421.1,0.103,0.06219,0.0458,0.04044,0.2383,0.07083,benign
531,11.67,20.02,75.21,416.2,0.1016,0.09453,0.042,0.02157,0.1859,0.06461,...,28.81,87.0,550.6,0.155,0.2964,0.2758,0.0812,0.3206,0.0895,benign
411,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,0.1714,0.0634,...,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881,benign


In [6]:
data.sample(frac=0.25)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
483,13.70,17.64,87.76,571.1,0.09950,0.07957,0.04548,0.031600,0.1732,0.06088,...,23.53,95.78,686.5,0.11990,0.13460,0.17420,0.09077,0.2518,0.06960,benign
292,12.95,16.02,83.14,513.7,0.10050,0.07943,0.06155,0.033700,0.1730,0.06470,...,19.93,88.81,585.4,0.14830,0.20680,0.22410,0.10560,0.3380,0.09584,benign
387,13.88,16.16,88.37,596.6,0.07026,0.04831,0.02045,0.008507,0.1607,0.05474,...,19.97,99.66,745.3,0.08484,0.12330,0.10910,0.04537,0.2542,0.06623,benign
455,13.38,30.72,86.34,557.2,0.09245,0.07426,0.02819,0.032640,0.1375,0.06016,...,41.61,96.69,705.6,0.11720,0.14210,0.07003,0.07763,0.2196,0.07675,benign
386,12.21,14.09,78.78,462.0,0.08108,0.07823,0.06839,0.025340,0.1646,0.06154,...,19.29,87.65,529.9,0.10260,0.24310,0.30760,0.09140,0.2677,0.08824,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.012380,0.1573,0.05520,...,26.10,98.91,739.1,0.10500,0.07622,0.10600,0.05185,0.2335,0.06263,benign
417,15.50,21.08,102.90,803.1,0.11200,0.15710,0.15220,0.084810,0.2085,0.06864,...,27.65,157.10,1748.0,0.15170,0.40020,0.42110,0.21340,0.3003,0.10480,malignant
506,12.22,20.04,79.47,453.1,0.10960,0.11520,0.08175,0.021660,0.2124,0.06894,...,24.17,85.13,515.3,0.14020,0.23150,0.35350,0.08088,0.2709,0.08839,benign
17,16.13,20.68,108.10,798.8,0.11700,0.20220,0.17220,0.102800,0.2164,0.07356,...,31.48,136.80,1315.0,0.17890,0.42330,0.47840,0.20730,0.3706,0.11420,malignant


## 2. Stratified Sampling

---

* **Stratified random sampling** is a method of sampling that *involves the division of a population into smaller sub-groups known* as **strata** In stratified random sampling or stratification, the strata are formed based on members' shared attributes or characteristics such as income or educational attainment.

* **Stratified random sampling** is also called *proportional random sampling or quota random sampling.*

<img src="https://www.qualtrics.com/m/assets/wp-content/uploads/2021/08/Screen-Shot-2021-08-31-at-10.17.31-AM.png" alt="Drawing" style="width: 500px;"/>


In [15]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=["Target"]), 
                                                    data["Target"],
                                                    stratify=data["Target"],
                                                    test_size=0.2)

In [16]:
y_train.value_counts(normalize=True)

benign       0.626374
malignant    0.373626
Name: Target, dtype: float64

In [17]:
y_test.value_counts(normalize=True)

benign       0.631579
malignant    0.368421
Name: Target, dtype: float64

## 3. Systematic Sampling

Systematic sampling is defined as a probability sampling approach where the elements from a target population are selected from a random starting point and after a fixed sampling interval.

We calculate the sampling interval by dividing the entire population size by the desired sample size.

Note that, Systematic Sampling usually produces a random sample but <b>is not addressing the bias in the created sample</b>.

In [10]:
def systematic_sampling(df, step): 
    indexes = np.arange(0, len(df), step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

In [11]:
systematic_sampling(data, 5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.1622,0.66560,0.7119,0.26540,0.4601,0.11890,malignant
5,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,0.07613,...,23.75,103.40,741.6,0.1791,0.52490,0.5355,0.17410,0.3985,0.12440,malignant
10,16.02,23.24,102.70,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,...,33.88,123.80,1150.0,0.1181,0.15510,0.1459,0.09975,0.2948,0.08452,malignant
15,14.54,27.54,96.73,658.8,0.11390,0.15950,0.16390,0.07364,0.2303,0.07077,...,37.13,124.10,943.2,0.1678,0.65770,0.7026,0.17120,0.4218,0.13410,malignant
20,13.08,15.71,85.63,520.0,0.10750,0.12700,0.04568,0.03110,0.1967,0.06811,...,20.49,96.09,630.5,0.1312,0.27760,0.1890,0.07283,0.3184,0.08183,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,13.62,23.23,87.19,573.2,0.09246,0.06747,0.02974,0.02443,0.1664,0.05801,...,29.09,97.58,729.8,0.1216,0.15170,0.1049,0.07174,0.2642,0.06953,benign
550,10.86,21.48,68.51,360.5,0.07431,0.04227,0.00000,0.00000,0.1661,0.05948,...,24.77,74.08,412.3,0.1001,0.07348,0.0000,0.00000,0.2458,0.06592,benign
555,10.29,27.61,65.67,321.4,0.09030,0.07658,0.05999,0.02738,0.1593,0.06127,...,34.91,69.57,357.6,0.1384,0.17100,0.2000,0.09127,0.2226,0.08283,benign
560,14.05,27.15,91.38,600.4,0.09929,0.11260,0.04462,0.04304,0.1537,0.06171,...,33.17,100.20,706.7,0.1241,0.22640,0.1326,0.10480,0.2250,0.08321,benign


## 4. Cluster Sampling

Cluster sampling is a probability sampling technique where we divide the population into multiple clusters(groups) based on certain clustering criteria. Then we select a random cluster(s) with simple random or systematic sampling techniques. So, in cluster sampling, the entire population is divided into clusters or segments and then cluster(s) are randomly selected.

Basic idea:
* Evaluate K-Means. 
* Sample <strong>equal number of observations</strong> from each cluster.

Note that, Systematic Sampling usually produces a random sample but is not addressing the bias in the created sample.


In [18]:
kmeans = KMeans(n_clusters = 4, n_init="auto")
kmeans.fit(data.drop(columns=["Target"]))

In [19]:
data["Cluster"] = kmeans.labels_
data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target,Cluster
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,malignant,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,malignant,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,malignant,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,malignant,3
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,malignant,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,malignant,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,malignant,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,malignant,1
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,malignant,0


In [20]:
data.groupby("Cluster").sample(n=3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target,Cluster
32,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,...,136.1,1344.0,0.1634,0.3559,0.5588,0.1847,0.353,0.08482,malignant,0
563,20.92,25.09,143.0,1347.0,0.1099,0.2236,0.3174,0.1474,0.2149,0.06879,...,179.1,1819.0,0.1407,0.4186,0.6599,0.2542,0.2929,0.09873,malignant,0
365,20.44,21.78,133.8,1293.0,0.0915,0.1131,0.09799,0.07785,0.1618,0.05557,...,161.2,1780.0,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735,malignant,0
235,14.03,21.25,89.79,603.4,0.0907,0.06945,0.01462,0.01896,0.1517,0.05835,...,98.27,715.5,0.1287,0.1513,0.06231,0.07963,0.2226,0.07617,benign,1
378,13.66,15.15,88.27,580.6,0.08268,0.07548,0.04249,0.02471,0.1792,0.05897,...,97.96,657.0,0.1275,0.3104,0.2569,0.1054,0.3387,0.09638,benign,1
439,14.02,15.66,89.59,606.5,0.07966,0.05581,0.02087,0.02652,0.1589,0.05586,...,96.53,688.9,0.1034,0.1017,0.0626,0.08216,0.2136,0.0671,benign,1
219,19.53,32.47,128.0,1223.0,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,...,180.2,2477.0,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568,malignant,2
122,24.25,20.2,166.2,1761.0,0.1447,0.2867,0.4268,0.2012,0.2655,0.06877,...,180.9,2073.0,0.1696,0.4244,0.5803,0.2248,0.3222,0.08009,malignant,2
521,24.63,21.6,165.5,1841.0,0.103,0.2106,0.231,0.1471,0.1991,0.06739,...,205.7,2642.0,0.1342,0.4188,0.4658,0.2475,0.3157,0.09671,malignant,2
294,12.72,13.78,81.78,492.1,0.09667,0.08393,0.01288,0.01924,0.1638,0.061,...,88.54,553.7,0.1298,0.1472,0.05233,0.06343,0.2369,0.06922,benign,3


## Random Undersampling and Oversampling

---

![](https://miro.medium.com/max/700/0*u6pKLqdCDsG_5kXa.png)

* A widely adopted technique for dealing with highly imbalanced datasets is called resampling. It consists of *removing samples from the majority class* (**under-sampling**) and/or *adding more examples from the minority class* (**over-sampling**).

In [22]:
x = 5 
noise = np.random.random()
print(x + noise)
print(x - noise)

5.199380865713106
4.800619134286894


In [27]:
X, y = make_classification(
    n_classes=2, class_sep=1.5, weights=[0.9, 0.1],
    n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=1,
    n_samples=1000, random_state=10
)
X = pd.DataFrame(X)
X['Target'] = y

In [30]:
X["Target"].value_counts()

0    900
1    100
Name: Target, dtype: int64

We can now do random oversampling and undersampling using:

In [31]:
num_0 = len(X[X['Target']==0])
num_1 = len(X[X['Target']==1])

# random undersample
undersampled_data = pd.concat([X[X['Target']==0].sample(num_1, replace=True) , X[X['Target']==1] ])
print(len(undersampled_data))

200


In [32]:
undersampled_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,Target
353,-0.197353,1.201025,0.080468,0.067607,0.411315,1.961802,-0.101005,0.113312,0.240686,0.257085,...,-0.736062,1.466962,-0.384594,-0.254411,1.009644,-0.822120,2.242333,1.303336,-1.859453,0
595,0.394395,-0.427073,-0.372507,1.496266,1.134986,1.440154,0.550224,1.147878,-1.232716,1.397366,...,1.313225,0.316311,-1.398083,-0.424114,-0.659006,0.860434,0.777760,1.796460,-0.805790,0
262,-0.569452,2.755790,-0.283073,0.401598,0.323625,1.676214,-0.924292,-0.649776,-0.975441,0.379668,...,1.777768,-0.172674,1.288508,0.243777,1.071427,-0.019722,-0.852893,-0.672161,-1.881536,0
587,1.404030,0.580680,0.905312,-0.233846,0.781978,3.622751,-0.139904,-0.445441,0.056473,-0.616301,...,-1.308681,0.608213,-1.277433,-0.160129,-0.310222,1.909698,1.408826,1.215580,-2.658353,0
760,-0.368920,0.058697,-0.275348,0.364648,1.027323,1.164212,-0.529027,-0.883332,0.486737,-0.474294,...,-0.058214,0.681769,-0.140862,-0.096676,1.304003,0.377319,-0.005211,-0.756656,-1.341427,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,0.275983,1.428421,-2.126275,0.503742,1.173205,1.962755,1.198808,-0.416271,0.859950,-1.719662,...,-0.064215,1.029803,0.958506,0.502675,-1.062411,-1.732694,0.123666,-1.459266,-0.755807,1
971,-0.016212,-0.411330,-1.671569,-0.189156,-0.125167,1.249297,0.993967,-0.202658,-1.252595,-0.087107,...,0.280432,-0.273754,-1.988645,-1.657225,0.967760,-1.112068,0.950315,-0.170493,-0.879789,1
974,-0.613363,-1.203160,0.565044,-0.774142,-0.052489,2.501952,1.637122,1.242403,0.322285,0.242754,...,-1.300949,0.058802,-1.012772,-0.627606,-0.746415,0.565206,0.801977,1.216387,-0.608757,1
977,0.921729,-1.388586,0.254235,0.116883,0.307525,2.083753,1.146238,-1.559869,-0.437423,-2.067455,...,0.592164,-0.414504,0.860984,-0.742395,1.546997,0.404460,-1.390837,-0.193827,-0.431635,1


In [33]:
undersampled_data["Target"].value_counts()

0    100
1    100
Name: Target, dtype: int64

In [34]:
# random oversample
oversampled_data = pd.concat([X[X['Target']==1] , X[X['Target']==0].sample(num_0, replace=True) ])
print(len(oversampled_data))

1000
