In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.cluster import KMeans



# Sampling Algorithms

In [40]:
cancer = load_breast_cancer()
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# Translate the numeric class codes into readable labels in one step.
# Assigning strings into an existing integer column through .loc is an
# incompatible-dtype write that recent pandas versions warn about, so the
# label column is built from a mapping instead.
target_labels = {0: "malignant", 1: "benign"}
data["Target"] = pd.Series(cancer.target, index=data.index).map(target_labels)
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [3]:
# Absolute number of rows per diagnosis label.
class_counts = data["Target"].value_counts()
class_counts

benign       357
malignant    212
Name: Target, dtype: int64

In [4]:
# Share of each diagnosis label in the full dataset (fractions sum to 1).
class_shares = data["Target"].value_counts(normalize=True)
class_shares

benign       0.627417
malignant    0.372583
Name: Target, dtype: float64

* **Sampling** is a *process used in statistical analysis in which a predetermined number of observations are taken from a larger population.*

---

## 1. Simple Random Sampling
* **Simple random sampling** is the *basic sampling technique where we select a group of subjects (a sample) for study from a larger group (a population).* Each individual is chosen entirely by chance and each member of the population has an equal chance of being included in the sample. Every possible sample of a given size has the same chance of selection. 

![](https://research-methodology.net/wp-content/uploads/2015/04/Simple-random-sampling2.png)

In [5]:
# Simple random sample: five rows picked uniformly at random.
random_rows = data.sample(n=5)
random_rows

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
93,13.45,18.3,86.6,555.1,0.1022,0.08165,0.03974,0.0278,0.1638,0.0571,...,25.94,97.59,699.4,0.1339,0.1751,0.1381,0.07911,0.2678,0.06603,benign
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
503,23.09,19.83,152.1,1682.0,0.09342,0.1275,0.1676,0.1003,0.1505,0.05484,...,23.87,211.5,2782.0,0.1199,0.3625,0.3794,0.2264,0.2908,0.07277,malignant
191,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,0.1694,0.06287,...,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871,benign
312,12.76,13.37,82.29,504.1,0.08794,0.07948,0.04052,0.02548,0.1601,0.0614,...,16.4,92.04,618.8,0.1194,0.2208,0.1769,0.08411,0.2564,0.08253,benign


In [6]:
# Simple random sample expressed as a fraction: a quarter of all rows.
random_quarter = data.sample(frac=0.25)
random_quarter

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
295,13.77,13.27,88.06,582.7,0.09198,0.06221,0.010630,0.019170,0.1592,0.05912,...,16.93,94.17,661.1,0.11700,0.10720,0.037320,0.05802,0.2823,0.06794,benign
362,12.76,18.84,81.87,496.6,0.09676,0.07952,0.026880,0.017810,0.1759,0.06183,...,25.99,87.82,579.7,0.12980,0.18390,0.125500,0.08312,0.2744,0.07238,benign
496,12.65,18.17,82.69,485.6,0.10760,0.13340,0.080170,0.050740,0.1641,0.06854,...,22.15,95.29,633.7,0.15330,0.38420,0.358200,0.14070,0.3230,0.10330,benign
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.351400,0.152000,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.938700,0.26500,0.4087,0.12400,malignant
481,13.90,19.24,88.73,602.9,0.07991,0.05326,0.029950,0.020700,0.1579,0.05594,...,26.42,104.40,830.5,0.10640,0.14150,0.167300,0.08150,0.2356,0.07603,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,15.12,16.68,98.78,716.6,0.08876,0.09588,0.075500,0.040790,0.1594,0.05986,...,20.24,117.70,989.5,0.14910,0.33310,0.332700,0.12520,0.3415,0.09740,malignant
360,12.54,18.07,79.42,491.9,0.07436,0.02650,0.001194,0.005449,0.1528,0.05185,...,20.98,86.82,585.7,0.09293,0.04327,0.003581,0.01635,0.2233,0.05521,benign
164,23.27,22.04,152.10,1686.0,0.08439,0.11450,0.132400,0.097020,0.1801,0.05553,...,28.22,184.20,2403.0,0.12280,0.35830,0.394800,0.23460,0.3589,0.09187,malignant
237,20.48,21.46,132.50,1306.0,0.08355,0.08348,0.090420,0.060220,0.1467,0.05177,...,26.17,161.70,1750.0,0.12280,0.23110,0.315800,0.14450,0.2238,0.07127,malignant


## 2. Stratified Sampling

---

* **Stratified random sampling** is a method of sampling that *involves the division of a population into smaller sub-groups known as* **strata**. In stratified random sampling, or stratification, the strata are formed based on members' shared attributes or characteristics, such as income or educational attainment.

* **Stratified random sampling** is also called *proportional random sampling or quota random sampling.*

<img src="https://www.qualtrics.com/m/assets/wp-content/uploads/2021/08/Screen-Shot-2021-08-31-at-10.17.31-AM.png" alt="Drawing" style="width: 500px;"/>


In [7]:
# Separate the predictors from the label, then hold out 20% of the rows.
# `stratify` keeps the benign/malignant proportions the same in both parts.
features = data.drop(columns=["Target"])
labels = data["Target"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,  # fixed seed so a full re-run reproduces the same split
)

In [8]:
# Class shares in the training part; they should mirror the full dataset.
train_class_shares = y_train.value_counts(normalize=True)
train_class_shares

benign       0.626374
malignant    0.373626
Name: Target, dtype: float64

In [9]:
# Class shares in the held-out part; they should mirror the full dataset.
test_class_shares = y_test.value_counts(normalize=True)
test_class_shares

benign       0.631579
malignant    0.368421
Name: Target, dtype: float64

## 3. Systematic Sampling

Systematic sampling is defined as a probability sampling approach where the elements from a target population are selected from a random starting point and after a fixed sampling interval.

We calculate the sampling interval by dividing the entire population size by the desired sample size.

Note that systematic sampling usually produces a random sample, but it <b>does not address bias in the resulting sample</b>.

In [10]:
def systematic_sampling(df, step, start=0):
    """Return every ``step``-th row of ``df``, beginning at position ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from. Rows are selected by position, not by label.
    step : int
        Sampling interval; must be a positive integer.
    start : int or None, default 0
        Position of the first selected row. The default of 0 keeps the
        historical behaviour. Pass ``None`` to draw the starting position
        uniformly at random from ``[0, step)``, which is what systematic
        sampling with a random starting point calls for.

    Returns
    -------
    pandas.DataFrame
        The rows at positions ``start, start + step, start + 2 * step, ...``.
        Empty when ``start`` is at or beyond the end of ``df``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 or ``start`` is negative.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    if start is None:
        # Random offset inside the first interval.
        start = np.random.randint(step)
    elif start < 0:
        raise ValueError("start must be non-negative")
    indexes = np.arange(start, len(df), step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample

In [11]:
# Take every fifth row of the dataset.
systematic_sampling(data, step=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.1622,0.66560,0.7119,0.26540,0.4601,0.11890,malignant
5,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,0.07613,...,23.75,103.40,741.6,0.1791,0.52490,0.5355,0.17410,0.3985,0.12440,malignant
10,16.02,23.24,102.70,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,...,33.88,123.80,1150.0,0.1181,0.15510,0.1459,0.09975,0.2948,0.08452,malignant
15,14.54,27.54,96.73,658.8,0.11390,0.15950,0.16390,0.07364,0.2303,0.07077,...,37.13,124.10,943.2,0.1678,0.65770,0.7026,0.17120,0.4218,0.13410,malignant
20,13.08,15.71,85.63,520.0,0.10750,0.12700,0.04568,0.03110,0.1967,0.06811,...,20.49,96.09,630.5,0.1312,0.27760,0.1890,0.07283,0.3184,0.08183,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,13.62,23.23,87.19,573.2,0.09246,0.06747,0.02974,0.02443,0.1664,0.05801,...,29.09,97.58,729.8,0.1216,0.15170,0.1049,0.07174,0.2642,0.06953,benign
550,10.86,21.48,68.51,360.5,0.07431,0.04227,0.00000,0.00000,0.1661,0.05948,...,24.77,74.08,412.3,0.1001,0.07348,0.0000,0.00000,0.2458,0.06592,benign
555,10.29,27.61,65.67,321.4,0.09030,0.07658,0.05999,0.02738,0.1593,0.06127,...,34.91,69.57,357.6,0.1384,0.17100,0.2000,0.09127,0.2226,0.08283,benign
560,14.05,27.15,91.38,600.4,0.09929,0.11260,0.04462,0.04304,0.1537,0.06171,...,33.17,100.20,706.7,0.1241,0.22640,0.1326,0.10480,0.2250,0.08321,benign


## 4. Cluster Sampling

Cluster sampling is a probability sampling technique in which we divide the population into multiple clusters (groups) based on certain clustering criteria. We then select one or more clusters at random, using simple random or systematic sampling. So, in cluster sampling, the entire population is divided into clusters or segments, and then one or more clusters are randomly selected.

Basic idea:
* Fit K-Means to assign every observation to a cluster.
* Sample an <strong>equal number of observations</strong> from each cluster.

Note that this approach, like systematic sampling, usually produces a random sample but does not address bias in the resulting sample.


In [12]:
# Cluster on the measurements only. "Cluster" is dropped together with
# "Target" (and ignored when absent) so that re-running this cell after the
# labels have been attached to `data` does not feed the previous cluster
# labels back in as a feature.
cluster_features = data.drop(columns=["Target", "Cluster"], errors="ignore")
# Fixed seed: K-Means initialisation is random, so without it the cluster
# labels can change from run to run.
kmeans = KMeans(n_clusters=4, n_init="auto", random_state=42)
kmeans.fit(cluster_features)

In [13]:
# Store the cluster assigned to each row by the fitted K-Means model.
cluster_labels = kmeans.labels_
data["Cluster"] = cluster_labels
data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target,Cluster
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,malignant,1
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,malignant,1
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,malignant,1
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,malignant,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,malignant,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,malignant,1
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,malignant,1
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,malignant,2
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,malignant,1


In [14]:
# Draw the same number of rows (three) at random from every cluster.
rows_by_cluster = data.groupby("Cluster")
rows_by_cluster.sample(n=3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,Target,Cluster
539,7.691,25.44,48.34,170.4,0.08668,0.1199,0.09252,0.01364,0.2037,0.07751,...,54.49,223.6,0.1596,0.3064,0.3393,0.05,0.279,0.1066,benign,0
529,12.07,13.44,77.83,445.2,0.11,0.09009,0.03781,0.02798,0.1657,0.06608,...,86.92,549.9,0.1521,0.1632,0.1622,0.07393,0.2781,0.08052,benign,0
306,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,...,92.0,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385,benign,0
129,19.79,25.12,130.4,1192.0,0.1015,0.1589,0.2545,0.1149,0.2202,0.06113,...,148.7,1589.0,0.1275,0.3861,0.5673,0.1732,0.3305,0.08465,malignant,1
121,18.66,17.12,121.4,1077.0,0.1054,0.11,0.1457,0.08665,0.1966,0.06213,...,145.4,1549.0,0.1503,0.2291,0.3272,0.1674,0.2894,0.08456,malignant,1
218,19.8,21.56,129.7,1230.0,0.09383,0.1306,0.1272,0.08691,0.2094,0.05581,...,170.3,2009.0,0.1353,0.3235,0.3617,0.182,0.307,0.08255,malignant,1
64,12.68,23.84,82.69,499.0,0.1122,0.1262,0.1128,0.06873,0.1905,0.0659,...,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.1031,malignant,2
370,16.35,23.29,109.0,840.4,0.09742,0.1497,0.1811,0.08773,0.2175,0.06218,...,129.3,1165.0,0.1415,0.4665,0.7087,0.2248,0.4824,0.09614,malignant,2
171,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,...,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371,malignant,2
503,23.09,19.83,152.1,1682.0,0.09342,0.1275,0.1676,0.1003,0.1505,0.05484,...,211.5,2782.0,0.1199,0.3625,0.3794,0.2264,0.2908,0.07277,malignant,3


## Random Undersampling and Oversampling

---

![](https://miro.medium.com/max/700/0*u6pKLqdCDsG_5kXa.png)

* A widely adopted technique for dealing with highly imbalanced datasets is called resampling. It consists of *removing samples from the majority class* (**under-sampling**) and/or *adding more examples from the minority class* (**over-sampling**).

In [15]:
x = 5
noise = np.random.random()
# Shift the value up, then down, by the same random offset.
for direction in (1, -1):
    print(x + direction * noise)

5.027882416960219
4.972117583039781


In [28]:
# Settings for a small synthetic two-class problem with a 90% / 10% class
# weighting and a fixed seed.
synthetic_settings = dict(
    n_samples=100,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.9, 0.1],
    class_sep=1.5,
    flip_y=0,
    random_state=10,
)
X, y = make_classification(**synthetic_settings)
# Keep the features and the label together in one frame.
X = pd.DataFrame(X).assign(Target=y)

We can now do random oversampling and undersampling using:

In [29]:
majority_rows = X[X['Target']==0]
minority_rows = X[X['Target']==1]
num_0 = len(majority_rows)
num_1 = len(minority_rows)

# random undersample: keep every minority row and draw an equally sized subset
# of the majority class. The draw is WITHOUT replacement, so the retained
# majority rows are all distinct instead of possibly containing duplicates.
undersampled_data = pd.concat([
    majority_rows.sample(n=num_1, replace=False, random_state=42),  # seeded for reproducibility
    minority_rows,
])
print(len(undersampled_data))

20


In [30]:
# random oversample
oversampled_data = pd.concat([X[X['Target']==1] , X[X['Target']==0].sample(num_0, replace=True) ])
print(len(oversampled_data))

100
