## Name: Shivkumar G. Chauhan
### Roll No.: CS8A53
### Batch A-3
---
# Experiment - 2 : Explore different Sampling Techniques

## imports

In [1]:
import pandas
import numpy
import statistics
import scipy.stats
import matplotlib.pyplot as pyplot
import warnings
import random
from scipy.cluster.vq import kmeans, vq

## Configurations

initializing matplotlib configurations

hiding warnings

In [2]:
# %matplotlib qt
%matplotlib inline
pyplot.rcParams["figure.figsize"] = (15, 12)

In [3]:
warnings.filterwarnings('ignore')

initialization of functions

In [4]:
def generateClusters(data:pandas.DataFrame,column_name:str,number_of_clusters=5)->list:
    """ Function for getting clusters using Kmeans Method
        Only Numerical Column is allowed

    Args:
        data (pandas.DataFrame): DataFrame Copy
        column_name (str): Specific Column Name
        number_of_clusters (int, optional): number of required clusters . Defaults to 5.

    Returns:
        list: list of DataFrames
    """
    Samples=[]
    column_data=data[column_name]
    cluster_centers,_=kmeans(
        column_data.to_numpy(),
        k_or_guess=number_of_clusters
    )
    cluster_indexes,_ = vq(
        column_data.to_numpy(),
        cluster_centers
    )
    data["cluster_value"]=cluster_indexes
    for i in range(number_of_clusters):
        Samples.append(
            data[data["cluster_value"]==i].sort_values(by=column_name,
                                                       ascending=False
    ).reset_index(
                drop=True
    ).drop(
        ['cluster_value'],axis=1)
    )
    data=data.drop(['cluster_value'],axis=1)
    return Samples

In [5]:
def generateSystematicSamples(
    data:pandas.DataFrame,
    column_name:str,number_of_samples=3,
    number_of_interval_for_selection=2
)-> list:
    """Function for getting Systematic Intervaled samples

    Args:
        data (pandas.DataFrame): DataFrame Copy
        column_name (str): Specific Column Name
        number_of_samples (int, optional): number of required samples. Defaults to 3.
        number_of_interval_for_selection (int, optional): interval between rows. Defaults to 2.

    Returns:
        list: list of DataFrames
    """
    Samples=[]
    data=data.sort_values(
        by=column_name,
        ascending=False
    ).reset_index(drop=True)
    for i in range(number_of_samples):
        Samples.append(
            data[
                i::number_of_interval_for_selection
            ].sort_values(
                by=column_name,
                ascending=False
            ).reset_index(drop=True)
        )
    return Samples

reading dataset

In [6]:
Data=pandas.read_csv("../Data/Raw_Data/24_01_2023_WorldWide.csv")

selecting specific columns

In [7]:
Selected_Columns=['iso_code', 'continent', 'location', 'last_updated_date', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'population']

In [8]:
Data=Data[Selected_Columns]

Sorting the Dataframe using total_cases column

In [9]:
Data=Data.sort_values(by="total_cases",ascending=False).reset_index(drop=True)

## Data Exploration

printing the data table

In [38]:
Data.head()

Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
7,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289900.0
10,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1417173000.0
11,FRA,Europe,France,2023-01-22,39516793.0,4475.0,163821.0,44.0,67813000.0
12,DEU,Europe,Germany,2023-01-23,37668384.0,8866.0,164703.0,118.0,83369840.0
13,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,215313500.0


printing the dataset information like column values counts, column value data type

printing different statistics of dataset like mean, quantiles, min, max etc.

In [13]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   iso_code           237 non-null    object 
 1   continent          224 non-null    object 
 2   location           237 non-null    object 
 3   last_updated_date  237 non-null    object 
 4   total_cases        231 non-null    float64
 5   new_cases          231 non-null    float64
 6   total_deaths       227 non-null    float64
 7   new_deaths         227 non-null    float64
 8   population         236 non-null    float64
dtypes: float64(5), object(4)
memory usage: 16.8+ KB


In [14]:
Data.describe()

Unnamed: 0,total_cases,new_cases,total_deaths,new_deaths,population
count,231.0,231.0,227.0,227.0,236.0
mean,12370450.0,2756.844156,124037.8,27.453744,137197400.0
std,59045090.0,13233.706619,567823.1,120.947774,683229700.0
min,1.0,0.0,1.0,0.0,808.0
25%,32123.5,0.0,297.5,0.0,829769.0
50%,285958.0,0.0,3164.0,0.0,6909971.0
75%,2174932.0,313.5,20490.0,2.0,33531060.0
max,668733400.0,113854.0,6738373.0,1377.0,7975105000.0


In [15]:
Data.describe(include="all")

Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
count,237,224,237,237,231.0,231.0,227.0,227.0,236.0
unique,237,6,237,6,,,,,
top,OWID_WRL,Africa,World,2023-01-22,,,,,
freq,1,55,1,225,,,,,
mean,,,,,12370450.0,2756.844156,124037.8,27.453744,137197400.0
std,,,,,59045090.0,13233.706619,567823.1,120.947774,683229700.0
min,,,,,1.0,0.0,1.0,0.0,808.0
25%,,,,,32123.5,0.0,297.5,0.0,829769.0
50%,,,,,285958.0,0.0,3164.0,0.0,6909971.0
75%,,,,,2174932.0,313.5,20490.0,2.0,33531060.0


### Data Reduction

here some countries have NULL Values in their specific columns, so dropping the columns without filling it

checking if any column has null values

checking counts of the null values of columns

In [16]:
Data.isnull().any()

iso_code             False
continent             True
location             False
last_updated_date    False
total_cases           True
new_cases             True
total_deaths          True
new_deaths            True
population            True
dtype: bool

In [17]:
Data.isnull().sum()

iso_code              0
continent            13
location              0
last_updated_date     0
total_cases           6
new_cases             6
total_deaths         10
new_deaths           10
population            1
dtype: int64

dropping the rows that have null values

In [18]:
Data=Data.dropna()

reprinting the dataset

In [19]:
Data

Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
7,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,3.382899e+08
10,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1.417173e+09
11,FRA,Europe,France,2023-01-22,39516793.0,4475.0,163821.0,44.0,6.781300e+07
12,DEU,Europe,Germany,2023-01-23,37668384.0,8866.0,164703.0,118.0,8.336984e+07
13,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,2.153135e+08
...,...,...,...,...,...,...,...,...,...
221,MAC,Asia,Macao,2023-01-22,3468.0,6.0,117.0,1.0,6.951800e+05
222,WLF,Oceania,Wallis and Futuna,2023-01-22,3427.0,0.0,7.0,0.0,1.159600e+04
223,SPM,North America,Saint Pierre and Miquelon,2023-01-22,3411.0,0.0,2.0,0.0,5.885000e+03
227,MSR,North America,Montserrat,2023-01-22,1403.0,0.0,8.0,0.0,4.413000e+03


# Sampling


## 1. Random Sampling

### Random Values from Array

#### random indexes from array using numpy module

#### random indexes from array using random module

In [20]:
array=Data.index

In [21]:
numpy.random.choice(array,size=50)

array([207, 124, 104,  30, 144, 114,  56,   7, 187, 174, 111, 227, 110,
        82,  41,  83,  26, 173,  11,  97, 159,  44, 182,  39,  49,  98,
        95, 127, 154,  24, 227,  53, 100, 221, 152, 164, 138, 166,  51,
       213, 195,  69,  80, 209, 140,  38, 191,  27, 111,  93], dtype=int64)

In [22]:
numpy.array(random.sample(list(array),50))

array([ 83,  24, 126,  94, 142, 175, 192,  70,  81,  53, 210, 134,  48,
       199,  44,  62,  90, 144,  76, 223,  98, 183, 193,  41,  64, 107,
       145,  74,  33, 125, 203, 170,  46, 162, 121, 188, 113, 220,  59,
       190, 163, 115, 165, 112, 168, 166, 171,  63, 216, 179])

### Random sample from whole data containing 25 % of rows

In [39]:
random_sample1=Data.sample(frac=0.25,replace=True).reset_index(drop=True)
print("Sample Size: ",len(random_sample1))
display(random_sample1.head(5))

Sample Size:  54


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,BHS,North America,Bahamas,2023-01-22,37491.0,0.0,833.0,0.0,409989.0
1,UGA,Africa,Uganda,2023-01-22,170279.0,0.0,3630.0,0.0,47249588.0
2,KAZ,Asia,Kazakhstan,2023-01-22,1495880.0,114.0,19065.0,0.0,19397998.0
3,NOR,Europe,Norway,2023-01-22,1477383.0,26.0,5036.0,73.0,5434324.0
4,ROU,Europe,Romania,2023-01-22,3319680.0,3488.0,67504.0,44.0,19659270.0


## 2. Cluster Sampling

### Sampling using KMeans

#### Sampling bases on new_cases column

In [41]:
print("Clusters Based on new_cases column")
samples=generateClusters(Data.copy(),'new_cases',2)
print("Number of Samples: ",len(samples))
for sample in samples:
    print("Sample Size: ",len(sample),"Rows")
    display(sample.head(5))
    print("\n")

Clusters Based on new_cases column
Number of Samples:  2
Sample Size:  208 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,KOR,Asia,South Korea,2023-01-22,30008756.0,9227.0,33235.0,26.0,51815808.0
1,DEU,Europe,Germany,2023-01-23,37668384.0,8866.0,164703.0,118.0,83369840.0
2,AUS,Oceania,Australia,2023-01-22,11274262.0,8805.0,18092.0,69.0,26177410.0
3,ECU,South America,Ecuador,2023-01-22,1047624.0,7161.0,35940.0,0.0,18001002.0
4,CRI,North America,Costa Rica,2023-01-22,1177514.0,5712.0,9123.0,19.0,5180836.0




Sample Size:  6 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,JPN,Asia,Japan,2023-01-22,32067470.0,64450.0,65377.0,326.0,123951696.0
1,ITA,Europe,Italy,2023-01-22,25415630.0,51888.0,186488.0,495.0,59037472.0
2,ARG,South America,Argentina,2023-01-22,10024095.0,19416.0,130338.0,89.0,45510324.0
3,TWN,Asia,Taiwan,2023-01-22,9342958.0,19187.0,16098.0,27.0,23893396.0
4,GBR,Europe,United Kingdom,2023-01-22,24259240.0,15847.0,216255.0,1377.0,67508936.0






#### Sampling bases on total_cases column

In [25]:
print("Clusters Based on total_cases column")
samples=generateClusters(Data.copy(),'total_cases',2)
print("Number of Samples: ",len(samples))
for sample in samples:
    print("Sample Size: ",len(sample),"Rows")
    display(sample.head(10))
    print("\n")

Clusters Based on total_cases column
Number of Samples:  2
Sample Size:  204 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,TUR,Asia,Turkey,2023-01-22,17042722.0,0.0,101492.0,0.0,85341248.0
1,ESP,Europe,Spain,2023-01-22,13722677.0,11426.0,118183.0,424.0,47558632.0
2,VNM,Asia,Vietnam,2023-01-22,11526329.0,11.0,43186.0,0.0,98186856.0
3,AUS,Oceania,Australia,2023-01-22,11274262.0,8805.0,18092.0,69.0,26177410.0
4,ARG,South America,Argentina,2023-01-22,10024095.0,19416.0,130338.0,89.0,45510324.0
5,TWN,Asia,Taiwan,2023-01-22,9342958.0,19187.0,16098.0,27.0,23893396.0
6,NLD,Europe,Netherlands,2023-01-23,8590602.0,913.0,23079.0,0.0,17564020.0
7,IRN,Asia,Iran,2023-01-22,7563385.0,112.0,144735.0,1.0,88550568.0
8,MEX,North America,Mexico,2023-01-22,7342277.0,797.0,331881.0,11.0,127504120.0
9,IDN,Asia,Indonesia,2023-01-22,6728065.0,218.0,160781.0,4.0,275501344.0




Sample Size:  10 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289900.0
1,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1417173000.0
2,FRA,Europe,France,2023-01-22,39516793.0,4475.0,163821.0,44.0,67813000.0
3,DEU,Europe,Germany,2023-01-23,37668384.0,8866.0,164703.0,118.0,83369840.0
4,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,215313500.0
5,JPN,Asia,Japan,2023-01-22,32067470.0,64450.0,65377.0,326.0,123951700.0
6,KOR,Asia,South Korea,2023-01-22,30008756.0,9227.0,33235.0,26.0,51815810.0
7,ITA,Europe,Italy,2023-01-22,25415630.0,51888.0,186488.0,495.0,59037470.0
8,GBR,Europe,United Kingdom,2023-01-22,24259240.0,15847.0,216255.0,1377.0,67508940.0
9,RUS,Europe,Russia,2023-01-23,21585328.0,5691.0,386748.0,40.0,144713300.0






#### Sampling bases on population column

In [33]:
print("Clusters Based on population column")
samples=generateClusters(Data.copy(),'population',3)
print("Number of Samples: ",len(samples))
for sample in samples:
    print("Sample Size: ",len(sample),"Rows")
    display(sample.head(10))
    print("\n")

Clusters Based on population column
Number of Samples:  3
Sample Size:  2 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,CHN,Asia,China,2023-01-22,2023904.0,0.0,5273.0,0.0,1425887000.0
1,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1417173000.0




Sample Size:  17 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289856.0
1,IDN,Asia,Indonesia,2023-01-22,6728065.0,218.0,160781.0,4.0,275501344.0
2,PAK,Asia,Pakistan,2023-01-22,1576185.0,38.0,30640.0,0.0,235824864.0
3,NGA,Africa,Nigeria,2023-01-22,266463.0,0.0,3155.0,0.0,218541216.0
4,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,215313504.0
5,BGD,Asia,Bangladesh,2023-01-22,2037408.0,0.0,29441.0,0.0,171186368.0
6,RUS,Europe,Russia,2023-01-23,21585328.0,5691.0,386748.0,40.0,144713312.0
7,MEX,North America,Mexico,2023-01-22,7342277.0,797.0,331881.0,11.0,127504120.0
8,JPN,Asia,Japan,2023-01-22,32067470.0,64450.0,65377.0,326.0,123951696.0
9,ETH,Africa,Ethiopia,2023-01-22,499306.0,11.0,7572.0,0.0,123379928.0




Sample Size:  195 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,THA,Asia,Thailand,2023-01-22,4725885.0,0.0,33792.0,0.0,71697024.0
1,FRA,Europe,France,2023-01-22,39516793.0,4475.0,163821.0,44.0,67813000.0
2,GBR,Europe,United Kingdom,2023-01-22,24259240.0,15847.0,216255.0,1377.0,67508936.0
3,TZA,Africa,Tanzania,2023-01-22,42530.0,0.0,846.0,0.0,65497752.0
4,ZAF,Africa,South Africa,2023-01-22,4054160.0,364.0,102588.0,20.0,59893884.0
5,ITA,Europe,Italy,2023-01-22,25415630.0,51888.0,186488.0,495.0,59037472.0
6,MMR,Asia,Myanmar,2023-01-22,633801.0,2.0,19490.0,0.0,54179312.0
7,KEN,Africa,Kenya,2023-01-22,342785.0,3.0,5688.0,0.0,54027484.0
8,COL,South America,Colombia,2023-01-22,6348356.0,5297.0,142085.0,126.0,51874028.0
9,KOR,Asia,South Korea,2023-01-22,30008756.0,9227.0,33235.0,26.0,51815808.0






## 3. Strategic Smpling

#### Sampling based on different Dates

In [27]:
print("Number of Samples will be :",len(Data.groupby(by='last_updated_date')))
for sample_name, samples in Data.groupby(by='last_updated_date'):
    print("Sample Name: ",sample_name)
    print("Sample Size: ",len(samples),"Rows")
    display(samples.reset_index(drop=True).head(10))
    print("\n")

Number of Samples will be : 2
Sample Name:  2023-01-22
Sample Size:  207 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289900.0
1,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1417173000.0
2,FRA,Europe,France,2023-01-22,39516793.0,4475.0,163821.0,44.0,67813000.0
3,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,215313500.0
4,JPN,Asia,Japan,2023-01-22,32067470.0,64450.0,65377.0,326.0,123951700.0
5,KOR,Asia,South Korea,2023-01-22,30008756.0,9227.0,33235.0,26.0,51815810.0
6,ITA,Europe,Italy,2023-01-22,25415630.0,51888.0,186488.0,495.0,59037470.0
7,GBR,Europe,United Kingdom,2023-01-22,24259240.0,15847.0,216255.0,1377.0,67508940.0
8,TUR,Asia,Turkey,2023-01-22,17042722.0,0.0,101492.0,0.0,85341250.0
9,ESP,Europe,Spain,2023-01-22,13722677.0,11426.0,118183.0,424.0,47558630.0


Sample Name:  2023-01-23
Sample Size:  7 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,DEU,Europe,Germany,2023-01-23,37668384.0,8866.0,164703.0,118.0,83369840.0
1,RUS,Europe,Russia,2023-01-23,21585328.0,5691.0,386748.0,40.0,144713312.0
2,NLD,Europe,Netherlands,2023-01-23,8590602.0,913.0,23079.0,0.0,17564020.0
3,AUT,Europe,Austria,2023-01-23,5755617.0,1555.0,21626.0,13.0,8939617.0
4,CHL,South America,Chile,2023-01-23,5103912.0,2075.0,63667.0,27.0,19603736.0
5,MYS,Asia,Malaysia,2023-01-23,5034830.0,309.0,36932.0,0.0,33938216.0
6,DNK,Europe,Denmark,2023-01-23,3398106.0,179.0,8072.0,19.0,5882259.0


#### Sampling based on different continets

In [29]:
print("Number of Samples will be :",len(Data.groupby(by='continent')))
for sample_name, samples in Data.groupby(by='continent'):
    print("Sample Name: ",sample_name)
    print("Sample Size: ",len(samples),"Rows")
    display(samples.reset_index(drop=True).head(10))
    print("\n")

Number of Samples will be : 6
Sample Name:  Africa
Sample Size:  54 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,ZAF,Africa,South Africa,2023-01-22,4054160.0,364.0,102588.0,20.0,59893884.0
1,MAR,Africa,Morocco,2023-01-22,1272164.0,2.0,16296.0,0.0,37457976.0
2,TUN,Africa,Tunisia,2023-01-22,1150217.0,2488.0,29291.0,0.0,12356116.0
3,EGY,Africa,Egypt,2023-01-22,515645.0,0.0,24805.0,0.0,110990096.0
4,LBY,Africa,Libya,2023-01-22,507158.0,0.0,6437.0,0.0,6812344.0
5,ETH,Africa,Ethiopia,2023-01-22,499306.0,11.0,7572.0,0.0,123379928.0
6,KEN,Africa,Kenya,2023-01-22,342785.0,3.0,5688.0,0.0,54027484.0
7,ZMB,Africa,Zambia,2023-01-22,338986.0,330.0,4039.0,0.0,20017670.0
8,BWA,Africa,Botswana,2023-01-22,328581.0,0.0,2795.0,0.0,2630300.0
9,MUS,Africa,Mauritius,2023-01-22,293386.0,0.0,1042.0,0.0,1299478.0


Sample Name:  Asia
Sample Size:  49 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1417173000.0
1,JPN,Asia,Japan,2023-01-22,32067470.0,64450.0,65377.0,326.0,123951700.0
2,KOR,Asia,South Korea,2023-01-22,30008756.0,9227.0,33235.0,26.0,51815810.0
3,TUR,Asia,Turkey,2023-01-22,17042722.0,0.0,101492.0,0.0,85341250.0
4,VNM,Asia,Vietnam,2023-01-22,11526329.0,11.0,43186.0,0.0,98186860.0
5,TWN,Asia,Taiwan,2023-01-22,9342958.0,19187.0,16098.0,27.0,23893400.0
6,IRN,Asia,Iran,2023-01-22,7563385.0,112.0,144735.0,1.0,88550570.0
7,IDN,Asia,Indonesia,2023-01-22,6728065.0,218.0,160781.0,4.0,275501300.0
8,MYS,Asia,Malaysia,2023-01-23,5034830.0,309.0,36932.0,0.0,33938220.0
9,ISR,Asia,Israel,2023-01-22,4780946.0,535.0,12148.0,7.0,9449000.0


Sample Name:  Europe
Sample Size:  48 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,FRA,Europe,France,2023-01-22,39516793.0,4475.0,163821.0,44.0,67813000.0
1,DEU,Europe,Germany,2023-01-23,37668384.0,8866.0,164703.0,118.0,83369840.0
2,ITA,Europe,Italy,2023-01-22,25415630.0,51888.0,186488.0,495.0,59037472.0
3,GBR,Europe,United Kingdom,2023-01-22,24259240.0,15847.0,216255.0,1377.0,67508936.0
4,RUS,Europe,Russia,2023-01-23,21585328.0,5691.0,386748.0,40.0,144713312.0
5,ESP,Europe,Spain,2023-01-22,13722677.0,11426.0,118183.0,424.0,47558632.0
6,NLD,Europe,Netherlands,2023-01-23,8590602.0,913.0,23079.0,0.0,17564020.0
7,POL,Europe,Poland,2023-01-22,6375463.0,73.0,118681.0,0.0,39857144.0
8,AUT,Europe,Austria,2023-01-23,5755617.0,1555.0,21626.0,13.0,8939617.0
9,UKR,Europe,Ukraine,2023-01-22,5675861.0,38.0,118924.0,1.0,39701744.0


Sample Name:  North America
Sample Size:  34 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289856.0
1,MEX,North America,Mexico,2023-01-22,7342277.0,797.0,331881.0,11.0,127504120.0
2,CAN,North America,Canada,2023-01-22,4551279.0,318.0,50248.0,11.0,38454328.0
3,GTM,North America,Guatemala,2023-01-22,1222942.0,243.0,20074.0,2.0,17843914.0
4,CRI,North America,Costa Rica,2023-01-22,1177514.0,5712.0,9123.0,19.0,5180836.0
5,CUB,North America,Cuba,2023-01-22,1112350.0,9.0,8530.0,0.0,11212198.0
6,PAN,North America,Panama,2023-01-22,1028272.0,1025.0,8589.0,0.0,4408582.0
7,DOM,North America,Dominican Republic,2023-01-22,659761.0,155.0,4384.0,0.0,11228821.0
8,HND,North America,Honduras,2023-01-22,470144.0,420.0,11099.0,0.0,10432858.0
9,SLV,North America,El Salvador,2023-01-22,201785.0,0.0,4230.0,0.0,6336393.0


Sample Name:  Oceania
Sample Size:  17 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,AUS,Oceania,Australia,2023-01-22,11274262.0,8805.0,18092.0,69.0,26177410.0
1,NZL,Oceania,New Zealand,2023-01-22,2158663.0,0.0,2437.0,0.0,5185289.0
2,NCL,Oceania,New Caledonia,2023-01-22,79921.0,0.0,314.0,0.0,289959.0
3,PYF,Oceania,French Polynesia,2023-01-22,77957.0,0.0,649.0,0.0,306292.0
4,FJI,Oceania,Fiji,2023-01-22,68808.0,0.0,883.0,0.0,929769.0
5,PNG,Oceania,Papua New Guinea,2023-01-22,46663.0,0.0,669.0,0.0,10142625.0
6,SLB,Oceania,Solomon Islands,2023-01-22,24575.0,0.0,153.0,0.0,724272.0
7,FSM,Oceania,Micronesia (country),2023-01-22,22676.0,429.0,58.0,0.0,114178.0
8,TON,Oceania,Tonga,2023-01-22,16590.0,0.0,13.0,0.0,106867.0
9,WSM,Oceania,Samoa,2023-01-22,16022.0,0.0,29.0,0.0,222390.0


Sample Name:  South America
Sample Size:  12 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,215313504.0
1,ARG,South America,Argentina,2023-01-22,10024095.0,19416.0,130338.0,89.0,45510324.0
2,COL,South America,Colombia,2023-01-22,6348356.0,5297.0,142085.0,126.0,51874028.0
3,CHL,South America,Chile,2023-01-23,5103912.0,2075.0,63667.0,27.0,19603736.0
4,PER,South America,Peru,2023-01-22,4478971.0,279.0,218649.0,24.0,34049588.0
5,BOL,South America,Bolivia,2023-01-22,1183705.0,731.0,22342.0,0.0,12224114.0
6,ECU,South America,Ecuador,2023-01-22,1047624.0,7161.0,35940.0,0.0,18001002.0
7,URY,South America,Uruguay,2023-01-22,1029564.0,3754.0,7597.0,11.0,3422796.0
8,PRY,South America,Paraguay,2023-01-22,805486.0,5323.0,19778.0,32.0,6780745.0
9,VEN,South America,Venezuela,2023-01-22,551283.0,79.0,5840.0,0.0,28301700.0


## 4. Systematic Sampling

### Sampling Bases on Intervals

For getting sample that have all values trend within it (means it have all range values of certain column rather than having only maximum or minimum). here the sample will be generated based on interval between Higest to lowest 
like only even index samples, odd index samples, etc.

#### Sampling based on total_cases column

In [31]:
samples=generateSystematicSamples(Data.copy(),'total_cases',number_of_samples=2,number_of_interval_for_selection=5)
for sample in samples:
    print("Sample Size: ",len(sample),"Rows")
    display(sample.head(10))
    print("\n")

Sample Size:  43 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289856.0
1,JPN,Asia,Japan,2023-01-22,32067470.0,64450.0,65377.0,326.0,123951696.0
2,TUR,Asia,Turkey,2023-01-22,17042722.0,0.0,101492.0,0.0,85341248.0
3,TWN,Asia,Taiwan,2023-01-22,9342958.0,19187.0,16098.0,27.0,23893396.0
4,POL,Europe,Poland,2023-01-22,6375463.0,73.0,118681.0,0.0,39857144.0
5,GRC,Europe,Greece,2023-01-22,5548487.0,0.0,34779.0,0.0,10384972.0
6,BEL,Europe,Belgium,2023-01-22,4688927.0,2780.0,33525.0,47.0,11655923.0
7,PHL,Asia,Philippines,2023-01-22,4071963.0,423.0,65694.0,14.0,115559008.0
8,SWE,Europe,Sweden,2023-01-22,2690473.0,2633.0,23020.0,375.0,10549349.0
9,HUN,Europe,Hungary,2023-01-22,2191200.0,866.0,48629.0,51.0,9967304.0


Sample Size:  43 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,IND,Asia,India,2023-01-22,44683239.0,94.0,530735.0,2.0,1417173000.0
1,KOR,Asia,South Korea,2023-01-22,30008756.0,9227.0,33235.0,26.0,51815810.0
2,ESP,Europe,Spain,2023-01-22,13722677.0,11426.0,118183.0,424.0,47558630.0
3,NLD,Europe,Netherlands,2023-01-23,8590602.0,913.0,23079.0,0.0,17564020.0
4,COL,South America,Colombia,2023-01-22,6348356.0,5297.0,142085.0,126.0,51874030.0
5,CHL,South America,Chile,2023-01-23,5103912.0,2075.0,63667.0,27.0,19603740.0
6,CZE,Europe,Czechia,2023-01-22,4586781.0,50.0,42280.0,0.0,10493990.0
7,ZAF,Africa,South Africa,2023-01-22,4054160.0,364.0,102588.0,20.0,59893880.0
8,SVK,Europe,Slovakia,2023-01-22,2659881.0,28.0,20906.0,5.0,5643455.0
9,NZL,Oceania,New Zealand,2023-01-22,2158663.0,0.0,2437.0,0.0,5185289.0


#### Sampling based on continent column

In [32]:
samples=generateSystematicSamples(Data.copy(),'continent',number_of_samples=2,number_of_interval_for_selection=5)
for sample in samples:
    print("Sample Size: ",len(sample),"Rows")
    display(sample.head(10))
    print("\n")

Sample Size:  43 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,VEN,South America,Venezuela,2023-01-22,551283.0,79.0,5840.0,0.0,28301700.0
1,BRA,South America,Brazil,2023-01-22,36718053.0,552.0,696257.0,3.0,215313504.0
2,URY,South America,Uruguay,2023-01-22,1029564.0,3754.0,7597.0,11.0,3422796.0
3,MHL,Oceania,Marshall Islands,2023-01-22,15554.0,0.0,17.0,0.0,41593.0
4,VUT,Oceania,Vanuatu,2023-01-22,12014.0,0.0,14.0,0.0,326744.0
5,PNG,Oceania,Papua New Guinea,2023-01-22,46663.0,0.0,669.0,0.0,10142625.0
6,USA,North America,United States,2023-01-22,102005805.0,2715.0,1104118.0,15.0,338289856.0
7,ATG,North America,Antigua and Barbuda,2023-01-22,9106.0,0.0,146.0,0.0,93772.0
8,CAN,North America,Canada,2023-01-22,4551279.0,318.0,50248.0,11.0,38454328.0
9,GRD,North America,Grenada,2023-01-22,19680.0,0.0,238.0,0.0,125459.0


Sample Size:  43 Rows


Unnamed: 0,iso_code,continent,location,last_updated_date,total_cases,new_cases,total_deaths,new_deaths,population
0,ECU,South America,Ecuador,2023-01-22,1047624.0,7161.0,35940.0,0.0,18001002.0
1,CHL,South America,Chile,2023-01-23,5103912.0,2075.0,63667.0,27.0,19603736.0
2,COL,South America,Colombia,2023-01-22,6348356.0,5297.0,142085.0,126.0,51874028.0
3,FSM,Oceania,Micronesia (country),2023-01-22,22676.0,429.0,58.0,0.0,114178.0
4,WLF,Oceania,Wallis and Futuna,2023-01-22,3427.0,0.0,7.0,0.0,11596.0
5,NCL,Oceania,New Caledonia,2023-01-22,79921.0,0.0,314.0,0.0,289959.0
6,TCA,North America,Turks and Caicos Islands,2023-01-22,6522.0,0.0,38.0,0.0,45726.0
7,SPM,North America,Saint Pierre and Miquelon,2023-01-22,3411.0,0.0,2.0,0.0,5885.0
8,MEX,North America,Mexico,2023-01-22,7342277.0,797.0,331881.0,11.0,127504120.0
9,CUW,North America,Curacao,2023-01-22,45986.0,0.0,301.0,0.0,191173.0
