In [16]:
import categorical.stratification as cat_stratification
import utils.preprocessing as preprocessing
import categorical.sampling as cat_sampling 
import categorical.combination as cat_combination

## Categorical sampling of 1 variable

### Preprocessing

In [13]:
print("PREPROCESSING:")
file_path = r"type_likes.csv" 
df = preprocessing.read_dataframe(file_path)
df = df.drop("likes", axis=1)
df = df.drop("name", axis=1)
print(df[:10])
preprocessing.remove_nan_df(df)
variables = preprocessing.create_lists_from_df(df) 
counters = preprocessing.count_elements_in_variables_single(variables)

all_keys, all_values = preprocessing.dictionary_to_all_lists(counters)

statistics = preprocessing.print_and_collect_statistics_single(variables)

PREPROCESSING:
      type
0  dataset
1  dataset
2  dataset
3  dataset
4  dataset
5  dataset
6  dataset
7  dataset
8  dataset
9  dataset
The dataframe does not contain NaN values.
Statistics for categorical variable 'type':
  Population Size: 674827



### Stratification

In [14]:
strata_dict = cat_stratification.create_strata_single(counters) 
print("STRATA DICT:")
for key, values in strata_dict.items():
    print(f"Length of '{key}': {len(values)}")

STRATA DICT:
Length of 'dataset': 101681
Length of 'model': 456303
Length of 'space': 116843


### Sampling

In [15]:
N, _ = cat_sampling.extract_population_size_and_means(statistics)
print(f"Population Size: {N}")
nis, phi = cat_sampling.nis_phi(strata_dict, N)
print(f"Number of observations in each stratum (nis): {nis}")
print(f"Proportion of each stratum (phi): {phi}")

epsilon = 0.05
confidence = 0.95
n = cat_sampling.sample_size(epsilon, confidence)
print("Required sample size:", n)

ni_size = cat_sampling.determine_ni_size_single(phi, all_keys, n)
print("Sample size of each strata:")
for stratum_key, size in ni_size.items():
    print(f"Stratum {stratum_key}: {size} observations")

Population Size: 674827
Number of observations in each stratum (nis): [101681, 456303, 116843]
Proportion of each stratum (phi): [0.1506771365105427, 0.6761777462964582, 0.1731451171929991]
Required sample size: 385
Sample size of each strata:
Stratum dataset: 58 observations
Stratum model: 260 observations
Stratum space: 67 observations


In [17]:
sample = cat_sampling.create_sample(ni_size, strata_dict)

In [18]:
print(f"Total stratified sample size: {len(sample)}")

Total stratified sample size: 385


In [19]:
final_combination_counts = cat_sampling.count_combinations_final(sample)

print("Counts of each combination:")
for combination, count in final_combination_counts.items():
    print(f"Combination: {combination}, Count: {count}")

Counts of each combination:
Combination: dataset, Count: 58
Combination: model, Count: 260
Combination: space, Count: 67


## Categorical sampling of 2 variable

In [20]:
print("PREPROCESSING:")
file_path = r"type_pull.csv" 
df = preprocessing.read_dataframe(file_path)
df.insert(0, 'New_Index', range(1, len(df) + 1)) # Add a new column as index
preprocessing.remove_nan_df(df)
print(df[:10])
variables = preprocessing.create_lists_from_df(df) 
counters = preprocessing.count_elements_in_variables(variables)
print(counters)

all_keys, all_values = preprocessing.dictionary_to_all_lists(counters)


statistics = preprocessing.print_and_collect_statistics(variables)

PREPROCESSING:
The dataframe does not contain NaN values.
   New_Index     type  is_pull_request
0          1  dataset                0
1          2  dataset                0
2          3  dataset                0
3          4  dataset                0
4          5  dataset                0
5          6  dataset                0
6          7  dataset                1
7          8  dataset                1
8          9  dataset                0
9         10  dataset                0
{'type': Counter({'model': 109700, 'dataset': 35258, 'space': 22531}), 'is_pull_request': Counter({1: 105829, 0: 61660})}
Statistics for categorical variable 'type':
  Population Size: 167489

Statistics for numerical variable 'is_pull_request':
  Population Size: 167489
  Mean: 0.6318564204216396
  Median: 1.0
  Standard Deviation: 0.4823006162069377



### Stratification

In [21]:
strata_dict = cat_stratification.create_strata_multiple(counters) 
for variable, strata in strata_dict.items():
    print(f"Strata for variable '{variable}': {len(strata)}")
    for i, sublist in enumerate(strata, start=1):
        print(f"Length of stratum {i}: {len(sublist)}")

Strata for variable 'type': 3
Length of stratum 1: 35258
Length of stratum 2: 109700
Length of stratum 3: 22531
Strata for variable 'is_pull_request': 2
Length of stratum 1: 61660
Length of stratum 2: 105829


### Combination

In [22]:
combination_strata = cat_combination.combination(all_keys)
print(combination_strata)
observations = cat_combination.df_to_list_observations(df)

count_onservations_combination = cat_combination.count_combinations(observations, combination_strata)
print(count_onservations_combination)

classified_observations = cat_combination.classify_observations(observations, combination_strata)
print("CLASSIFIED OBSERVATIONS")
for comb, obs_list in classified_observations.items():
    print(f"Stratum {comb}: {len(obs_list)} observations")


for comb, obs_list in classified_observations.items():
    for obs in obs_list:
        if len(obs) != 3:
            raise ValueError(f"Each sublist in the observations associated with combination {comb} must have a length of three.")
        


[['dataset', 0], ['dataset', 1], ['model', 0], ['model', 1], ['space', 0], ['space', 1]]
{'(dataset, 0)': 22557, '(dataset, 1)': 12701, '(model, 0)': 20715, '(model, 1)': 88985, '(space, 0)': 18388, '(space, 1)': 4143}
CLASSIFIED OBSERVATIONS
Stratum (dataset, 0): 22557 observations
Stratum (dataset, 1): 12701 observations
Stratum (model, 1): 88985 observations
Stratum (model, 0): 20715 observations
Stratum (space, 1): 4143 observations
Stratum (space, 0): 18388 observations


### Sampling

First, we calculate the required variables.

In [24]:
N, means = cat_sampling.extract_population_size_and_means(statistics)
print(f"Population Size: {N}")
nis, phi = cat_sampling.nis_phi(classified_observations, N)
print(f"Number of observations in each stratum (nis): {nis}")
print(f"Proportion of each stratum (phi): {phi}")


epsilon = 0.05
confidence = 0.95
n = cat_sampling.sample_size(epsilon, confidence)
print("Required sample size:", n)

ni_size = cat_sampling.determine_ni_size_multiple(phi, combination_strata, n)
print("Sample size of each stratum:")
for stratum_key, size in ni_size.items():
    print(f"Stratum {stratum_key}: {size} observations")

Population Size: 167489
Number of observations in each stratum (nis): [22557, 12701, 88985, 20715, 4143, 18388]
Proportion of each stratum (phi): [0.13467750120903463, 0.07583184567344721, 0.5312886219393512, 0.12367976404420589, 0.024735952808841177, 0.10978631432511986]
Required sample size: 385
Sample size of each stratum:
Stratum (dataset, 0): 52 observations
Stratum (dataset, 1): 29 observations
Stratum (model, 0): 204 observations
Stratum (model, 1): 48 observations
Stratum (space, 0): 10 observations
Stratum (space, 1): 42 observations


In [25]:
sample = cat_sampling.create_sample(ni_size, classified_observations)

print(f"Total stratified sample size: {len(sample)}")
print("Sample entries:")
for i in range(10):  
    print(sample[i])

final_combination_counts = cat_sampling.count_combinations_final_multiple(sample)

print("Counts of each combination:")
for combination, count in final_combination_counts.items():
    print(f"Combination: {combination}, Count: {count}")

Total stratified sample size: 385
Sample entries:
[1, 'dataset', 0]
[2, 'dataset', 0]
[3, 'dataset', 0]
[4, 'dataset', 0]
[5, 'dataset', 0]
[6, 'dataset', 0]
[9, 'dataset', 0]
[10, 'dataset', 0]
[11, 'dataset', 0]
[12, 'dataset', 0]
Counts of each combination:
Combination: ('dataset', 0), Count: 52
Combination: ('dataset', 1), Count: 29
Combination: ('model', 0), Count: 204
Combination: ('model', 1), Count: 48
Combination: ('space', 0), Count: 10
Combination: ('space', 1), Count: 42
