In [51]:
import softsampling.mixed.sampling as mix_sampling
import softsampling.utils.preprocessing as preprocessing
import softsampling.mixed.stratification as mix_stratification
import softsampling.mixed.combination as mix_combination

## Sampling of mixed variables

In this example, we sample a dataset with two variables: one categorical (`type`) and one numerical (`likes`).

### Preprocessing

In [52]:
# Load the raw table and discard the `name` column; the recorded output shows
# that the `type` and `likes` columns remain.
file_path = r"type_likes.csv"
df = preprocessing.read_dataframe(file_path).drop(columns="name")

# NOTE(review): the return value is not captured. In the recorded run the helper
# only reported that there are no NaN values; if it returns a cleaned frame
# instead of mutating `df`, that result is being discarded here — confirm.
preprocessing.remove_nan_df(df)
print(df.head(10))

# Turn the frame into per-variable value lists and split them by kind.
variables = preprocessing.create_lists_from_df(df)
numerical_variables, categorical_variables = preprocessing.separate_numerical_categorical(variables)

# Report how many values each variable holds, grouped by kind.
for heading, group in (
    ("Numerical variables:", numerical_variables),
    ("\nCategorical variables:", categorical_variables),
):
    print(heading)
    for name, column in group.items():
        print(f"{name}: {len(column)}")

# Per-variable frequency counts of the categorical values.
counters = preprocessing.count_elements_in_variables_single(categorical_variables)
print("Counter: ", counters)
all_keys, all_values = preprocessing.dictionary_to_all_lists(counters)

# Summary statistics apply to numerical variables only. The full `variables`
# mapping is passed; in the recorded run only `likes` was summarised.
statistics = preprocessing.print_and_collect_statistics(variables)

The dataframe does not contain NaN values.
      type  likes
0  dataset      8
1  dataset      0
2  dataset      0
3  dataset      0
4  dataset      0
5  dataset      0
6  dataset      0
7  dataset      0
8  dataset      0
9  dataset      0
Numerical variables:
likes: 674827

Categorical variables:
type: 674827
Counter:  {'type': Counter({'model': 456303, 'space': 116843, 'dataset': 101681})}
Statistics for numerical variable 'likes':
  Population Size: 674827
  Mean: 1.134440382498033
  Median: 0.0
  Standard Deviation: 28.127517598840843



### Stratification

In [54]:
# Build the strata of each categorical variable from its frequency counters.
categorical_strata_dict = mix_stratification.create_strata_categoricals(counters)

print("STRATA DICT:")
for cat_name, cat_strata in categorical_strata_dict.items():
    print(f"Strata for variable '{cat_name}': {len(cat_strata)}")
    for position, members in enumerate(cat_strata, start=1):
        print(f"Length of stratum {position}: {len(members)}")

# Number of k-means clusters to use (presumably one entry per numerical
# variable — TODO confirm). Elbow Method to be implemented in next release.
num_clusters_list = [3]
numerical_strata_dict = mix_stratification.create_strata_kmeans(
    numerical_variables, num_clusters_list
)
mix_stratification.print_stratum_counts(numerical_strata_dict)

# Combine the categorical and numerical strata into a single mapping.
merged_strata = mix_stratification.merge_strata_dicts(
    categorical_strata_dict, numerical_strata_dict
)
print("MERGED STRATA")

for merged_name, merged_list in merged_strata.items():
    print(f"Variable: {merged_name}")
    for position, points in enumerate(merged_list, start=1):
        print(f"  Stratum {position}: {len(points)} points")

STRATA DICT:
Strata for variable 'type': 3
Length of stratum 1: 101681
Length of stratum 2: 456303
Length of stratum 3: 116843
Variable: likes
  Stratum 1: 674493 points
  Stratum 2: 21 points
  Stratum 3: 313 points
MERGED STRATA
Variable: type
  Stratum 1: 101681 points
  Stratum 2: 456303 points
  Stratum 3: 116843 points
Variable: likes
  Stratum 1: 674493 points
  Stratum 2: 21 points
  Stratum 3: 313 points


### Combination

In [56]:
# Derive a range for every numerical stratum, then pair the ranges with the
# categorical keys to obtain the combined strata.
numerical_ranges = mix_combination.get_stratum_ranges(numerical_strata_dict)
print("Ranges:", numerical_ranges)
combination_strata = mix_combination.combination(numerical_ranges, all_keys)
print(combination_strata)
print(len(combination_strata))

# Assign every observation of the frame to one of the combined strata.
observations = mix_combination.df_to_list_observations(df)
classified_observations = mix_combination.classify_mixed_observations(
    observations, combination_strata
)

# Show the size of each stratum, followed by the overall total. In the recorded
# run the total equals the population size, i.e. every observation was placed.
for stratum, members in classified_observations.items():
    print(f"Stratum: {stratum}: {len(members)} observations")
total_observations = sum(len(members) for members in classified_observations.values())
print("Total Observations:", total_observations)

Ranges: {'likes': [[0, 315], [2285, 9909], [317, 1930]]}
[['dataset', [0, 315]], ['dataset', [2285, 9909]], ['dataset', [317, 1930]], ['model', [0, 315]], ['model', [2285, 9909]], ['model', [317, 1930]], ['space', [0, 315]], ['space', [2285, 9909]], ['space', [317, 1930]]]
9
Stratum: ['dataset', [0, 315]]: 101657 observations
Stratum: ['dataset', [2285, 9909]]: 1 observations
Stratum: ['dataset', [317, 1930]]: 23 observations
Stratum: ['model', [0, 315]]: 456066 observations
Stratum: ['model', [2285, 9909]]: 17 observations
Stratum: ['model', [317, 1930]]: 220 observations
Stratum: ['space', [0, 315]]: 116770 observations
Stratum: ['space', [2285, 9909]]: 3 observations
Stratum: ['space', [317, 1930]]: 70 observations
Total Observations: 674827


### Sampling

In [58]:
# Population size and means are extracted from the statistics collected during
# preprocessing.
N, means = mix_sampling.extract_population_size_and_means(statistics)
print(f"Population Size: {N}")

# nis: number of observations in each combined stratum.
# phi: proportion of the population that falls in each stratum.
nis, phi = mix_sampling.nis_phi(classified_observations, N)
print(f"Number of observations in each stratum (nis): {nis}")
print(f"Proportion of each stratum (phi): {phi}")

# Precision targets passed to the sample-size computation.
# NOTE(review): epsilon is presumably the tolerated margin of error and
# confidence the confidence level — confirm against `sample_size`.
epsilon = 0.05
confidence = 0.95
n = mix_sampling.sample_size(epsilon, confidence)
print("Required sample size:", n)

# Split the overall sample size across the combined strata using phi.
# NOTE(review): in the recorded run every stratum with a very small share was
# allocated 0 observations, so the rare high-`likes` strata are absent from the
# sample — confirm that this is acceptable for the analysis.
ni_size = mix_sampling.determine_ni_size(phi, combination_strata, n)
print("Sample size of each stratum:")  # heading typo fixed (was "Ssample ... strata")
for stratum_key, size in ni_size.items():
    print(f"Stratum {stratum_key}: {size} observations")

Population Size: 674827
Number of observations in each stratum (nis): [101657, 1, 23, 456066, 17, 220, 116770, 3, 70]
Proportion of each stratum (phi): [0.15064157183989377, 1.4818612770384112e-06, 3.408280937188346e-05, 0.6758265451738001, 2.5191641709652992e-05, 0.0003260094809484505, 0.1730369413197753, 4.445583831115234e-06, 0.00010373028939268879]
Required sample size: 385
Ssample size of each strata:
Stratum (dataset, [0, 315]): 58 observations
Stratum (dataset, [2285, 9909]): 0 observations
Stratum (dataset, [317, 1930]): 0 observations
Stratum (model, [0, 315]): 260 observations
Stratum (model, [2285, 9909]): 0 observations
Stratum (model, [317, 1930]): 0 observations
Stratum (space, [0, 315]): 67 observations
Stratum (space, [2285, 9909]): 0 observations
Stratum (space, [317, 1930]): 0 observations


In [59]:
# Draw the sample from the classified observations using the per-stratum sizes.
# NOTE(review): no random seed is set in this notebook; if the draw is random,
# re-running will give a different sample — confirm how `create_sample` selects.
combined_sample = mix_sampling.create_sample(classified_observations, ni_size)
sample_count = len(combined_sample)
print("Combined Sample Size:", sample_count)

# Preview at most the first ten sampled observations.
print("\nFirst 10 elements of the combined sample:")
for observation in combined_sample[:10]:
    print(observation)

Combined Sample Size: 385

First 10 elements of the combined sample:
['dataset', 0]
['dataset', 0]
['dataset', 0]
['dataset', 0]
['dataset', 0]
['dataset', 0]
['dataset', 2]
['dataset', 0]
['dataset', 1]
['dataset', 0]
