In [None]:
import softsampling.mixed.sampling as mix_sampling
import softsampling.utils.preprocessing as preprocessing
import softsampling.mixed.stratification as mix_stratification
import softsampling.mixed.combination as mix_combination

## Sampling of mixed variables

In this case, we sample observations described by two variables: one categorical and one numerical.

### Preprocessing

In [None]:
# Read the CSV and discard the "name" column before any further processing.
file_path = r"type_likes.csv"
df = preprocessing.read_dataframe(file_path).drop("name", axis=1)
# NOTE(review): the return value is ignored, so this presumably cleans `df`
# in place -- confirm against the implementation of remove_nan_df.
preprocessing.remove_nan_df(df)
print(df[:10])

# Convert the frame into the `variables` structure and split it by kind.
variables = preprocessing.create_lists_from_df(df)
numerical_variables, categorical_variables = preprocessing.separate_numerical_categorical(variables)

# Print how many values each variable holds, numerical group first.
for heading, group in (
    ("Numerical variables:", numerical_variables),
    ("\nCategorical variables:", categorical_variables),
):
    print(heading)
    for name, entries in group.items():
        print(f"{name}: {len(entries)}")

# Counters are built from the categorical variables only; `all_keys` is
# reused later when the combined strata are created.
counters = preprocessing.count_elements_in_variables_single(categorical_variables)
print("Counter: ", counters)
all_keys, all_values = preprocessing.dictionary_to_all_lists(counters)

# NOTE(review): the original remark here was "Only for numerical", yet the
# full `variables` collection is passed -- confirm the helper ignores the
# categorical entries.
statistics = preprocessing.print_and_collect_statistics(variables)

### Stratification

In [None]:
# Strata for the categorical variables are derived from their counters.
categorical_strata_dict = mix_stratification.create_strata_categoricals(counters)

print("STRATA DICT:")
for var_name, var_strata in categorical_strata_dict.items():
    print(f"Strata for variable '{var_name}': {len(var_strata)}")
    # Strata are reported with 1-based numbering.
    for position, members in enumerate(var_strata):
        print(f"Length of stratum {position + 1}: {len(members)}")


# Cluster counts handed to the k-means stratification helper; hard-coded
# until the Elbow Method is implemented in the next release.
num_clusters_list = [3]
numerical_strata_dict = mix_stratification.create_strata_kmeans(numerical_variables, num_clusters_list)
mix_stratification.print_stratum_counts(numerical_strata_dict)

# Combine the categorical and numerical strata into one dictionary.
merged_strata = mix_stratification.merge_strata_dicts(categorical_strata_dict, numerical_strata_dict)
print("MERGED STRATA")

for merged_name, merged_list in merged_strata.items():
    print(f"Variable: {merged_name}")
    for number, points in enumerate(merged_list, start=1):
        print(f"  Stratum {number}: {len(points)} points")

### Combination

In [None]:
# Ranges of the numerical strata are crossed with the categorical keys to
# form the combined strata.
numerical_ranges = mix_combination.get_stratum_ranges(numerical_strata_dict)
print("Ranges:", numerical_ranges)
combination_strata = mix_combination.combination(numerical_ranges, all_keys)
print(combination_strata)
print(len(combination_strata))
observations = mix_combination.df_to_list_observations(df)

# Assign every observation to a combined stratum and report the stratum sizes.
classified_observations = mix_combination.classify_mixed_observations(observations, combination_strata)
for stratum_key, members in classified_observations.items():
    print(f"Stratum: {stratum_key}: {len(members)} observations")

# Sum of the stratum sizes; printed so it can be compared by eye with the
# population size.
total_observations = sum(len(members) for members in classified_observations.values())
print("Total Observations:", total_observations)

### Sampling

In [None]:
# Population size and means are extracted from the statistics collected
# during preprocessing.
N, means = mix_sampling.extract_population_size_and_means(statistics)
print(f"Population Size: {N}")

# Per-stratum observation counts (nis) and proportions (phi), computed from
# the classified observations and the population size.
nis, phi = mix_sampling.nis_phi(classified_observations, N)
print(f"Number of observations in each stratum (nis): {nis}")
print(f"Proportion of each stratum (phi): {phi}")

# Inputs to the sample-size computation.
# NOTE(review): presumably the tolerated error and the confidence level --
# confirm against mix_sampling.sample_size.
epsilon = 0.05
confidence = 0.95
n = mix_sampling.sample_size(epsilon, confidence)
print("Required sample size:", n)

# Distribute the overall sample size n across the combined strata using phi.
ni_size = mix_sampling.determine_ni_size(phi, combination_strata, n)
# Header text corrected: it previously read "Ssample size of each strata:".
print("Sample size of each stratum:")
for stratum_key, size in ni_size.items():
    print(f"Stratum {stratum_key}: {size} observations")

In [None]:
# Draw the final sample using the per-stratum sizes computed above.
# NOTE(review): if create_sample selects observations randomly, a fixed seed
# is needed for reproducible output -- confirm against its implementation.
combined_sample = mix_sampling.create_sample(classified_observations, ni_size)
print("Combined Sample Size:", len(combined_sample))

print("\nFirst 10 elements of the combined sample:")
# Show at most ten entries; shorter samples are printed in full.
preview_count = min(10, len(combined_sample))
for position in range(preview_count):
    print(combined_sample[position])