In [34]:
import numerical.sampling as num_sampling
import utils.preprocessing as preprocessing
import numerical.stratification as num_stratification
import numerical.combination as num_combination

import importlib 

importlib.reload(num_sampling) 

<module 'numerical.sampling' from 'c:\\Users\\Adem.ait\\OneDrive - University of Luxembourg\\Feina\\PhD\\sampling\\sample-creator\\numerical\\sampling.py'>

## Numerical sampling of 1 variable

### Preprocessing

In [35]:
file_path = r"type_likes.csv"
df = preprocessing.read_dataframe(file_path)
df = df.drop(columns=['type'])
preprocessing.remove_nan_df(df)
print(df)

variables = preprocessing.create_lists_from_df(df) #diccionario de las variables -> lista de valores
# keys, values = preprocessing.dictionary_to_lists(variables)
statistics = preprocessing.print_and_collect_statistics(variables)

The dataframe does not contain NaN values.
                                     name  likes
0                  0-hero/OIG-small-chip2      8
1                   0-hero/prompt-perfect      0
2                        000alen/semantic      0
3                      000hen/captchaCode      0
4                        003myjoker1/test      0
...                                   ...    ...
674822               zzzzxx/5656ChatPaper      0
674823                   zzzzxx/ChatPaper      0
674824                  zzzzzz/text2image      1
674825               zzzzzz567/Real-CUGAN      0
674826  zzzzzz567/vits-uma-genshin-honkai      0

[674827 rows x 2 columns]
Statistics for numerical variable 'likes':
  Population Size: 674827
  Mean: 1.134440382498033
  Median: 0.0
  Standard Deviation: 28.127517598840843



### Stratification

In [36]:
num_clusters_list = [3] # Specify the number of clusters for each variable
strata = num_stratification.create_stratum_kmeans(variables, num_clusters_list) # Apply KMeans clustering to each variable
num_stratification.print_stratum_counts(strata) #Print the number of elements in each stratum for each variable
stratum_dict = num_stratification.get_stratum_dict(strata)

Variable: likes
  Stratum 1: 674493 points
  Stratum 2: 21 points
  Stratum 3: 313 points


### Sampling

First, we calculate the required variables.

In [37]:
# MAIN CODE - PRESAMPLING
print("PRE-SAMPLING")
N, mu = num_sampling.extract_population_size_and_means(statistics)
print("Population size (N):", N)
print("Means (mu):", mu)
nis, phi = num_sampling.nis_phi(stratum_dict, N) 
s = num_sampling.calculate_std_devs_single(stratum_dict) # Calculate the standard deviations for each stratum
print("nis:", len(nis), nis)
print("phi:", len(phi), phi)
print("s: ", s)

#mu_rounded = round(mu[0], 2)
epsilon = mu[0] * 0.1
#epsilon_rounded = round(epsilon, 3)

PRE-SAMPLING
Population size (N): 674827
Means (mu): [1.134440382498033]
nis: 3 [674493, 21, 313]
phi: 3 [0.9995050583334691, 3.111908681780664e-05, 0.0004638225797130227]
s:  [7.216813436470078, 1749.9527198018666, 351.2434594786889]


In [38]:
n, ni = num_sampling.nStratifiedSampling(epsilon, 0.95, phi, s, 2, N, nis)
print("Size of the global sample:", n)
print("Sample sizes for each stratum:", ni)

mean_estimate, sampling_error, lower_confidence_interval, upper_confidence_interval = num_sampling.sampling_single(
    stratum_dict,   
    phi,      
    nis,      
    s,        
    ni        
)

print("\nSampling process:")
print("N: ", N, " --> n: ", n)
print("Mu: ", mu[0], " --> x-barra: ", mean_estimate)
print("Intervalo de confianza: (", lower_confidence_interval, upper_confidence_interval, ")")

Size of the global sample: 55998
Sample sizes for each stratum: [55971, 2, 25]

Sampling process:
N:  674827  --> n:  55998
Mu:  1.134440382498033  --> x-barra:  1.17392146282282
Intervalo de confianza: ( 1.0635548498844356 1.2842880757612045 )


## Numerical sampling of 2 variables

### Preprocessing

In [39]:
file_path = r"likes_downloads.csv"
df = preprocessing.read_dataframe(file_path)
preprocessing.remove_nan_df(df)
print(df)
variables = preprocessing.create_lists_from_df(df)
statistics = preprocessing.print_and_collect_statistics(variables)

The dataframe does not contain NaN values.
                                                    id  likes  downloads
0                        models/0-hero/flan-alpaca-ul2      4          1
1                          models/0-hero/flan-OIG-base      1          0
2                         models/0-hero/flan-OIG-small      0          1
3                           models/0-hero/flan-OIG-ul2      1          0
4                            models/0-hero/flan-OIG-xl      1          0
...                                                ...    ...        ...
557979                      datasets/zzzzhhh/test_data      0          0
557980                              datasets/zzzzzy/zy      0          0
557981                     datasets/zzzzzzttt/subtrain      0          0
557982                        datasets/zzzzzzttt/train      0          0
557983  datasets/zzzzzzzzzzzzzzzzzzzzzzzzzz/Netonelada      0          0

[557984 rows x 3 columns]
Statistics for numerical variable 'likes':
  Populatio

### Stratification

In [40]:
print("STRATIFICATION:")
num_clusters_list = [3, 3] # Specify the number of clusters for each variable
strata = num_stratification.create_stratum_kmeans(variables, num_clusters_list)
num_stratification.print_stratum_counts(strata) 

STRATIFICATION:
Variable: likes
  Stratum 1: 557907 points
  Stratum 2: 75 points
  Stratum 3: 2 points
Variable: downloads
  Stratum 1: 557953 points
  Stratum 2: 3 points
  Stratum 3: 28 points


### Combination

In [41]:
stratum_ranges = num_combination.get_stratum_ranges(strata)
print("Ranges:", stratum_ranges)
strata_combinations = num_combination.combination(stratum_ranges) 
print("Combinations of ranges:") 
for comb in strata_combinations:
    print(comb)

observations = df.values.tolist()
classified_observations_before = num_combination.classify_observations(observations, strata_combinations)

# Print the number of observations in each stratum in the order of defined strata
print("\nBefore dropping empty strata:")
num_combination.print_combination_stratum_counts(classified_observations_before, strata_combinations)
# Make a copy of classified observations
classified_observations_after = classified_observations_before.copy()
# Drop stratum with 0 observations
num_combination.drop_empty_strata(classified_observations_after, strata_combinations)
# Print the number of observations in each stratum in the order of defined strata after dropping empty strata
print("\nAfter dropping empty strata:")
num_combination.print_combination_stratum_counts(classified_observations_after, strata_combinations)

Ranges: {'likes': [[0, 762], [785, 4500], [6134, 9909]], 'downloads': [[0, 4325801], [36109954, 57594587], [4948671, 27832109]]}
Combinations of ranges:
[[0, 762], [0, 4325801]]
[[0, 762], [36109954, 57594587]]
[[0, 762], [4948671, 27832109]]
[[785, 4500], [0, 4325801]]
[[785, 4500], [36109954, 57594587]]
[[785, 4500], [4948671, 27832109]]
[[6134, 9909], [0, 4325801]]
[[6134, 9909], [36109954, 57594587]]
[[6134, 9909], [4948671, 27832109]]

Before dropping empty strata:
Stratum [[0, 762], [0, 4325801]]: 557882 observations
Stratum [[0, 762], [36109954, 57594587]]: 2 observations
Stratum [[0, 762], [4948671, 27832109]]: 23 observations
Stratum [[785, 4500], [0, 4325801]]: 69 observations
Stratum [[785, 4500], [36109954, 57594587]]: 1 observations
Stratum [[785, 4500], [4948671, 27832109]]: 5 observations
Stratum [[6134, 9909], [0, 4325801]]: 2 observations
Stratum [[6134, 9909], [36109954, 57594587]]: 0 observations
Stratum [[6134, 9909], [4948671, 27832109]]: 0 observations

Total sum 

### Sampling

First, we calculated the required variables.

In [42]:
print("PRE-SAMPLING")
N, mu = num_sampling.extract_population_size_and_means(statistics)
print("Population size (N):", N)
print("Means (mu):", mu)
nis, phi = num_sampling.nis_phi(classified_observations_after, N) # Calculate the stratum proportions
s = num_sampling.calculate_variable_std_devs_multiple(classified_observations_after) # Calculate the standard deviations for each stratum
print("nis:", len(nis), nis)
print("phi:", len(phi), phi)
print("s: ", s)
print("s:")
for variable_index, std_devs_for_variable in enumerate(s, start=1):
    print(f"  Variable {variable_index}: {std_devs_for_variable}")

sample_sizes, strata = num_sampling.calculate_sample_sizes(mu, 0.95, phi, s, 2, N, nis) # Call the function to calculate sample sizes
print("Size of the global sample:", sample_sizes)
print("Sample sizes for each stratum:", strata)

max_n, max_n_dist, max_n_idx = num_sampling.get_max_sample_distribution(sample_sizes, strata)
print("Variable wiht Max n: ", max_n, max_n_dist)

filtered_max_n_dist, filtered_phi, filtered_nis, filtered_s, filtered_classified_observations = num_sampling.filter_zero_strata(
    max_n_dist=max_n_dist,
    phi=phi,
    nis=nis,
    s=s,
    max_n_idx=max_n_idx,
    classified_observations=classified_observations_after
)

print("Filtered max_n_dist:", len(filtered_max_n_dist), filtered_max_n_dist)
print("Filtered phi:", len(filtered_phi), filtered_phi)
print("Filtered nis:", len(filtered_nis), filtered_nis)
print("Filtered s:", len(filtered_s), filtered_s)
print("Filtered classified_observations:", len(filtered_classified_observations))

PRE-SAMPLING
Population size (N): 557984
Means (mu): [1.1097880942822733, 1329.2409854045993]
nis: 7 [557882, 2, 23, 69, 1, 5, 2]
phi: 7 [0.9998171990594712, 3.584332167230602e-06, 4.1219819923151915e-05, 0.00012365945976945575, 1.792166083615301e-06, 8.960830418076504e-06, 3.584332167230602e-06]
s:  [[11.223240468320144, 0.0, 152.1445118278633, 879.9998234261777, 0.0, 1255.6007277395152, 0.0], [26513.112841280315, 0.0, 2951847.996405471, 443738.6895988686, 0.0, 9556331.109827414, 0.0]]
s:
  Variable 1: [11.223240468320144, 0.0, 152.1445118278633, 879.9998234261777, 0.0, 1255.6007277395152, 0.0]
  Variable 2: [26513.112841280315, 0.0, 2951847.996405471, 443738.6895988686, 0.0, 9556331.109827414, 0.0]
Size of the global sample: [65220, 237698]
Sample sizes for each stratum: [[65209, 1, 2, 8, 0, 0, 0], [237655, 1, 10, 30, 0, 2, 0]]
Variable wiht Max n:  237698 [237655, 1, 10, 30, 0, 2, 0]
Filtered max_n_dist: 5 [237655, 1, 10, 30, 2]
Filtered phi: 5 [0.9998171990594712, 3.584332167230602

In [43]:
num_sampling.sampling_multiple(filtered_classified_observations, filtered_phi, filtered_nis, filtered_s, max_n_idx, max_n, filtered_max_n_dist)


NameError: name 'variables' is not defined

In [24]:
N, mu = num_sampling.extract_population_size_and_means(statistics)
print("Population size (N):", N)
print("Means (mu):", mu)
nis, phi = num_sampling.nis_phi(strata, N) # Calculate the stratum proportions
s = num_sampling.calculate_variable_std_devs_multiple(strata) # Calculate the standard deviations for each stratum, TODO: which variable should we put here?
print("nis:", len(nis), nis)
print("phi:", len(phi), phi)
print("s: ", s)
print("s:")
for variable_index, std_devs_for_variable in enumerate(s, start=1):
    print(f"  Variable {variable_index}: {std_devs_for_variable}")

sample_sizes, strata_2 = num_sampling.calculate_sample_sizes(mu, 0.95, phi, s, 2, N, nis) # Call the function to calculate sample sizes
print("Size of the global sample:", sample_sizes)
print("Sample sizes for each stratum:", strata_2)

Population size (N): 557984
Means (mu): [1.1097880942822733, 1329.2409854045993]


TypeError: unhashable type: 'slice'