In [1]:
!pip install pystan



In [2]:
import pandas as pd
import cmdstanpy
import matplotlib.pyplot as plt

# Read the observations from the CSV file located next to the notebook.
df = pd.read_csv('./golden_retrievers.csv')

# Peek at the leading rows to confirm the layout before modelling.
print(df.head(5))

# Package the 'weight' column in the shape the Stan data block declares:
# N (number of observations) and weights (the observed values).
weights = df.loc[:, 'weight'].values
data = dict(N=len(weights), weights=weights)


      weight
0  65.847758
1  74.322877
2  74.524089
3  65.867708
4  78.392691


In [3]:
# Stan program for a two-component normal mixture over the observed weights.
# - `ordered[2] mean_weights` keeps mean_weights[1] < mean_weights[2]; the prior
#   comments label component 1 as female and component 2 as male.
# - Each component has its own standard deviation, declared with lower bound 0.
# - `mix_proportions` is a 2-simplex; only its first entry is passed to log_mix,
#   which weights the first component by it and the second by the remainder.
stan_model_code = """
data {
  int<lower=0> N; // Number of observations
  vector[N] weights; // Observed weights
}

parameters {
  ordered[2] mean_weights;
  real<lower=0> sd_female; // Standard deviation for female dogs
  real<lower=0> sd_male; // Standard deviation for male dogs
  simplex[2] mix_proportions; // Mixing proportions for the two distributions
}

model {
  // Priors
  mean_weights[1] ~ normal(0, 50); // Prior for mean_female, now mean_weights[1]
  mean_weights[2] ~ normal(0, 50); // Prior for mean_male, now mean_weights[2]
  sd_female ~ normal(0, 50);
  sd_male ~ normal(0, 50);
  
  // Likelihood
  for (n in 1:N) {
    target += log_mix(mix_proportions[1],
                      normal_lpdf(weights[n] | mean_weights[1], sd_female),
                      normal_lpdf(weights[n] | mean_weights[2], sd_male));
  }
}
"""


In [4]:
# Persist the Stan program to disk so it can be compiled from a file path.
model_file_path = 'mixture_model.stan'
# Pin the encoding so the written file does not depend on the platform's locale default.
with open(model_file_path, 'w', encoding='utf-8') as model_file:
    model_file.write(stan_model_code)

In [5]:
# Compile the model from the same path the Stan program was written to,
# instead of repeating the filename as a second hard-coded literal.
model = cmdstanpy.CmdStanModel(stan_file=model_file_path)

18:51:14 - cmdstanpy - INFO - compiling stan file /Users/siyuwu/558_HW2/mixture_model.stan to exe file /Users/siyuwu/558_HW2/mixture_model
18:51:26 - cmdstanpy - INFO - compiled model executable: /Users/siyuwu/558_HW2/mixture_model


In [6]:
# Draw posterior samples with four chains run in parallel. The fixed seed makes the
# draws, and every summary derived from them, reproducible after a kernel restart.
fit = model.sample(data=data, chains=4, parallel_chains=4, seed=42)

18:51:26 - cmdstanpy - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

18:52:05 - cmdstanpy - INFO - CmdStan done processing.
Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column 4 to line 25, column 74)
	Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column 4 to line 25, column 74)
	Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column 4 to line 25, column 74)
Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column 4 to line 25, column 74)
	Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column 4 to line 25, column 74)
	Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column 4 to line 25, column 74)
	Exception: normal_lpdf: Location parameter is inf, but must be finite! (in 'mixture_model.stan', line 23, column




In [8]:
# Build the per-parameter summary table here: no other cell in the notebook defines
# `summary_stats`, so without this line a fresh kernel fails with NameError.
summary_stats = fit.summary()

# Rows are looked up by parameter name; component 1 is the lower of the two ordered means.
print("Summary for Female Mean Weight:")
print(summary_stats.loc['mean_weights[1]'])

print("\nSummary for Male Mean Weight:")
print(summary_stats.loc['mean_weights[2]'])

Summary for Female Mean Weight:
Mean         65.288000
MCSE          0.002502
StdDev        0.149964
5%           65.042500
50%          65.287200
95%          65.532100
N_Eff      3592.200000
N_Eff/s     533.918000
R_hat         1.000850
Name: mean_weights[1], dtype: float64

Summary for Male Mean Weight:
Mean         75.333900
MCSE          0.003305
StdDev        0.218790
5%           74.954500
50%          75.344800
95%          75.682800
N_Eff      4381.350000
N_Eff/s     651.211000
R_hat         0.999836
Name: mean_weights[2], dtype: float64


In [9]:
# Collect the draws from `fit` (the cmdstanpy sampling result) into a pandas DataFrame;
# later cells select columns by parameter name, e.g. 'mean_weights[1]'.
samples = fit.draws_pd()

In [10]:
import numpy as np
import pandas as pd

# Posterior draws of the two ordered component means (lower mean first).
mean_female_samples = samples.loc[:, 'mean_weights[1]']
mean_male_samples = samples.loc[:, 'mean_weights[2]']

# Central 95% interval of each set of draws: the 2.5th and 97.5th percentiles.
ci_female = np.percentile(mean_female_samples, q=[2.5, 97.5])
ci_male = np.percentile(mean_male_samples, q=[2.5, 97.5])

print(f"95% CI for female golden retrievers' weight: {ci_female}")
print(f"95% CI for male golden retrievers' weight: {ci_male}")


95% CI for female golden retrievers' weight: [64.9990625 65.577315 ]
95% CI for male golden retrievers' weight: [74.884055 75.744315]


In [11]:
# Average each mixing-proportion column over all posterior draws.
mix_proportions_samples = samples.loc[:, ['mix_proportions[1]', 'mix_proportions[2]']]
mix_proportions_mean = mix_proportions_samples.mean(axis='index')

# Ratio of the two averaged proportions: component 1 (labelled female)
# divided by component 2 (labelled male).
ratio_female_to_male = (
    mix_proportions_mean.loc['mix_proportions[1]']
    / mix_proportions_mean.loc['mix_proportions[2]']
)

print(f"Estimated ratio of female to male dogs: {ratio_female_to_male:.2f}")


Estimated ratio of female to male dogs: 2.08
