In [1]:
import pandas as pd
from scipy import stats
import numpy as np

In [2]:
df=pd.read_csv('house_clean.csv')

                                                            ONE SAMPLE T-TEST
                                                  ---*---*---*---*---*---*---*---*---*---

In [4]:
price = df['price']
mean_price = np.mean(price)
print(round(mean_price,0))

464707.0


In [5]:
np.random.seed(11)
sample_size = 25
sample_price= np.random.choice(price,sample_size)
sample_price

array([798000., 469000., 570000., 334000., 270000., 225000., 480000.,
       223000., 274500., 493000., 555000., 500000., 275000., 469000.,
       744000., 490000., 309212., 457500., 280000., 202000., 401000.,
       405000., 349900., 295000., 562500.])

In [6]:
t_value,p_value = stats.ttest_1samp(a=sample_price,popmean=mean_price)

In [12]:
print("SUMMARY OF ONE_SAMPLE_T-TEST")
print("**********************")
print("p_value:",p_value)
print("t_statistics:",t_value)
if p_value<0.05:
    print("conclusion: ")
    print("Result: Reject H0 → Mean price is significantly difference")
else:
    print("conclusion: ")
    print("Result: Fail to reject H0 → No significant difference")
print("**********************")

SUMMARY OF ONE_SAMPLE_T-TEST
**********************
p_value: 0.14159704101525317
t_statistics: -1.5199193949289218
conclusion: 
Result: Fail to reject H0 → No significant difference
**********************


                                                            TWO SAMPLE T-TEST
                                                  ---*---*---*---*---*---*---*---*---*---

In [25]:

sample_1 = df[df["waterfront"] == 'yes']["price"]
sample_2 = df[df["waterfront"] == 'no']["price"]

sample1_mean = sample_1.mean()
sample2_mean = sample_2.mean()
sample1_std = sample_1.std(ddof=1)
sample2_std = sample_2.std(ddof=1)


sample1_n = len(sample_1)
sample2_n = len(sample_2)


z = (sample1_mean - sample2_mean) / np.sqrt((sample1_std**2 / sample1_n) + (sample2_std**2 / sample2_n))


p = 2 * (1 - stats.norm.cdf(abs(z)))


print("SUMMARY OF TWO_SAMPLE_T-TEST")
print("****************************")
print("Mean price (if waterfront):", sample1_mean)
print("Mean price (if no waterfront):", sample2_mean)
print("Z-score:", z)
print("p-value:", p)

alpha = 0.05
if p < alpha:
    print("Reject H0: Mean prices are significantly different between the two groups.")
else:
    print("Fail to reject H0: Mean prices are not significantly different between the two groups.")
print("****************************")


SUMMARY OF TWO_SAMPLE_T-TEST
****************************
Mean price (if waterfront): 512949.8467781818
Mean price (if no waterfront): 464358.711680854
Z-score: 2.92620831122721
p-value: 0.0034312110052354683
Reject H0: Mean prices are significantly different between the two groups.
****************************


                                                              ONE WAY ANOVA
                                                  ---*---*---*---*---*---*---*---*---*---

In [30]:
anova_data = [group['price'].dropna() for name, group in df.groupby('condition')]
f_stat, p_val3 = stats.f_oneway(*anova_data)

print("SUMMARY OF ONE WAY ANOVA")
print("****************************")
print(f"F-statistic = {f_stat:.4f}, p-value = {p_val3:.4f}")
if p_val3 <= alpha:
    print("Result: Reject H₀ → At least one condition group has a different mean price.\n")
else:
    print("Result: Fail to reject H₀ → No significant difference among condition groups.\n")


SUMMARY OF ONE WAY ANOVA
****************************
F-statistic = 18.5552, p-value = 0.0000
Result: Reject H₀ → At least one condition group has a different mean price.



                                                              CHI-SQUARE TEST
                                                  ---*---*---*---*---*---*---*---*---*---

In [32]:
from scipy.stats import chi2_contingency

house= pd.read_csv('house_clean.csv')



contingency_table = pd.crosstab(house['view'],house['waterfront'])

chi2, p, dof, expected = chi2_contingency(contingency_table)

print("contingency_table", contingency_table)
print("Chi-square statistic:", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)
print('expected frequencie',expected)

if p < 0.05:
    print("Reject the null hypothesis - variables are not independent")
else:
    print("Fail to reject the null hypothesis - variables are independent")

contingency_table waterfront    no  yes
view                 
No          4140    0
average      200    5
excellent     47   23
good         111    5
poor          69    0
Chi-square statistic: 1075.17417766193
p-value: 1.8203282810536785e-231
Degrees of freedom: 4
expected frequencie [[4.11030000e+03 2.97000000e+01]
 [2.03529348e+02 1.47065217e+00]
 [6.94978261e+01 5.02173913e-01]
 [1.15167826e+02 8.32173913e-01]
 [6.85050000e+01 4.95000000e-01]]
Reject the null hypothesis - variables are not independent
