In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [2]:
# Read the merged nomads dataset; the file uses ';' as its field separator.
df = pd.read_csv(
    "../data/clean/nomads_all_merged.csv",
    sep=";",
)

In [3]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
df

Unnamed: 0,rank,city,country,country_code,meal,mcdonalds,beer,coffee,monthly_pass,monthly_rent,...,climate_value,climate_category,cost_of_living_value,cost_of_living_category,pollution_value,pollution_category,quality_of_life_value,quality_of_life_category,internet_speed,digital_nomads_count
0,1,Bangkok,Thailand,TH,3.0,5.0,2.0,2.0,36.0,567.0,...,69.76,High,33.88,Very Low,75.65,High,105.37,Low,225.17,622.0
1,2,Da Nang,Vietnam,VN,2.0,4.0,1.0,2.0,7.0,312.0,...,71.24,High,26.85,Very Low,84.19,Very High,97.18,Very Low,78.34,60.0
2,3,Kuala Lumpur,Malaysia,MY,3.0,4.0,2.0,3.0,22.0,418.0,...,56.55,Moderate,29.82,Very Low,61.14,High,136.22,Moderate,107.55,337.0
3,4,Cape Town,South Africa,ZA,8.0,4.0,1.0,2.0,28.0,658.0,...,95.25,Very High,31.30,Very Low,56.56,Moderate,155.84,High,54.75,97.0
4,5,Buenos Aires,Argentina,AR,6.0,5.0,1.0,2.0,10.0,297.0,...,98.28,Very High,32.65,Very Low,50.81,Moderate,115.06,Low,58.87,156.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1299,1364,Dar es Salaam,Tanzania,TZ,2.0,6.0,1.0,2.0,5.0,318.0,...,71.28,High,25.31,Very Low,62.48,High,0.00,Very Low,19.77,12.0
1300,1365,Saint Petersburg,Russia,RU,10.0,6.0,1.0,2.0,23.0,395.0,...,43.60,Moderate,25.25,Very Low,59.13,Moderate,113.56,Low,95.96,57.0
1301,1366,Yangon,Myanmar,MM,3.0,5.0,1.0,2.0,23.0,317.0,...,0.00,,35.88,Very Low,89.79,Very High,0.00,,20.69,40.0
1302,1368,Tehran,Iran,IR,4.0,5.0,0.0,1.0,16.0,516.0,...,70.99,High,24.03,Very Low,75.24,High,85.42,Very Low,21.36,11.0


## Hypothesis testing 1
#### Null Hypothesis (H₀):
##### There is no significant difference in the number of digital nomads between low-cost and high-cost cities.


#### Alternative Hypothesis (H₁):
##### Low-cost-of-living cities have significantly more digital nomads than high-cost-of-living cities.

In [None]:
# Total digital_nomads_count per cost_of_living_category, rounded and ordered
# from the largest total to the smallest (descriptive overview table).
df_cat = (
    df.groupby("cost_of_living_category")["digital_nomads_count"]
    .sum()
    .round()
    .reset_index()
    .sort_values(by="digital_nomads_count", ascending=False)
)
df_cat

In [None]:
# Build the two samples from the per-city rows of `df`, not from the
# per-category totals in `df_cat`. The totals give at most two numbers per group
# and grow with the number of cities in a category, so a t-test on them cannot
# say whether a typical low-cost city has more nomads than a typical high-cost one.
is_low_cost = df["cost_of_living_category"].isin(["Very Low", "Low"])
is_high_cost = df["cost_of_living_category"].isin(["High", "Very High"])

# Drop cities without a nomad count so NaN cannot propagate into the test statistic.
low_cost = df.loc[is_low_cost, "digital_nomads_count"].dropna()
high_cost = df.loc[is_high_cost, "digital_nomads_count"].dropna()

In [None]:
# Welch's t-test (equal_var=False: the two groups are not assumed to share a variance).
# H1 is directional -- low-cost cities have MORE nomads -- so test one-sided:
# alternative="greater" means mean(low_cost) > mean(high_cost).
# The `alternative` keyword requires SciPy >= 1.6.
t_stat, p_value = ttest_ind(low_cost, high_cost, equal_var=False, alternative="greater")

# One-row summary table that a later cell displays.
results = pd.DataFrame({"Test": ["Independent t-test"], "T-Statistic": [t_stat], "P-Value": [p_value]})

In [None]:
alpha = 0.05  # Significance level

# H1 claims a direction (low-cost > high-cost). ttest_ind(low_cost, high_cost)
# yields a positive statistic when the low-cost mean is the larger one, so the
# "low-cost cities attract more" conclusion needs both a small p-value and
# t_stat > 0. A significant result with the opposite sign is reported separately
# instead of being presented as support for H1.
if p_value < alpha and t_stat > 0:
    interpretation = "Reject H₀: There is a significant difference, and low-cost cities attract significantly more digital nomads."
elif p_value < alpha:
    interpretation = "Reject H₀: There is a significant difference, but high-cost cities attract more digital nomads, which does not support H₁."
else:
    interpretation = "Fail to reject H₀: There is no significant difference in digital nomads between low-cost and high-cost cities."

interpretation

In [None]:
# Display the one-row summary (test name, t-statistic, p-value) for hypothesis 1.
results

## Hypothesis testing 2
#### Null Hypothesis (H₀): 
##### There is no significant difference in the number of digital nomads between low-quality-of-life and high-quality-of-life cities.
#### Alternative Hypothesis (H₁): 
##### High-quality-of-life cities attract significantly more digital nomads than low-quality-of-life cities.

In [None]:
# Total digital_nomads_count per quality_of_life_category, rounded and ordered
# from the largest total to the smallest (descriptive overview table).
df_cat = (
    df.groupby("quality_of_life_category")["digital_nomads_count"]
    .sum()
    .round()
    .reset_index()
    .sort_values(by="digital_nomads_count", ascending=False)
)
df_cat

In [None]:
# Build the two samples from the per-city rows of `df`, not from the
# per-category totals in `df_cat`. The totals give at most two numbers per group
# and grow with the number of cities in a category, so a t-test on them cannot
# say whether a typical high-quality-of-life city has more nomads.
is_low_qol = df["quality_of_life_category"].isin(["Very Low", "Low"])
is_high_qol = df["quality_of_life_category"].isin(["High", "Very High"])

# NOTE(review): the data preview shows a city with quality_of_life_value 0.00
# labelled "Very Low" -- this looks like missing data coded as zero; confirm
# whether such rows should be excluded from the low group.
# Drop cities without a nomad count so NaN cannot propagate into the test statistic.
low_qol = df.loc[is_low_qol, "digital_nomads_count"].dropna()
high_qol = df.loc[is_high_qol, "digital_nomads_count"].dropna()

# Welch's t-test (equal_var=False). H1 is directional -- high-quality-of-life
# cities have MORE nomads -- so test one-sided. The argument order (low, high) is
# kept, hence alternative="less": mean(low_qol) < mean(high_qol).
# The `alternative` keyword requires SciPy >= 1.6.
t_stat_qol, p_value_qol = ttest_ind(low_qol, high_qol, equal_var=False, alternative="less")

# One-row summary table that a later cell displays.
results_qol = pd.DataFrame({"Test": ["Independent t-test"], "T-Statistic": [t_stat_qol], "P-Value": [p_value_qol]})

In [None]:
alpha = 0.05  # Significance level

# H1 claims a direction (high-quality > low-quality). The statistic comes from
# ttest_ind(low_qol, high_qol), so it is NEGATIVE when the high-quality mean is
# the larger one. The "high-quality cities attract more" conclusion therefore
# needs both a small p-value and t_stat_qol < 0. A significant result with the
# opposite sign is reported separately instead of being presented as support for H1.
if p_value_qol < alpha and t_stat_qol < 0:
    interpretation_qol = "Reject H₀: There is a significant difference, and high-quality-of-life cities attract significantly more digital nomads."
elif p_value_qol < alpha:
    interpretation_qol = "Reject H₀: There is a significant difference, but low-quality-of-life cities attract more digital nomads, which does not support H₁."
else:
    interpretation_qol = "Fail to reject H₀: There is no significant difference in digital nomads between low-quality and high-quality cities."

interpretation_qol

In [None]:
# Display the one-row summary (test name, t-statistic, p-value) for hypothesis 2.
results_qol