In [2]:
from pathlib import Path

import pandas as pd
from scipy import stats

# Load the datasets
# Single place to change when the CSV files live somewhere else; every per-region
# path below is derived from it instead of repeating the directory five times.
DATA_DIR = Path(r"C:\Users\Dell\Desktop\Final project")

# One CSV per region, named "<region>.csv" inside DATA_DIR.
REGIONS = ['Alberta', 'Manitoba', 'Ontario', 'Quebec', 'Vancouver']

# Region name -> CSV path. Values are kept as plain strings (on Windows they are the
# same strings as the previously hard-coded literals) so existing uses keep working.
file_paths = {region: str(DATA_DIR / f"{region}.csv") for region in REGIONS}

# Region name -> DataFrame read from that region's CSV.
dataframes = {name: pd.read_csv(path) for name, path in file_paths.items()}

# Statistical Testing: T-tests for pairwise comparisons and ANOVA for all groups
def perform_tests(dataframes, column):
    """Print a one-way ANOVA and all pairwise Welch t-tests for one column.

    Args:
        dataframes: Mapping of dataset name -> DataFrame. Every frame must
            contain ``column``; pairs are reported in the mapping's
            iteration order.
        column: Name of the column to compare across the datasets.

    Returns:
        None. Results are written to stdout: one ANOVA line followed by one
        line per unordered pair of datasets.

    Note:
        Missing values are dropped per dataset before testing. The pairwise
        p-values are printed as returned by ``stats.ttest_ind``; no
        multiple-comparison adjustment is applied here.
    """
    # Strip missing values once per dataset; both tests below reuse these samples.
    samples = {label: frame[column].dropna() for label, frame in dataframes.items()}

    # Omnibus test across every dataset at once.
    omnibus = stats.f_oneway(*samples.values())
    print(f"ANOVA for '{column}': F-statistic = {omnibus.statistic:.3f}, p-value = {omnibus.pvalue:.3e}")

    # Each unordered pair exactly once: pair every label with the labels after it.
    labels = list(samples)
    for position, first in enumerate(labels):
        for second in labels[position + 1:]:
            # equal_var=False selects Welch's t-test (no equal-variance assumption).
            welch = stats.ttest_ind(samples[first], samples[second], equal_var=False)
            print(f"T-test between '{first}' and '{second}' for '{column}': "
                  f"T-statistic = {welch.statistic:.3f}, p-value = {welch.pvalue:.3e}")

# Columns compared across the datasets; each one gets an ANOVA plus all
# pairwise Welch t-tests via perform_tests.
columns_to_test = ['rating', 'number_of_reviews', 'food', 'service', 'ambience', 'value']

for col in columns_to_test:
    # Banner separating each column's results in the cell output.
    print(f"\n=== Testing for column: '{col}' ===")
    perform_tests(dataframes, col)



=== Testing for column: 'rating' ===
ANOVA for 'rating': F-statistic = 1.805, p-value = 1.255e-01
T-test between 'Alberta' and 'Manitoba' for 'rating': T-statistic = -1.504, p-value = 1.343e-01
T-test between 'Alberta' and 'Ontario' for 'rating': T-statistic = -1.429, p-value = 1.550e-01
T-test between 'Alberta' and 'Quebec' for 'rating': T-statistic = -1.276, p-value = 2.034e-01
T-test between 'Alberta' and 'Vancouver' for 'rating': T-statistic = -0.636, p-value = 5.259e-01
T-test between 'Manitoba' and 'Ontario' for 'rating': T-statistic = 0.445, p-value = 6.576e-01
T-test between 'Manitoba' and 'Quebec' for 'rating': T-statistic = 0.428, p-value = 6.696e-01
T-test between 'Manitoba' and 'Vancouver' for 'rating': T-statistic = 1.505, p-value = 1.351e-01
T-test between 'Ontario' and 'Quebec' for 'rating': T-statistic = 0.085, p-value = 9.320e-01
T-test between 'Ontario' and 'Vancouver' for 'rating': T-statistic = 1.742, p-value = 8.212e-02
T-test between 'Quebec' and 'Vancouver' for 

In [8]:
from scipy import stats

# Welch's t-test on 'rating': Vancouver (group1) against Ontario (group2).
# Missing ratings are dropped from each sample before testing.
vancouver_frame, ontario_frame = dataframes['Vancouver'], dataframes['Ontario']
group1 = vancouver_frame['rating'].dropna()
group2 = ontario_frame['rating'].dropna()

# equal_var=False -> Welch's t-test (no equal-variance assumption).
t_test_result = stats.ttest_ind(group1, group2, equal_var=False)

# Header line, then the statistic and p-value on their own line.
print(
    f"Independent T-test between Vancouver and Ontario for 'rating':",
    f"T-statistic = {t_test_result.statistic:.3f}, p-value = {t_test_result.pvalue:.3e}",
    sep="\n",
)


Independent T-test between Vancouver and Ontario for 'rating':
T-statistic = -1.742, p-value = 8.212e-02


In [9]:
import statsmodels.api as sm

# Multiple regression for Vancouver: 'rating' explained by the four sub-scores.
predictor_columns = ['food', 'service', 'ambience', 'value']

# Drop incomplete rows from ONE frame that holds the response and the predictors
# together. Filtering X and y separately leaves them with different indexes, and
# X.loc[y.index] then raises KeyError for any row that has a rating but is missing
# a predictor value.
vancouver_complete = dataframes['Vancouver'][['rating'] + predictor_columns].dropna()

y = vancouver_complete['rating']
X = sm.add_constant(vancouver_complete[predictor_columns])  # Add constant for intercept

# Ordinary least squares fit; X and y share the same index by construction.
model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.914
Method:                 Least Squares   F-statistic:                     772.4
Date:                Tue, 05 Nov 2024   Prob (F-statistic):          6.72e-152
Time:                        00:16:07   Log-Likelihood:                 247.70
No. Observations:                 291   AIC:                            -485.4
Df Residuals:                     286   BIC:                            -467.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1211      0.090     -1.352      0.1

In [10]:
from scipy.stats import chi2_contingency

# Chi-square test of independence (Vancouver): is 'food_type' associated with having
# an above-median number of reviews?
vancouver_reviews_frame = dataframes['Vancouver']
review_counts = vancouver_reviews_frame['number_of_reviews']

# Create a categorical variable for 'number_of_reviews' being above or below the median.
# True means strictly more reviews than the median (pandas' median skips NaN by default).
# The flag is written onto the shared frame, which is the same object as
# dataframes['Vancouver'].
vancouver_reviews_frame['high_reviews'] = review_counts > review_counts.median()

# NaN > median evaluates to False, so a row with no review count would be tallied as
# "not high". Build the contingency table only from rows where the count is known.
has_review_count = review_counts.notna()
contingency_table = pd.crosstab(
    vancouver_reviews_frame.loc[has_review_count, 'food_type'],
    vancouver_reviews_frame.loc[has_review_count, 'high_reviews'],
)

# Perform chi-square test
# NOTE(review): with many food_type categories some expected counts may be small, in
# which case the chi-square approximation is unreliable -- inspect `expected` before
# interpreting the p-value.
chi2, p, dof, expected = chi2_contingency(contingency_table)
print("Chi-square Test for association between 'food_type' and 'high_reviews':")
print(f"Chi2-statistic = {chi2:.3f}, p-value = {p:.3e}, Degrees of Freedom = {dof}")


Chi-square Test for association between 'food_type' and 'high_reviews':
Chi2-statistic = 118.555, p-value = 4.780e-06, Degrees of Freedom = 58
