# Assignmetn 4:  CHI-Square and Hypothesis testing

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency , chi2

##  Organising the Data into categorical table

- The data is summarized in a contingency table showing the counts of customers in each satisfaction level for both types of devices

In [2]:
data = {
    "Satisfaction Level": ["Very Satisfied", "Satisfied", "Neutral", "Unsatisfied", "Very Unsatisfied"],
    "Smart Thermostat": [50, 80, 60, 30, 20],
    "Smart Light": [70, 100, 90, 50, 50]
}
df = pd.DataFrame(data)

In [3]:
# Add total rows and columns

df["Total"] = df["Smart Thermostat"] + df["Smart Light"]
total_row = df.sum(numeric_only=True)
total_row["Satisfaction Level"] = "Total"
df = pd.concat([df, pd.DataFrame([total_row])], ignore_index=True)

In [4]:
# Display the contingency table

print("Contingency Table:")
print(df)

Contingency Table:
  Satisfaction Level  Smart Thermostat  Smart Light  Total
0     Very Satisfied                50           70    120
1          Satisfied                80          100    180
2            Neutral                60           90    150
3        Unsatisfied                30           50     80
4   Very Unsatisfied                20           50     70
5              Total               240          360    600


## 1. State the Hypothesis

In [5]:
# Contingency table data
observed = np.array([[50, 70],
                     [80, 100],
                     [60, 90],
                     [30, 50],
                     [20, 50]])

# Perform Chi-Square test for independence
chi2, p, dof, expected = chi2_contingency(observed)

# Output the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:")
print(expected)

Chi-Square Statistic: 5.638227513227513
P-value: 0.22784371130697179
Degrees of Freedom: 4
Expected Frequencies:
[[ 48.  72.]
 [ 72. 108.]
 [ 60.  90.]
 [ 32.  48.]
 [ 28.  42.]]


## 2. Compute the Chi-square Statistic

In [6]:
# Contingency table data
observed = np.array([[50, 70],
                     [80, 100],
                     [60, 90],
                     [30, 50],
                     [20, 50]])

# Calculate expected frequencies
row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)
total = observed.sum()

expected = np.outer(row_totals, col_totals) / total

# Compute Chi-Square statistic
chi2 = np.sum((observed - expected)**2 / expected)

print("Chi-Square Statistic (Manual):", chi2)

Chi-Square Statistic (Manual): 5.638227513227513


# 3,Determine The Critical Value

In [7]:
from scipy.stats import chi2

alpha = 0.05
df = 4

critical_value = chi2.ppf(1 - alpha, df)

print("Critical Value:", critical_value)

Critical Value: 9.487729036781154


## 4.Making a Decision

In [8]:
# Chi-Square statistic from earlier calculation
chi2_statistic = 28.64297419895143  # Replace with your calculated Chi-Square statistic

# Critical value from earlier calculation
critical_value = 9.487729036781154  # Replace with your calculated critical value

# Make a decision
if chi2_statistic > critical_value:
    print("Reject the null hypothesis. There is a significant association between device type and satisfaction level.")
else:
    print("Fail to reject the null hypothesis. There is no significant association between device type and satisfaction level.")

Reject the null hypothesis. There is a significant association between device type and satisfaction level.


# Conclusion

- After performing the Chi-Square test for independence, we found that the Chi-Square statistic (𝜒^2 = 4.629) is less than the critical value (9.488) at the 0.05 significance level.
- Thus, we fail to reject the null hypothesis.
- This means there is no significant association between the type of smart home device purchased (Smart Thermostat or Smart Light) and the customer satisfaction level. In other words, customer satisfaction does not significantly depend on the type of device purchased.

# Hypothesis Testing

## 1.State the Hypotheses statement

In [9]:
from scipy.stats import norm

# Given data
sample_mean = 3050  # Mean weekly cost observed from the sample of 25 restaurants
theoretical_mean = 1000 + 5 * 600  # Theoretical mean weekly cost based on the model
sample_size = 25
population_mean = 600  # Mean number of units produced in a week
population_std = 25  # Standard deviation of number of units produced in a week

In [10]:
# Calculate the standard error of the sample mean

standard_error = population_std / (sample_size ** 0.5)

In [11]:
# Calculate the z-score

z_score = (sample_mean - theoretical_mean) / standard_error

In [12]:
# Calculate the p-value for the one-tailed test

p_value = 1 - norm.cdf(z_score)

In [13]:
# Set the significance level
alpha = 0.05

# State the hypotheses
print("Null Hypothesis (H0): The mean weekly operating cost is equal to the theoretical model's prediction.")
print("Alternative Hypothesis (H1): The mean weekly operating cost is greater than the theoretical model's prediction.")

# Make a decision based on the p-value
if p_value < alpha:
    print("Reject the null hypothesis. There is sufficient evidence to suggest that the mean weekly operating cost is higher than the theoretical model's prediction.")
else:
    print("Fail to reject the null hypothesis. There is not enough evidence to suggest that the mean weekly operating cost is higher than the theoretical model's prediction.")

Null Hypothesis (H0): The mean weekly operating cost is equal to the theoretical model's prediction.
Alternative Hypothesis (H1): The mean weekly operating cost is greater than the theoretical model's prediction.
Fail to reject the null hypothesis. There is not enough evidence to suggest that the mean weekly operating cost is higher than the theoretical model's prediction.


## 2.Calculate the Test Stastics

In [14]:
import scipy.stats as stats

# Given data
sample_mean = 3050  # Sample mean weekly cost
theoretical_mean = 1000 + 5 * 600  # Theoretical mean weekly cost based on the model
population_std = 5 * 25  # Standard deviation of the population
sample_size = 25  # Sample size

In [15]:
# Calculate the standard error

standard_error = population_std / (sample_size ** 0.5)

In [16]:
# Calculate the test statistic (t)

t_statistic = (sample_mean - theoretical_mean) / standard_error

In [17]:
print("Test Statistic (t):", t_statistic)

Test Statistic (t): -38.0


In [18]:
# Degrees of freedom
df = sample_size - 1

# Calculate the critical value
alpha = 0.05  # Significance level
critical_value = stats.t.ppf(1 - alpha, df)

print("Critical Value (t_critical):", critical_value)

Critical Value (t_critical): 1.7108820799094275


## 3.Determine the Critical Value

- the standard normal (Z) distribution table, rather than the t-distribution table, to determine the critical value for a one-tailed test with a significance level (alpha) of 0.05, we can find the critical value corresponding to the cumulative probability of 1 - alpha. we need to find the Z-score that corresponds to a cumulative probability of 0.95. We can do this using Python's scipy.stats.norm module.

In [19]:
from scipy.stats import norm

alpha = 0.05  # Significance level
critical_value = norm.ppf(1 - alpha)  # Percent point function (inverse of the cumulative distribution function)

print("Critical Value (Z_critical):", critical_value)

Critical Value (Z_critical): 1.6448536269514722


## 4.Make a Decision

- If the test statistic is greater than the critical value, we reject the null hypothesis. If the test statistic is less than or equal to the critical value, we fail to reject the null hypothesis.

In [20]:
# Test statistic and critical value from previous calculations
test_statistic = 2.9  # Replace with your calculated test statistic
critical_value = 1.645  # Replace with your calculated critical value

# Significance level (alpha)
alpha = 0.05

# Make a decision
if test_statistic > critical_value:
    print("Reject the null hypothesis. There is sufficient evidence to suggest that the mean weekly operating cost is higher than the theoretical model's prediction.")
else:
    print("Fail to reject the null hypothesis. There is not enough evidence to suggest that the mean weekly operating cost is higher than the theoretical model's prediction.")

Reject the null hypothesis. There is sufficient evidence to suggest that the mean weekly operating cost is higher than the theoretical model's prediction.


# Conclusion

- If we rejected the null hypothesis: There is strong evidence to support the restaurant owners' claim. If we failed to reject the null hypothesis: There is not enough evidence to support the claim.