# **CHI-SQUARE TEST**

In [None]:
import numpy as np    # Here we have imported 'numpy' for numerical computations
import scipy.stats as stats   # We also imported 'scipy.stats' as stats for stastical analysis for calculating values of chi-square distribution

i. State the Hypotheses

In [None]:
# Null (H0) and alternative (H1) hypotheses of the chi-square test, stored as text
H0 = (
    "There is no significant association between the type of smart home "
    "device purchased and customer satisfaction levels"
)
H1 = (
    "There is a significant association between the type of smart home "
    "device purchased and customer satisfaction levels"
)

# Print both hypotheses so it is clear what is being tested
print(f"Null Hypothesis (H0): {H0}")
print(f"Alternative Hypothesis (H1): {H1}")

Null Hypothesis (H0): There is no significant association between the type of smart home device purchased and customer satisfaction levels
Alternative Hypothesis (H1): There is a significant association between the type of smart home device purchased and customer satisfaction levels


ii. Compute the Chi-Square Statistic

In [None]:
# Contingency table of observed frequencies taken from the provided data.
# Each cell is the number of customers for one combination of satisfaction level and device type
# (presumably rows = satisfaction levels, columns = the two device types; confirm against the data source).
observed = np.array(
    [
        (50, 70),
        (80, 100),
        (60, 90),
        (30, 50),
        (20, 50),
    ]
)
observed

array([[ 50,  70],
       [ 80, 100],
       [ 60,  90],
       [ 30,  50],
       [ 20,  50]])

In [None]:
# Marginal totals of the observed contingency table

# Sum across each row; keepdims=True keeps the result as a column vector of shape (n_rows, 1)
row_totals = observed.sum(axis=1, keepdims=True)

# Sum down each column; the result is a 1D array with one entry per column
col_totals = np.sum(observed, axis=0)

# Sum of every cell, i.e. the overall total count
grand_total = np.sum(observed)

In [None]:
# Show the row totals as the cell output
row_totals

array([[120],
       [180],
       [150],
       [ 80],
       [ 70]])

In [None]:
# Show the column totals as the cell output
col_totals

array([240, 360])

In [None]:
# Show the grand total as the cell output
grand_total

600

In [None]:
# Expected frequency of each cell under the null hypothesis:
#   (row total * column total) / grand total
# 'row_totals' is a column vector and 'col_totals' is 1D, so broadcasting their product
# gives the full table of (row total * column total) values in one step
expected = row_totals * col_totals / grand_total
expected

array([[ 48.,  72.],
       [ 72., 108.],
       [ 60.,  90.],
       [ 32.,  48.],
       [ 28.,  42.]])

In [None]:
# Print the observed and expected frequency tables one after the other.
# This allows a comparison between what was observed and what would be expected if the null hypothesis were true.
print("Observed Frequencies:", observed, sep="\n ")
print("\nExpected Frequencies:", expected, sep="\n ")

Observed Frequencies:
 [[ 50  70]
 [ 80 100]
 [ 60  90]
 [ 30  50]
 [ 20  50]]

Expected Frequencies:
 [[ 48.  72.]
 [ 72. 108.]
 [ 60.  90.]
 [ 32.  48.]
 [ 28.  42.]]


In [None]:
# Chi-square statistic: for every cell take (observed - expected)^2 / expected,
# then add up the contributions of all cells.
# The result is kept in 'chi_square_stat' for the decision step.
chi_square_stat = np.sum((observed - expected) ** 2 / expected)
print("Chi-Square Statistic:", chi_square_stat)

Chi-Square Statistic: 5.638227513227513


iii. Determine the Critical Value

In [None]:
# Degrees of freedom 'df' for the chi-square test: (number of rows - 1) * (number of columns - 1).
# 'df' determines the shape of the chi-square distribution used in the hypothesis test.
n_rows, n_cols = observed.shape
df = (n_rows - 1) * (n_cols - 1)
df

4

In [None]:
# Significance level 'alpha' for the test.
# alpha = 0.05 means we accept a 5% chance of rejecting the null hypothesis when it is actually true (a Type I error).
alpha = 0.05

In [None]:
# Critical value of the chi-square distribution for the chosen significance level.
# 'stats.chi2' is the chi-square distribution provided by the 'scipy.stats' module.
# The percent point function (ppf) is the inverse of the cumulative distribution function (CDF),
# so ppf(1 - alpha, df) is the value that a chi-square variable with 'df' degrees of freedom
# exceeds with probability 'alpha'.

critical_value = stats.chi2.ppf(1 - alpha, df)
print("Degrees of Freedom:", df)
# Report the alpha actually used rather than a hard-coded 0.05, so the label stays correct if 'alpha' is changed
print(f"\nCritical Value at alpha={alpha}: ", critical_value)

Degrees of Freedom: 4

Critical Value at alpha=0.05:  9.487729036781154


iv. Make a Decision

In [None]:
# Decision rule: compare 'chi_square_stat' with 'critical_value'.
#
# If 'chi_square_stat' is greater than 'critical_value', the observed data differ from the data
# expected under the null hypothesis by more than the chosen significance level allows,
# which leads to rejection of H0. The conclusion is then that there is a significant association.
#
# If 'chi_square_stat' is not greater than 'critical_value', the difference between observed and
# expected data is not statistically significant, hence H0 is not rejected.
# The conclusion is then that there is no significant association.

if chi_square_stat > critical_value:
    # Leading "\n" added so both outcomes are printed with the same layout
    print("\nDecision: Reject the Null Hypothesis (H0)")
    print("\nThere is a significant association between the type of smart home device purchased and customer satisfaction levels.")
else:
    print("\nDecision: Do not reject the Null Hypothesis (H0)")
    print("\nThere is no significant association between the type of smart home device purchased and customer satisfaction levels.")


Decision: Do not reject the Null Hypothesis (H0)

There is no significant association between the type of smart home device purchased and customer satisfaction levels.


**Conclusion**:
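The analysis using the Chi-Square test resulted in a decision to not reject the null hypothesis (H0): the test statistic (≈ 5.64) is smaller than the critical value (≈ 9.49) at a significance level of 0.05 with 4 degrees of freedom. This means that the statistical evidence does not support a significant association between the type of smart home device purchased and customer satisfaction levels. In other words, at the 5% significance level customer satisfaction appears to be independent of the type of smart home device customers buy. Note that failing to reject H0 does not prove independence; it only means that this sample does not provide sufficient evidence of an association. Therefore, variations in satisfaction levels are likely due to other factors not related to the type of device purchased, and the observed differences in satisfaction across device types are consistent with random chance.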