## Categorical Data Analysis

### Importing necessary libraries

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

In [2]:
# Load the happiness-by-occupation count table from a CSV file
# (relative path, resolved against the notebook's working directory).
AmericanData = pd.read_csv('Americandata.csv')

In [3]:
# Show the loaded table; as the cell's last expression it is rendered as output.
AmericanData

Unnamed: 0,Happy,Students,Laborers,Preachers,Physicians,Housewives,Teachers,Lawyers,Musicians
0,Yes,390,378,35,159,78,108,11,31
1,No,1610,122,265,51,122,38,64,19


####  Function to compute standardized residuals

In [6]:
def StandardRes(observed, expected):
    """Compute standardized (adjusted) residuals for a two-way contingency table.

    For row total ``r_i``, column total ``c_j`` and grand total ``n`` the
    residual of cell ``(i, j)`` is::

        (observed_ij - expected_ij) / sqrt(v_ij)
        v_ij = r_i * c_j * (n - r_i) * (n - c_j) / n**3

    Parameters
    ----------
    observed : array-like, shape (rows, cols)
        Observed cell counts. Any numeric 2-D array-like is accepted
        (ndarray, nested lists, an all-numeric DataFrame).
    expected : array-like, same shape as ``observed``
        Expected cell counts, e.g. the fourth value returned by
        ``scipy.stats.chi2_contingency``.

    Returns
    -------
    numpy.ndarray
        Float array of standardized residuals with the shape of ``observed``.
        Cells whose variance term is zero (a row or column total equal to 0
        or to ``n``) produce non-finite values.
    """
    # Work in floating point: with integer counts the four-way product below
    # (and n**3) is evaluated in the integer dtype and silently wraps around
    # once the totals get large, producing garbage or NaN residuals.
    observed = np.asarray(observed, dtype=float)
    expected = np.asarray(expected, dtype=float)

    n = observed.sum()
    # keepdims keeps the totals 2-D so they broadcast against the table.
    rsum = observed.sum(axis=1, keepdims=True)  # shape (rows, 1)
    csum = observed.sum(axis=0, keepdims=True)  # shape (1, cols)

    # Variance of (observed - expected) under independence.
    v = csum * rsum * (n - rsum) * (n - csum) / n**3

    # Compute standardized residuals
    return (observed - expected) / np.sqrt(v)

#### Performing the chi-square test

In [8]:
# Every column after the first holds the counts; the test returns the
# statistic, the p-value, the degrees of freedom and the expected frequencies.
contingency_counts = AmericanData.iloc[:, 1:].values
stat, p, dof, expected = chi2_contingency(contingency_counts)

In [11]:
# Report the test summary, one "label value" line per quantity.
# The first label carries a leading newline, so a blank line precedes the report.
for label, value in (
    ("\nChi-square Statistic:", stat),
    ("Degrees of Freedom:", dof),
    ("p-value:", p),
):
    print(label, value)


Chi-square Statistic: 936.1394782304601
Degrees of Freedom: 7
p-value: 7.523435335099551e-198


In [21]:
# Label the expected counts: rows by the 'Happy' response, columns by the
# count columns of the source table (everything after the first column).
count_columns = AmericanData.columns[1:]
happy_labels = AmericanData['Happy']
expected_df = pd.DataFrame(expected, index=happy_labels, columns=count_columns)
print("\nExpected Frequencies:", expected_df, sep="\n")


Expected Frequencies:
          Students    Laborers   Preachers  Physicians  Housewives   Teachers  \
Happy                                                                           
Yes     683.711577  170.927894  102.556737   71.789716   68.371158  49.910945   
No     1316.288423  329.072106  197.443263  138.210284  131.628842  96.089055   

         Lawyers  Musicians  
Happy                        
Yes    25.639184  17.092789  
No     49.360816  32.907211  


#### Calculating the standardized residuals

In [17]:
# Standardized residuals of the observed counts (all columns after the first)
# against the expected frequencies returned by the chi-square test.
observed_counts = AmericanData.iloc[:, 1:].values
AmericaStdResiduals = StandardRes(observed_counts, expected)

In [23]:
# Wrap the residual array in a labelled DataFrame (rows: 'Happy' response,
# columns: the count columns of the source table) and print it.
residual_columns = AmericanData.columns[1:]
residual_index = AmericanData['Happy']
residuals_df = pd.DataFrame(
    AmericaStdResiduals,
    index=residual_index,
    columns=residual_columns,
)
print("\nStandardized Residuals:", residuals_df, sep="\n")


Standardized Residuals:
        Students   Laborers  Preachers  Physicians  Housewives   Teachers  \
Happy                                                                       
Yes   -21.227485  21.097239  -8.601936   13.088437    1.478516  10.354758   
No     21.227485 -21.097239   8.601936  -13.088437   -1.478516 -10.354758   

        Lawyers  Musicians  
Happy                       
Yes   -3.602748   4.176521  
No     3.602748  -4.176521  


In [24]:
# Significance check: compare the chi-square p-value with the chosen level.
# The threshold shown in the messages is taken from `alpha`, so the text
# cannot drift out of sync with the comparison if the level is changed.
alpha = 0.05
if p < alpha:
    print(f"\nThe test is statistically significant (p < {alpha}).")
    print("There is a significant association between occupation and happiness.")
else:
    print(f"\nThe test is NOT statistically significant (p >= {alpha}).")
    print("There is no significant association between occupation and happiness.")


The test is statistically significant (p < 0.05).
There is a significant association between occupation and happiness.
