### Chi-Square Test

In [1]:
import pandas as pd
from scipy import stats

from scipy.stats import chi2
from scipy.stats import chi2_contingency

##### code Reference : 
 https://machinelearningmastery.com/chi-squared-test-for-machine-learning/

In [2]:
def chi_square_test(observed_values,prob):
    """
    Run a chi-square test of independence on a contingency table.

    Parameters
    ----------
    observed_values : array-like
        Observed counts in contingency-table format, e.g.
            [[10,20,30],
             [40,50,60]]
    prob : float
        Confidence level, strictly between 0 and 1 (e.g. 0.95). It is the
        quantile used for the critical value, and the significance level is
        alpha = 1 - prob.

    Returns
    -------
    tuple of (str, str)
        The verdict from the critical-value comparison followed by the
        verdict from the p-value comparison.

    Raises
    ------
    ValueError
        If prob is not strictly between 0 and 1.

    Notes
    -----
    The statistic, p-value, degrees of freedom, expected frequencies,
    significance line and the critical-value verdict are printed. The
    p-value verdict is only returned, not printed.
    """
    # A prob outside [0, 1] makes chi2.ppf return nan and alpha negative, so
    # both comparisons below would be False and the function would silently
    # answer "Independent". Fail loudly instead (0 and 1 are degenerate too).
    if not 0 < prob < 1:
        raise ValueError(
            f"prob must be strictly between 0 and 1 (e.g. 0.95), got {prob!r}"
        )

    stat, p, dof, expected = chi2_contingency(observed_values)
    print(f"Chisquare statistic \t: {round(stat,3)}")
    print(f"p-value \t\t: {round(p,3)}")
    print(f"Degreed of freedom\t: {dof}")
    print("\nExpected value\t\t:")
    print(expected)

    # interpret the test statistic against the critical value
    critical = chi2.ppf(prob, dof)
    if stat >= critical:
        result = 'From critical value : Dependent (reject H0)'
    else:
        result = 'From critical value : Independent (fail to reject H0)'

    # interpret p-value
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        result2 = 'From significance : Dependent (reject H0)'
    else:
        result2 = 'From significance : Independent (fail to reject H0)'
    print(result)
    return result,result2

In [4]:
# Observed counts laid out as a 2 x 3 contingency table.
observed = [
    [10, 20, 30],
    [40, 50, 60],
]

# Test at the 0.95 confidence level and keep both verdict strings.
r1, r2 = chi_square_test(observed, prob=0.95)

Chisquare statistic 	: 2.8
p-value 		: 0.247
Degreed of freedom	: 2

Expected value		:
[[14.28571429 20.         25.71428571]
 [35.71428571 50.         64.28571429]]
significance=0.050, p=0.247
From critical value : Independent (fail to reject H0)


### Problem 1
1. A poker-dealing machine is supposed to deal cards at random, as if from an infinite deck.

In a test, you counted 1600 cards, and observed the following:
<pre>
Spades        404
Hearts        420
Diamonds      400
Clubs         376
</pre>

Could it be that the suits are equally likely ? Or are these discrepancies too much to be random ?

### Analysis:
If the suits are equally likely, each suit is expected to appear 1600 / 4 = 400 times among the 1600 cards:
<pre>
Type          Expected   Actual    difference
Spades        400        404        4
Hearts        400        420        20
Diamonds      400        400        0
Clubs         400        376        -24
</pre>
There are 4 suits and their counts must add up to 1600, so only 3 of them can vary independently; the degrees of freedom are therefore <b>3</b>.
chi-square = sum (square of the difference / Expected)

           = 4^2/400 + 20^2/400 + 0^2/400 + (-24)^2/400
           = (16+400+0+576)/400
           = 992/400
           = 2.48
Null Hypothesis: The suits are equally likely
Alternate Hypothesis: The suits are not equally likely


In [5]:
# Under H0 every suit is equally likely: 1600 cards / 4 suits = 400 each.
cards_expected = [400] * 4
# Observed counts in the order Spades, Hearts, Diamonds, Clubs.
cards_actual = [404, 420, 400, 376]
# Goodness-of-fit test of the observed counts against the expected counts.
stats.chisquare(f_obs=cards_actual, f_exp=cards_expected)

Power_divergenceResult(statistic=2.48, pvalue=0.4789163768174306)

In [None]:
# chi_square_test() wraps chi2_contingency, a test of independence for a
# two-way table. cards_actual is a single row of counts, which leaves that
# test with zero degrees of freedom, and it never looks at cards_expected.
# Use the goodness-of-fit test at the same 0.95 confidence level instead.
cards_stat, cards_p = stats.chisquare(cards_actual, f_exp=cards_expected)
cards_dof = len(cards_actual) - 1  # 4 suits -> 3 degrees of freedom
cards_critical = chi2.ppf(0.95, cards_dof)
print(f"Chisquare statistic \t: {round(cards_stat,3)}")
print(f"p-value \t\t: {round(cards_p,3)}")
print(f"Degrees of freedom\t: {cards_dof}")
print(f"Critical value\t\t: {round(cards_critical,3)}")
if cards_stat >= cards_critical:
    print('Suits are not equally likely (reject H0)')
else:
    print('Consistent with equally likely suits (fail to reject H0)')

In [None]:
from scipy import stats

In [None]:
# Interactive lookup: print the documentation for scipy.stats.chisquare.
help(stats.chisquare)

### Problem 6
In the titanic Dataset, do a crosstab for embarked and survival rate. Using chi-square test, determine whether both of them are dependent or independent.

In [None]:
# Load the Titanic dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# cell only runs where that file exists - consider a configurable data dir.
titanic = pd.read_csv(r'E:\SupervisedLearning\datasets\titanic.csv')

In [None]:
# Preview the first rows of the loaded dataset.
titanic.head()

In [None]:
# Interactive lookup: print the documentation for pd.crosstab.
help(pd.crosstab)

In [None]:
# Problem 6 asks for Embarked vs Survived; PassengerId is a per-row identifier
# and would produce one table row per passenger instead of one per port.
# margins=True adds the row/column totals for display.
pd.crosstab(titanic['Embarked'],titanic['Survived'],margins=True)

In [None]:
# Observed counts for the test of independence between port of embarkation
# and survival (Problem 6). Built without margins so that the totals are not
# treated as data; .values hands the plain count array to the test.
observed = pd.crosstab(titanic['Embarked'],titanic['Survived']).values

In [None]:
# Display the observed-count array built from the crosstab.
observed

In [None]:
# Run the chi-square test on the observed counts with prob = 0.95
# (chi_square_test uses alpha = 1 - prob, i.e. 0.05).
chi_square_test(observed,0.95)

In [None]:
# Select the rows whose Embarked value equals the literal string 'Embarked'.
# NOTE(review): 'Embarked' is the column name, not an obvious data value -
# confirm whether a specific port code or a missing-value check was intended.
titanic.loc[titanic['Embarked']=='Embarked']

In [None]:
# Count how many rows carry each distinct Embarked value.
titanic['Embarked'].value_counts()

In [None]:
# Print a summary of the DataFrame's columns.
titanic.info()

In [None]:
# Count the missing values in each column.
titanic.isnull().sum()

In [None]:
# (number of rows, number of columns) of the dataset.
titanic.shape

In [None]:
# NOTE(review): scratch arithmetic - the origin of these four numbers is not
# shown in this notebook; label what is being summed or remove the cell.
854+434+154+336