<a href="https://colab.research.google.com/github/SubbulakshmiSN/Dataframe_Preprocessing/blob/main/Dataframe_preprocessing_hypo_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***LIBRARIES***

In [1]:
#importing libraries
import pandas as  pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import vega_datasets

In [2]:
#reading the dataset
df= sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [4]:
df['size'].unique()

array([2, 3, 4, 1, 6, 5])

In [5]:
df.day.unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [6]:
df.sex.unique()

['Female', 'Male']
Categories (2, object): ['Male', 'Female']

In [7]:
df.smoker.unique()

['No', 'Yes']
Categories (2, object): ['Yes', 'No']

In [8]:
'''
no null values
244 rows
7 columns

unsupervised learning

categorical columns : day, sex, smoker, time,size
continuous columns : total_bill, tip


'''

'\nno null values\n244 rows\n7 columns\n\nunsupervised learning\n\ncategorical columns : day, sex, smoker, time,size\ncontinuous columns : total_bill, tip\n\n\n'

# ***HYPOTHESIS / STATISTICAL TESTING***

In [9]:
'''
Testing Types

continuous- central limit theorem or 1-Tailed test(1 sample)
continuous vs continuous - correlation, 2-Tailed test(2 sample)
continuous vs categorical - ANOVA(Analysis of Variance)
categorical vs categorical - Chi-Square Test
'''

'\nTesting Types\n\ncontinuous- central limit theorem or 1-Tailed test(1 sample)\ncontinuous vs continuous - correlation, 2-Tailed test(2 sample)\ncontinuous vs categorical - ANOVA(Analysis of Variance)\ncategorical vs categorical - Chi-Square Test\n'

In [10]:
continuous = ["total_bill", "tip"]
category = ["sex",	"smoker",	"day", "time", "size"]


In [11]:
def hypothesis_testing(continuous=[],category=[],sampleRate=0.20, sampleFraction=0.05):

#central limit theorem
  central_limit_theorem_result={}
  hypothesis_df = pd.DataFrame(columns=df.columns, index=df.columns)
  if continuous:
    samplesize = int(sampleRate * len(df[continuous[0]]))
    for column in continuous:
        population=df[column].values
        population_mean= population.mean()

        sample_mean=[]
        for i in range(40):
          sample=np.random.choice(population,samplesize)
          sample_mean.append(sample.mean())
          central_limit_theorem_result[column] = {
                                          "Population Mean": population_mean,
                                          "Sample Means": np.mean(sample_mean)
                                          }

  #1-Tailed Test
  if continuous:
    samplesize = int(sampleRate * len(df[continuous[0]]))
    for column in continuous:
      H0_accepted = 0
      H0_rejected = 0
      for i in range(samplesize):
        sample=df[column].sample(frac=sampleFraction)
        t_test,p_value=stats.ttest_1samp(sample,df[column].mean())
        if p_value > 0.5:
          H0_accepted += 1
        else:
          H0_rejected += 1

      if H0_accepted > H0_rejected:
          central_limit_theorem_result[column].update( {
                                            "H0_accepted": H0_accepted,
                                            "H0_rejected": H0_rejected,
                                            "Conclusion": "H0 is accepted, Ha is rejected, There is no significant effect"
                                          } )
          hypothesis_df[column][column] = "There is a relationship"

      else:
          central_limit_theorem_result[column].update( {
                                              "H0_accepted": H0_accepted,
                                              "H0_rejected": H0_rejected,
                                              "Conclusion": "H0 is rejected, Ha is accepted, There is a significant effect"
                                              } )
          hypothesis_df[column][column] = "There is no relationship"


  #2-Tailed Test
  for i in range(len(continuous) - 1):
      column_1 = continuous[i]
      for column_2 in continuous[i+1:]:
        H0_accepted = 0
        H0_rejected = 0
        for i in range(20):
          sample1 = df[column_1].sample(frac=sampleFraction)
          sample2 = df[column_2].sample(frac=sampleFraction)
          t_test, p_value = stats.ttest_ind(sample1, sample2)
          if p_value > 0.5:
            H0_accepted += 1
          else:
            H0_rejected += 1

        if H0_accepted > H0_rejected:
          #H0 is accepted, Ha is rejected, There is no significant effect. "H0_accepted": H0_accepted, "H0_rejected": H0_rejected
          hypothesis_df[column_1][column_2] = "There is a relationship"
          hypothesis_df[column_2][column_1] = "There is a relationship"
        else:
          #H0 is rejected, Ha is accepted, There is a significant effect. "H0_accepted": H0_accepted, "H0_rejected": H0_rejected
            hypothesis_df[column_1][column_2] = "There is no relationship"
            hypothesis_df[column_2][column_1] = "There is no relationship"

  #Chi-Square Test
  if category:
    for i in range(len(category) - 1):
        column_1 = category[i]
        for column_2 in category[i + 1:]:
            data = pd.crosstab(df[column_1], df[column_2])
            observed_values = data.values
            chi2_stat, p_value, _, _ = stats.chi2_contingency(observed_values)

            if p_value > 0.05:
                # H0 is accepted, There is no relationship between two columns we're comparing
                hypothesis_df[column_1][column_2] = "There is no relationship"
                hypothesis_df[column_2][column_1] = "There is no relationship"
            else:
                # H0 is rejected, There is a relationship between two columns we're comparing
                hypothesis_df[column_1][column_2] = "There is a relationship"
                hypothesis_df[column_2][column_1] = "There is a relationship"

  #ANOVA
  if continuous and category:
    for category_column in category:
      for continuous_column in continuous:
        group = df[category_column].unique()
        data = {}
        for i in group:
          data[i]=df[continuous_column][df[category_column]==i]

        f_value, p_value = stats.f_oneway(*[data[i] for i in group])
        if p_value > 0.05:
            # H0 is accepted, There is a relationship between two columns we're comparing
            hypothesis_df[category_column][continuous_column] = "There is a relationship"
            hypothesis_df[continuous_column][category_column] = "There is a relationship"
        else:
            # H0 is rejected, There is no relationship between two columns we're comparing
            hypothesis_df[category_column][continuous_column] = "There is no relationship"
            hypothesis_df[continuous_column][category_column] = "There is no relationship"

  return central_limit_theorem_result, hypothesis_df

# ***HYPOTHESIS TESTING RESULT***

In [12]:
continuous = ["total_bill", "tip"]
category = ["sex",	"smoker",	"day", "time", "size"]

ContinuousColumn_result, comperativeColumn_result = hypothesis_testing(continuous , category)


In [13]:

ContinuousColumn_result_df = pd.DataFrame(ContinuousColumn_result)
ContinuousColumn_result_df

Unnamed: 0,total_bill,tip
Population Mean,19.785943,2.998279
Sample Means,20.141854,2.988865
H0_accepted,26,27
H0_rejected,22,21
Conclusion,"H0 is accepted, Ha is rejected, There is no si...","H0 is accepted, Ha is rejected, There is no si..."


In [14]:
comperativeColumn_result

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
total_bill,There is a relationship,There is no relationship,There is no relationship,There is a relationship,There is no relationship,There is no relationship,There is no relationship
tip,There is no relationship,There is a relationship,There is a relationship,There is a relationship,There is a relationship,There is a relationship,There is no relationship
sex,There is no relationship,There is a relationship,,There is no relationship,There is a relationship,There is a relationship,There is no relationship
smoker,There is a relationship,There is a relationship,There is no relationship,,There is a relationship,There is no relationship,There is no relationship
day,There is no relationship,There is a relationship,There is a relationship,There is a relationship,,There is a relationship,There is a relationship
time,There is no relationship,There is a relationship,There is a relationship,There is no relationship,There is a relationship,,There is a relationship
size,There is no relationship,There is no relationship,There is no relationship,There is no relationship,There is a relationship,There is a relationship,


In [15]:
df.corr()

  df.corr()


Unnamed: 0,total_bill,tip,size
total_bill,1.0,0.675734,0.598315
tip,0.675734,1.0,0.489299
size,0.598315,0.489299,1.0
