In [1]:
#====================================================================================
# CODE NAME     : Descriptive Statistics.py
# PURPOSE       : Demonstrate Hypothesis Testing using Pandas and Scipy
# APPLICATION   : Analyzing Mean difference in sales compare to company's target
#====================================================================================

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load Input Data
infile = "C:\\Users\\Training\\Data Science using SAS and Python\\Data\\invoices.xls"
invoices = pd.read_excel(infile)
invoices

Unnamed: 0,Sales
0,108.98
1,152.22
2,111.45
3,110.59
4,127.46
5,107.26
6,93.32
7,91.97
8,111.56
9,75.71


In [4]:
# Get the Mean of Sales (Response Variable)
invoices_stat = invoices.Sales.mean()
# invoices_stat = retail_stat.rename(columns = {'Sales':'Total Sales'})
invoices_stat

112.85083333333334

In [5]:
# Hyphothesis Testing

# One Sample T-test on Pandas dataframe

from scipy import stats

mu = 120 # Company's Target or Hypothesis which we need to test or Null Hyphothesis

# mean_sales = pd.Series(retail_desc_stat[:1].Mean.round(2))
# mean_sales = mean_sales.get(key = 'Sales')
mean_sales = invoices.Sales.mean()

alpha = 0.05 # Type 1 Error Rate or Level of significance

sales_t_test = pd.DataFrame(stats.ttest_1samp(invoices.Sales,mu), index = ["T_Value","P_Value"]).transpose()
sales_t_test["Alpha_Value"] = alpha
sales_t_test["Sample_Mean"] = mean_sales
sales_t_test["Population_Mean"] = mu
sales_t_test = sales_t_test.rename(index = {0:'Sales'})
sales_t_test = sales_t_test.reindex(columns = ['Population_Mean','Sample_Mean','T_Value','Alpha_Value','P_Value'])
sales_t_test["Decision"] = np.where(sales_t_test['P_Value'] < sales_t_test['Alpha_Value'],"Reject","Fail to Reject")
sales_t_test.round(2)

Unnamed: 0,Population_Mean,Sample_Mean,T_Value,Alpha_Value,P_Value,Decision
Sales,120,112.85,-1.19,0.05,0.26,Fail to Reject


In [7]:
sales.head()

NameError: name 'sales' is not defined

In [4]:
# Two Sample T-test on Pandas Dataframe

# Load Input Data
infile = "C:\\Users\\Training\\Data Science using SAS and Python\\Data\\COLA.XLS"
sales = pd.read_excel(infile, sheet_name = "Data")
normal_sales = sales[sales.Display_Type == "Normal"]
normal_sales.head()

Unnamed: 0,Display_Type,Sales
0,Normal,22
1,Normal,34
2,Normal,52
3,Normal,62
4,Normal,30


In [6]:
ea_sales = sales[sales.Display_Type == "EndAisle"]
ea_sales.head()

Unnamed: 0,Display_Type,Sales
10,EndAisle,52
11,EndAisle,71
12,EndAisle,76
13,EndAisle,54
14,EndAisle,67


In [12]:
# Two Sample T-test on Pandas Dataframe
# Check whether population mean of Normal Sales and EndAisle Sales are different

#-------------------------------------------------------------
# Step 1: Check for the equality variance between two groups
#-------------------------------------------------------------
F = np.var(normal_sales.Sales) / np.var(ea_sales.Sales)
alpha = 0.05
df1 = len(normal_sales.Sales-1)
df2 = len(ea_sales.Sales-1)
f_p_value = stats.f.cdf(F, df1, df2)

#-------------------------------------------------------------
# Step 2: Perform T-test for difference between two groups
#-------------------------------------------------------------
if f_p_value > alpha:
    t_value,t_p_value = stats.ttest_ind(ea_sales.Sales,normal_sales.Sales,equal_var = True)
    data = {"Description":"Normal - EndAisle", "T test type": "Pooled Variance",
            "T Value":t_value, "Alpha":alpha, "P Value":t_p_value}
    two_samp_t_test = pd.DataFrame(data,index = ["Sales"])
    two_samp_t_test["Decision"] = np.where(two_samp_t_test["P Value"] < alpha, "Reject", "Do not reject")
    
else:    
    t_value,t_p_value = stats.ttest_ind(ea_sales.Sales,normal_sales.Sales,equal_var = False)
    data = {"Description":"Normal - EndAise", "T test type": "Separate Variance",
            "T Value":t_value, "Alpha":alpha, "P Value":t_p_value}
    two_samp_t_test = pd.DataFrame(data,index = ["Sales"])
    two_samp_t_test["Decision"] = np.where(two_samp_t_test["P Value"] < alpha, "Reject", "Do not reject")
    
two_samp_t_test.round(5)

Unnamed: 0,Description,T test type,T Value,Alpha,P Value,Decision
Sales,Normal - EndAisle,Pooled Variance,3.04455,0.05,0.00697,Reject


In [None]:
def hyppthesis_testing(analysis_var,n_samples,samp1,samp2):
    .......
    return

hyppthesis_testing(analysis_var = "Sales", n_samples = 2,samp1 = Africa, Samp2 = Pacific)
hyppthesis_testing(analysis_var = "Sales", n_samples = 1,samp1 = , Samp2 = )