### Foundational Statistics in Python

##### This notebook computes elementary descriptive statistics of populations and samples, then demonstrates three common hypothesis tests: Welch's t-test, a correlation test, and a chi-square test. It shows how to run them in Python and understand the results.


In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# download the salespeople dataset as a DataFrame
url = "http://peopleanalytics-regression-book.org/data/salespeople.csv"
salespeople = pd.read_csv(filepath_or_buffer=url)

In [3]:
# arithmetic mean of the sales column (pandas skips NaN by default)
mean_sales = salespeople["sales"].mean()

In [4]:
# show the mean of the sales column computed above
print(mean_sales)

527.0057142857142


In [5]:
# (number of rows, number of columns) of the loaded data
salespeople.shape

(351, 4)

In [6]:
# preview the first five rows
salespeople.head()

Unnamed: 0,promoted,sales,customer_rate,performance
0,0,594.0,3.94,2.0
1,0,446.0,4.06,3.0
2,1,674.0,3.83,4.0
3,0,525.0,3.62,2.0
4,1,657.0,4.4,3.0


In [7]:
# sample variance of sales (pandas defaults to ddof=1, i.e. the n-1 denominator)
var_sales = salespeople["sales"].var()
print(var_sales)

34308.11458043389


In [8]:
# sample standard deviation of sales (ddof=1 by default)
sd_sales = salespeople["sales"].std()
print(sd_sales)

185.2244977869663


In [9]:
# population standard deviation: ddof=0 divides by n instead of n-1
popsd_sales = salespeople["sales"].std(ddof=0)
print(popsd_sales)

184.9597020864771


In [10]:
# generate a sample covariance matrix between two variables
# Use dropna() to remove rows with a missing value in either column. Indexing a
# DataFrame with a boolean DataFrame mask (frame[~np.isnan(frame)]) keeps the
# shape and only writes NaN where the mask is False, so it never removed rows.
sales_rate = salespeople[['sales', 'customer_rate']].dropna()
cov = sales_rate.cov()
print(cov)

                      sales  customer_rate
sales          34308.114580      55.817691
customer_rate     55.817691       0.795820


In [11]:
# pull out a specific covariance: row 'customer_rate' of the 'sales' column
print(cov.loc['customer_rate', 'sales'])

55.817691199345006


In [12]:
# Pearson correlation matrix for the same two columns
cor = sales_rate.corr(method="pearson")
print(cor)

                  sales  customer_rate
sales          1.000000       0.337805
customer_rate  0.337805       1.000000


In [13]:
#sales and customer rating have a weak-to-moderate positive correlation (r is about 0.34)

In [14]:
# using specific (rank-based) correlation coefficients
# Spearman's rho: for pairing a continuous-scale variable (sales) with a
# ranked-scale variable (performance).
# nan_policy='omit' ignores NaN values instead of propagating them.
# `stats` (scipy.stats) is imported in the notebook's top import cell.
stats.spearmanr(salespeople.sales, salespeople.performance, nan_policy='omit')


SpearmanrResult(correlation=0.27354459847452534, pvalue=2.0065434379079837e-07)

In [15]:
# Kendall's tau between sales and the performance rating, ignoring NaN values
stats.kendalltau(
    salespeople["sales"],
    salespeople["performance"],
    nan_policy="omit",
)

KendalltauResult(correlation=0.20736088105812, pvalue=2.7353258226376615e-07)

#### Hypothesis Testing 

##### Welch’s t-test on a difference in means of samples of unequal variance

In [16]:
# sales of the bottom (performance == 1) and top (performance == 4) performers
perf1 = salespeople.loc[salespeople["performance"] == 1, "sales"]
perf4 = salespeople.loc[salespeople["performance"] == 4, "sales"]

In [21]:
# welch's t-test with unequal variance
# equal_var=False selects Welch's test rather than the pooled-variance Student test.
# perf4 is passed first, so a positive statistic means the performance-4 group
# has the higher mean sales.
ttest = stats.ttest_ind(perf4, perf1, equal_var=False)
print(ttest)

Ttest_indResult(statistic=4.629477606844271, pvalue=1.0932443461577038e-05)


In [None]:
#reject the null hypothesis of equal means (p is about 1.1e-05): the two performance groups have a statistically significant difference in mean sales

#### Correlation test for non-zero correlation coefficients

In [23]:
# calculate correlation and p-value
# Drop incomplete rows jointly so that sales[i] and cust_rate[i] always come
# from the same row. Filtering each column on its own NaNs separately can
# misalign the pairs (or give unequal lengths, which pearsonr rejects) whenever
# the missing values sit in different rows.
sales_cust = salespeople[['sales', 'customer_rate']].dropna()
sales = sales_cust['sales']
cust_rate = sales_cust['customer_rate']

# NOTE: this rebinds `cor`, which earlier held the correlation matrix
cor = stats.pearsonr(sales, cust_rate)
print(cor)

(0.3378050448586781, 8.64795221209082e-11)


In [24]:
#reject the null hypothesis of zero correlation (p is about 8.6e-11): there is a statistically significant correlation between sales and customer rating

#### Chi-square test of difference in frequency distribution between different categories in a data set

In [25]:
# contingency table: promoted status (rows) by performance rating (columns)
contingency = pd.crosstab(
    index=salespeople["promoted"],
    columns=salespeople["performance"],
)

# chi-square test on the table; the result carries the test statistic, p-value,
# degrees of freedom and the expected frequencies
chi2_test = stats.chi2_contingency(contingency)
print(chi2_test)

(25.895405268094862, 1.0030629464566802e-05, 3, array([[40.62857143, 74.48571429, 84.64285714, 37.24285714],
       [19.37142857, 35.51428571, 40.35714286, 17.75714286]]))


In [None]:
#reject the null hypothesis of independence (p is about 1.0e-05): there is a difference in the distribution of promoted/not promoted individuals
#between the four performance categories