# Case Study # 4 : Testing of Hypothesis

In [1]:
# Analysis stack for the notebook: pandas/numpy for data handling, scipy.stats for the tests.
# NOTE(review): matplotlib and seaborn are imported but not used in the cells visible here.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and numerical warnings — consider a narrower filter.
warnings.filterwarnings("ignore")
from scipy import stats

In [2]:
# Load the sales records; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Sales_add.csv')

In [3]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,Month,Region,Manager,Sales_before_digital_add(in $),Sales_After_digital_add(in $)
0,Month-1,Region - A,Manager - A,132921,270390
1,Month-2,Region - A,Manager - C,149559,223334
2,Month-3,Region - B,Manager - A,146278,244243
3,Month-4,Region - B,Manager - B,152167,231808
4,Month-5,Region - C,Manager - B,159525,258402


In [4]:
# Preview the last five rows to confirm the file was read to the end.
data.tail(n=5)

Unnamed: 0,Month,Region,Manager,Sales_before_digital_add(in $),Sales_After_digital_add(in $)
17,Month-18,Region - C,Manager - B,167996,191517
18,Month-19,Region - B,Manager - A,132135,227040
19,Month-20,Region - A,Manager - B,152493,212579
20,Month-21,Region - B,Manager - A,147425,263388
21,Month-22,Region - A,Manager - C,130263,243020


In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(22, 5)

In [6]:
# Count missing values per column (isnull is the pandas alias of isna).
data.isnull().sum()

Month                             0
Region                            0
Manager                           0
Sales_before_digital_add(in $)    0
Sales_After_digital_add(in $)     0
dtype: int64

In [7]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Month                           22 non-null     object
 1   Region                          22 non-null     object
 2   Manager                         22 non-null     object
 3   Sales_before_digital_add(in $)  22 non-null     int64 
 4   Sales_After_digital_add(in $)   22 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1008.0+ bytes


### 1. The company wishes to clarify whether there is any increase in sales after stepping into digital marketing 

H0: Mean sales before digital marketing = Mean sales after digital marketing

Ha: Mean sales before digital marketing < Mean sales after digital marketing

In [8]:
# Summary statistics for the two sales columns only.
data.loc[
    :,
    [
        'Sales_before_digital_add(in $)',
        'Sales_After_digital_add(in $)',
    ],
].describe()

Unnamed: 0,Sales_before_digital_add(in $),Sales_After_digital_add(in $)
count,22.0,22.0
mean,149239.954545,231123.727273
std,14844.042921,25556.777061
min,130263.0,187305.0
25%,138087.75,214960.75
50%,147444.0,229986.5
75%,157627.5,250909.0
max,178939.0,276279.0


In [9]:
# Pull out the two sales series; rows are aligned month by month, so they form paired samples.
sales_before, sales_after = (
    data['Sales_before_digital_add(in $)'],
    data['Sales_After_digital_add(in $)'],
)

In [10]:
# Mean sales in each period, rounded to two decimals for display.
sales_before_mean = round(sales_before.mean(), 2)
print('Mean of sales before Digital Marketing is ', sales_before_mean)

sales_after_mean = round(sales_after.mean(), 2)
print('Mean of sales after Digital Marketing is ', sales_after_mean)

Mean of sales before Digital Marketing is  149239.95
Mean of sales after Digital Marketing is  231123.73


In [11]:
# Population standard deviation (ddof=0), which is what np.std computes by default.
# This is why the figures are smaller than the `std` row of describe(), which uses ddof=1.
sales_before_std = round(sales_before.std(ddof=0), 2)
print('Standard deviation in sales before Digital Marketing is ', sales_before_std)

sales_after_std = round(sales_after.std(ddof=0), 2)
print('Standard deviation in sales after Digital Marketing is ', sales_after_std)

Standard deviation in sales before Digital Marketing is  14502.75
Standard deviation in sales after Digital Marketing is  24969.19


In [12]:
# Paired-samples t-test on the month-by-month sales figures.
# alternative='less' tests whether mean(before - after) < 0, i.e. sales rose afterwards.
t_statistic, p_value = stats.ttest_rel(
    a=sales_before,
    b=sales_after,
    alternative='less',
)
print('T-statistic is ', t_statistic)
print('p-value is ', p_value)

T-statistic is  -12.09070525287017
p-value is  3.168333502287889e-11


In [13]:
# Decision rule for the one-sided paired t-test at the 5% significance level.
# NOTE: `p_value` is reassigned by the chi-square cell further down, so this cell must be
# run right after the t-test cell; re-running it later would judge the wrong p-value.
if p_value < 0.05:
    print('We reject null hypothesis and conclude that the sales increased after stepping into digital marketing')
else:
    # Failing to reject H0 is not proof of "no impact"; it only means the data
    # do not provide enough evidence of an increase.
    print('We fail to reject null hypothesis; there is not enough evidence to conclude that the sales increased after stepping into digital marketing')

We reject null hypothesis and conclude that the sales increased after stepping into digital marketing


In [14]:
# Report the two period averages and the gap between them.
# The difference is taken between the already-rounded means, as displayed.
before_avg = round(data['Sales_before_digital_add(in $)'].mean(), 2)
after_avg = round(data['Sales_After_digital_add(in $)'].mean(), 2)

print('Average sales before digital marketing is ', before_avg)
print('Average sales after digital marketing is ', after_avg)

difference = after_avg - before_avg
print('Difference in average sales after adopting digital marketing is', difference)

Average sales before digital marketing is  149239.95
Average sales after digital marketing is  231123.73
Difference in average sales after adopting digital marketing is 81883.78


### 2. The company needs to check whether there is any dependency between the features “Region” and “Manager”.

H0: No relationship exists between "Region" and "Manager" (They are independent)

Ha: There is a significant relationship between "Region" and "Manager" (They are dependent)

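To compute the Chi-square test statistic, we need a contingency table, which can be created using the 'crosstab' function in pandas. The numbers in the table represent frequencies. Note that the chi-square approximation is generally considered reliable only when the expected frequencies are not too small (a common rule of thumb is at least 5 per cell).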

In [15]:
# Observed frequency table: one row per region, one column per manager.
crosstab = pd.crosstab(index=data['Region'], columns=data['Manager'])
crosstab

Manager,Manager - A,Manager - B,Manager - C
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Region - A,4,3,3
Region - B,4,1,2
Region - C,1,3,1


In [16]:
# Same frequency table with row/column totals ("All") added, for reference only.
crosstab_margins = pd.crosstab(
    index=data['Region'],
    columns=data['Manager'],
    margins=True,
)
crosstab_margins

Manager,Manager - A,Manager - B,Manager - C,All
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Region - A,4,3,3,10
Region - B,4,1,2,7
Region - C,1,3,1,5
All,9,7,6,22


`stats.chi2_contingency(crosstab)` returns four values:
1. Chi-square test statistic
2. p-value
3. Degrees of freedom
4. Array containing the expected cell frequencies

In [17]:
# Chi-square test of independence on the table without margins.
# Note: this overwrites the `p_value` produced earlier by the paired t-test.
Xtest_statistic, p_value, dof, exp_values = stats.chi2_contingency(
    observed=crosstab,
)

In [18]:
# Print each chi-square result next to its label, in the order returned by the test.
for _label, _value in (
    ('Chi-square test statistic is ', Xtest_statistic),
    ('p-value is ', p_value),
    ('Degrees of freedom is  ', dof),
    ('Expected values are: ', exp_values),
):
    print(_label, _value)

Chi-square test statistic is  3.050566893424036
p-value is  0.5493991051158094
Degrees of freedom is   4
Expected values are:  [[4.09090909 3.18181818 2.72727273]
 [2.86363636 2.22727273 1.90909091]
 [2.04545455 1.59090909 1.36363636]]


In [19]:
# Decision rule for the chi-square test of independence at the 5% significance level.
# NOTE(review): every expected count printed by the previous cell is below 5, so the
# chi-square approximation is questionable for this table. Failing to reject H0 also only
# means no dependency was detected, not that independence is proven.
print(
    'We reject null hypothesis and conclude that there is dependency between Region & Manager'
    if p_value < 0.05
    else 'We fail to reject null hypothesis and conclude that Region and Manager are independent'
)

We fail to reject null hypothesis and conclude that Region and Manager are independent
