In [1]:
# Importing Libraries

import numpy as np
import matplotlib as plt
import seaborn as sns
import pandas as pd
from scipy.stats import ttest_rel
from scipy.stats import ttest_ind
from statsmodels.stats.weightstats import ztest

In [2]:
# Reading the dataset

data= pd.read_csv('Sales_add.csv')
data


Unnamed: 0,Month,Region,Manager,Sales_before_digital_add(in $),Sales_After_digital_add(in $)
0,Month-1,Region - A,Manager - A,132921,270390
1,Month-2,Region - A,Manager - C,149559,223334
2,Month-3,Region - B,Manager - A,146278,244243
3,Month-4,Region - B,Manager - B,152167,231808
4,Month-5,Region - C,Manager - B,159525,258402
5,Month-6,Region - A,Manager - B,137163,256948
6,Month-7,Region - C,Manager - C,130625,222106
7,Month-8,Region - A,Manager - A,131140,230637
8,Month-9,Region - B,Manager - C,171259,226261
9,Month-10,Region - C,Manager - B,141956,193735


In [3]:
# Checking null values

data.isnull().sum()

Month                             0
Region                            0
Manager                           0
Sales_before_digital_add(in $)    0
Sales_After_digital_add(in $)     0
dtype: int64

In [4]:
# Renaming the column names

data= data.rename(columns={'Sales_before_digital_add(in $)': 'Sales_before', 'Sales_After_digital_add(in $)':'Sales_After' })
data

Unnamed: 0,Month,Region,Manager,Sales_before,Sales_After
0,Month-1,Region - A,Manager - A,132921,270390
1,Month-2,Region - A,Manager - C,149559,223334
2,Month-3,Region - B,Manager - A,146278,244243
3,Month-4,Region - B,Manager - B,152167,231808
4,Month-5,Region - C,Manager - B,159525,258402
5,Month-6,Region - A,Manager - B,137163,256948
6,Month-7,Region - C,Manager - C,130625,222106
7,Month-8,Region - A,Manager - A,131140,230637
8,Month-9,Region - B,Manager - C,171259,226261
9,Month-10,Region - C,Manager - B,141956,193735


In [5]:
# The company wishes to clarify whether there is any increase in sales after stepping into digital marketing.

# setting the Hypothesis for checking the change in sales after digital marketing
#H0= There is no significant increase in sales after digital marketing
#H1= There is significant increase in Sales after digital marketing


In [6]:
# Importing stats

import scipy.stats as stats

# alpha value is set at 0.05

alpha= 0.05

# performing paired t test for checking the increase in sales

t_value, p_value= stats.ttest_rel(data['Sales_After'] , data['Sales_before'])

print("The calulated T score & p-value are : \n  \nt-value = %0.3f  \np-value = %0.3f \n" % (t_value, p_value))

The calulated T score & p-value are : 
  
t-value = 12.091  
p-value = 0.000 



In [7]:
if (alpha> p_value):
    print ('Based on testing carried out, we get a p value which is less than alpha level(0.05), therefore we are failing to accept the null hypothesis')
else:
    print('Based on testing carried out, we get a p value which is greater than alpha level(0.05), therefore we are failing to accept the null hypothesis')
        

Based on testing carried out, we get a p value which is less than alpha level(0.05), therefore we are failing to accept the null hypothesis


The null hypothesis was that there was no significant change after digital marketing, but the test failed to accept the null hypothesis based on the provided data.
Therefore we are accepting the alternative hypothesis,i.e., there is a significant increase in sales after digital marketing. 

In [8]:
#The company needs to check whether there is any dependency between the features “Region” and “Manager”.
#H0= There is no relationship between Region and Manager
#H1= There is a relationship between Region and Manager

In [9]:
#Crosstabulation of Region and Manager

dataset= pd.crosstab(data['Region'], data['Manager'])
dataset

Manager,Manager - A,Manager - B,Manager - C
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Region - A,4,3,3
Region - B,4,1,2
Region - C,1,3,1


In [10]:
#observed values
observed_values =dataset.values
print('Observed values:-\n', observed_values)

Observed values:-
 [[4 3 3]
 [4 1 2]
 [1 3 1]]


In [11]:
# Chi-Square test

value= stats.chi2_contingency(dataset)
value

(3.050566893424036,
 0.5493991051158094,
 4,
 array([[4.09090909, 3.18181818, 2.72727273],
        [2.86363636, 2.22727273, 1.90909091],
        [2.04545455, 1.59090909, 1.36363636]]))

In [12]:
# Expected values

expected_value= value[3]
expected_value

array([[4.09090909, 3.18181818, 2.72727273],
       [2.86363636, 2.22727273, 1.90909091],
       [2.04545455, 1.59090909, 1.36363636]])

In [13]:
# Calculating Degrees of freedom

no_of_rows=len(dataset.iloc[0:3,0])
no_of_columns=len(dataset.iloc[0,0:3])
ddof=(no_of_rows-1)*(no_of_columns-1)
print('Degrees of Freedom:', ddof)
alpha=0.05
from scipy.stats import chi2

Degrees of Freedom: 4


In [14]:
# Calculating Chi-Square Statistics and critical value

chi_square= sum([(o-e)**2./e for o, e in zip(observed_values, expected_value)])
chi_square_statistics= chi_square[0] + chi_square[1]
print('Chi-Square statistics:', chi_square_statistics)
critical_value= chi2.ppf(q=1-alpha,df=ddof)
print('Critical_value:', critical_value)

Chi-Square statistics: 2.921995464852608
Critical_value: 9.487729036781154


In [15]:
# Calculating p_value

p_value=1-chi2.cdf(x= chi_square_statistics, df=ddof)
print('p_value:', p_value)
print('Significance Level:', alpha)
print('Degrees of freedom:', ddof)

p_value: 0.5709629929220089
Significance Level: 0.05
Degrees of freedom: 4


In [16]:
if critical_value<= chi_square_statistics:
    print('Reject H0, That there is a relationship between region and manager')
else:
    print('Accept H0, That there is no relationship between region and manager')
if p_value<= alpha:
    print('Reject H0, That there is a relationship between region and manager')
else:
    print('Accept H0, That there is no relationship between region and manager')

Accept H0, That there is no relationship between region and manager
Accept H0, That there is no relationship between region and manager


Based on the test results of the provided data, there is no relatr