In [1]:
import numpy as np
import pandas as pd


import statsmodels.api  as  sm
import scipy.stats as stats

import warnings
warnings.filterwarnings('ignore')

In [102]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
data =  pd.concat([train,test])
print(train.shape,test.shape,data.shape)

(550068, 12) (233599, 11) (783667, 12)


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [5]:
train['Gender'] = train['Gender'].astype("object")
train['Age'] = (train['Age']).astype("object")
train['Occupation'] = (train['Occupation']).astype("object")
train['City_Category'] = (train['City_Category']).astype("object")
train['Stay_In_Current_City_Years'] = (train['Stay_In_Current_City_Years']).astype("object")
train['Marital_Status'] = (train['Marital_Status']).astype("object")
train['Product_Category_1'] = (train['Product_Category_1']).astype("object")

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null object
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null object
Product_Category_1            550068 non-null object
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(2), object(8)
memory usage: 50.4+ MB


## Droping the columns that are not useful for statistical analysis


* User_ID ,Product_ID are not useful
* Product_Category_2 & Product_Category_3 contains null values hence lets ignore that 

# 1: Checking Dependents of Gender and Product_Category_1

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Gender and Product_Category_1 are Independent.**

**Alternative hypothesis: $H_A$: Gender and Product_Category_1 are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables*

* Step 4: Calculate p value or chi-square statistic value

In [7]:
Gen_tab = pd.crosstab(index=train["Gender"], columns=train["Product_Category_1"])
Gen_array = np.array(Gen_tab)
Gen_tab

Product_Category_1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
F,24831,5658,6006,3639,41961,4559,943,33558,70,1162,4739,1532,1462,623,1046,2402,62,382,451,723
M,115547,18206,14207,8114,108972,15907,2778,80367,340,3963,19548,2415,4087,900,5244,7426,516,2743,1152,1827


In [8]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(Gen_array)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 8070.41102 P value 0.000000 Degrees of freedom 19


**Step 5: Decide to reject or accept null hypothesis¶**
* Here, p value is 0 and < 0.05 so we reject the null hypothesis.
* Going with the Alternative hypothesis: $H_A$: Gender and Product_Category_1 are Not Independent. This means they are dependent.

# 2: Checking Dependents of Age and Product_Category_1

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Age and Product_Category_1 are Independent.**

**Alternative hypothesis: $H_A$: Age and Product_Category_1 are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables**

* Step 4: Calculate p value or chi-square statistic value

In [9]:
Age_tab = pd.crosstab(index=train["Age"], columns=train["Product_Category_1"])
Age_arr = np.array(Age_tab)
Age_tab

Product_Category_1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0-17,3585,805,1200,758,4330,399,53,2258,16,111,740,125,112,39,160,229,6,27,59,90
18-25,26962,4428,4710,2463,28522,3749,481,17911,63,603,4597,439,756,230,1024,1598,41,339,275,469
26-35,58249,8928,7662,4192,61473,8485,1651,44256,154,1787,9874,1096,2096,564,2372,4118,127,1042,563,898
36-45,27648,4912,3854,2354,29377,3899,809,23296,107,1235,4953,994,1250,312,1395,1955,135,702,320,506
46-50,10474,2105,1376,990,11971,1622,327,10656,33,520,2104,520,551,149,602,879,95,351,149,227
51-55,9049,1781,924,678,9893,1450,266,9340,29,519,1458,433,483,154,508,672,107,423,134,200
55+,4411,905,487,318,5367,862,134,6208,8,350,561,340,301,75,229,377,67,241,103,160


In [10]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(Age_arr)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 7637.28655 P value 0.000000 Degrees of freedom 114


**Step 5: Decide to reject or accept null hypothesis¶**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis:  𝐻𝐴 : age and Product_Category_1 are Not Independent. This means they are dependent.

# 3: Checking Dependents of Occupation and Product_Category_1

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Occupation and Product_Category_1 are Independent.**

**Alternative hypothesis: $H_A$: Occupation and Product_Category_1 are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables**

* Step 4: Calculate p value or chi-square statistic value

In [11]:
Occ_tab = pd.crosstab(index=data["Occupation"], columns=data["Product_Category_1"])
Occ_array = np.array(Occ_tab)

Occ_tab

Product_Category_1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,25066,4251,3707,2123,27141,3694,792,19874,72,897,5336,633,1025,285,1019,1794,82,580,188,291
1,14733,2755,2195,1369,18887,2561,537,17186,44,761,2353,639,787,238,731,1121,88,344,139,219
2,8300,1550,1390,744,10904,1473,342,8943,22,409,1634,340,374,110,428,621,22,205,62,123
3,5577,982,883,557,7508,982,194,5465,20,284,1014,237,300,97,244,506,19,143,49,65
4,27458,4379,4692,2425,29259,3842,640,19724,65,679,4538,541,845,263,1085,1687,67,380,178,339
5,5183,827,632,344,4756,549,55,2730,17,138,1134,104,133,30,199,317,11,135,24,55
6,6673,1189,1060,639,8094,1037,214,6877,20,265,1206,241,290,104,291,569,30,141,63,99
7,22998,3502,2329,1596,22221,3262,484,17981,59,779,3399,634,860,228,909,1653,114,659,192,268
8,719,144,99,61,538,59,7,357,4,18,89,4,13,2,37,20,2,7,5,4
9,1642,404,494,334,3048,263,23,1877,4,48,244,119,88,39,51,154,4,31,22,40


In [12]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(Occ_array)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 13784.49193 P value 0.000000 Degrees of freedom 380


**Step 5: Decide to reject or accept null hypothesis**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis: $H_A$: Occupation and Product_Category_1 are Not Independent. This means they are dependent.

# Checking Dependents of Marital_Status and Product_Category_1

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Marital_Status and Product_Category_1 are Independent.**

**Alternative hypothesis: $H_A$: Marital_Status and Product_Category_1 are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables** 

* Step 4: Calculate p value or chi-square statistic value

In [13]:
Marital_Status_tab = pd.crosstab(index=data["Marital_Status"], columns=data["Product_Category_1"])
Marital_Status_array = np.array(Marital_Status_tab)
Marital_Status_tab

Product_Category_1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,120577,20230,17688,10241,128015,17388,2981,93120,361,3967,20831,2913,4477,1226,5145,8170,415,2367,946,1480
1,80122,13826,11103,6515,87935,11938,2364,69174,243,3406,13609,2697,3453,960,3839,5763,386,2069,657,1070


In [14]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(Marital_Status_array)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 755.29005 P value 0.000000 Degrees of freedom 19


**Step 5: Decide to reject or accept null hypothesis**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.
* Going with the Alternative hypothesis: $H_A$: Marital_Status and Product_Category_1 are Not Independent. This means they are dependent.

# Checking Dependency of Marital_Status and Stay_In_Current_City_Years

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Marital_Status and Stay_In_Current_City_Years are Independent.**

**Alternative hypothesis: $H_A$: Marital_Status and Stay_In_Current_City_Years are Not Independent.*

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables** 

* Step 4: Calculate p value or chi-square statistic value

In [15]:
Mar_tab = pd.crosstab(index=data["Marital_Status"], columns=data["Stay_In_Current_City_Years"])
Mar_array = np.array(Mar_tab)
Mar_tab

Stay_In_Current_City_Years,0,1,2,3,4+
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,63842,157873,86730,81802,72291
1,41874,118552,58697,53626,48380


In [16]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(Mar_array)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 667.25198 P value 0.000000 Degrees of freedom 4


**Step 5: Decide to reject or accept null hypothesis¶**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis: Marital_Status and Stay_In_Current_City_Years are Not Independent. This means they are dependent.

# Checking Dependency of Marital_Status and City_Category

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Marital_Status and City_Category are Independent.**

**Alternative hypothesis: $H_A$: Marital_Status and City_Category are Not Independent.**

* Step 2: Decide the significance level¶

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables**

* Step 4: Calculate p value or chi-square statistic value

In [17]:
City_tab = pd.crosstab(index=data["Marital_Status"], columns=data["City_Category"])
City_array = np.array(City_tab)
City_tab

City_Category,A,B,C
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,129690,195153,137695
1,80554,134586,105989


In [18]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(City_array)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 1258.28019 P value 0.000000 Degrees of freedom 2


**Step 5: Decide to reject or accept null hypothesis**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis: Marital_Status and City_Category are Not Independent. This means they are dependent.

# Checking Dependency of Marital_Status and Occupation

* Step 1: State the null and alternative hypothesis:

**Null hypothesis: $H_0$: Marital_Status and Occupation are Independent.**

**Alternative hypothesis: $H_A$: Marital_Status and Occupation are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables** 

* Step 4: Calculate p value or chi-square statistic value

In [19]:
MarOccupation_tab = pd.crosstab(index=data["Marital_Status"], columns=data["Occupation"])
MarOccupation_array = np.array(MarOccupation_tab)
MarOccupation_tab

Occupation,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,57450,35360,20457,14700,75126,10810,15986,47457,961,4378,...,10287,23756,5183,22724,9566,19014,32638,5101,9750,24591
1,41400,32327,17539,10426,27960,6563,13116,36670,1228,4551,...,6306,20692,5870,16058,7790,17108,24780,4266,2169,23249


In [20]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(MarOccupation_array)
print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 26306.09627 P value 0.000000 Degrees of freedom 20


**Step 5: Decide to reject or accept null hypothesis¶**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis: Marital_Status and Occupation are Not Independent. This means they are dependent.

# Checking Dependency of Marital_Status and Age

* Step 1: State the null and alternative hypothesis:¶

**Null hypothesis: $H_0$: Marital_Status and Age are Independent.**

**Alternative hypothesis: $H_A$: Marital_Status and Age are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables**

* Step 4: Calculate p value or chi-square statistic value

In [21]:
MarAge_tab = pd.crosstab(index=data["Marital_Status"], columns=data["Age"])
MarAge_array = np.array(MarAge_tab)
MarAge_tab

Age,0-17,18-25,26-35,36-45,46-50,51-55,55+
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,21334,111821,189800,94766,18205,15352,11260
1,0,30132,123215,61958,47073,39432,19319


In [22]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(MarAge_array)
print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 92339.66563 P value 0.000000 Degrees of freedom 6


**Step 5: Decide to reject or accept null hypothesis**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis: Marital_Status and Age are Not Independent. This means they are dependent

# Checking Dependency of Marital_Status and Gender

* Step 1: State the null and alternative hypothesis:¶

**Null hypothesis: $H_0$: Marital_Status and Gender are Independent.**

**Alternative hypothesis: $H_A$: Marital_Status and Gender are Not Independent.**

* Step 2: Decide the significance level

**Here we select α = 0.05**

* Step 3: Identify the test statistic

**We use the chi-square test of independence to find out the difference of categorical variables**

* Step 4: Calculate p value or chi-square statistic value

In [23]:
MarGender_tab = pd.crosstab(index=data["Marital_Status"], columns=data["Gender"])
MarGender_array = np.array(MarGender_tab)
MarGender_tab

Gender,F,M
Marital_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
0,112469,350069
1,81167,239962


In [24]:
chi_sq_Stat, p_value, deg_freedom, exp_freq = stats.chi2_contingency(MarGender_array)

print('Chi-square statistic %3.5f P value %1.6f Degrees of freedom %d' %(chi_sq_Stat, p_value,deg_freedom))

Chi-square statistic 93.82108 P value 0.000000 Degrees of freedom 1


**Step 5: Decide to reject or accept null hypothesis¶**

* Here, p value is 0 and < 0.05 so we reject the null hypothesis.

* Going with the Alternative hypothesis: Marital_Status and Gender are Not Independent. This means they are dependent.

# ANOVA of Purchase based on
   - Gender
   - City Category
   - Marital Status


# Gender

In [25]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison

In [103]:
train.drop(["User_ID","Product_ID","Product_Category_3"],axis=1,inplace=True)

In [40]:
train.columns

Index(['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Purchase'],
      dtype='object')

In [46]:
modelGender = ols(formula= 'Purchase ~ (Gender)', data=train[0:10000]).fit()

aov_table = anova_lm(modelGender, type = 2)
print(aov_table)

              df        sum_sq       mean_sq          F        PR(>F)
Gender       1.0  1.123042e+09  1.123042e+09  47.346759  6.304179e-12
Residual  9998.0  2.371477e+11  2.371952e+07        NaN           NaN


In [91]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")

The means are not equal


# City Category

In [50]:
modelCity = ols(formula= 'Purchase ~ C(City_Category)', data=train[0:10000]).fit()

aov_table = anova_lm(modelCity, type = 2)
print(aov_table)

                      df        sum_sq       mean_sq          F        PR(>F)
C(City_Category)     2.0  1.763421e+09  8.817105e+08  37.269283  7.485133e-17
Residual          9997.0  2.365074e+11  2.365783e+07        NaN           NaN


In [92]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")


The means are not equal


In [93]:
mcCity = MultiComparison(train.Purchase, train.City_Category)
resultCity = mcCity.tukeyhsd(alpha=0.05)

print(resultCity)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
     A      B 239.3613 0.001 200.2275 278.4952   True
     A      C 807.9818 0.001 766.2596 849.7039   True
     B      C 568.6204 0.001 531.1581 606.0828   True
-----------------------------------------------------


In [94]:
print(train[train['City_Category'] == 'A']['Purchase'].mean())
print(train[train['City_Category'] == 'B']['Purchase'].mean())
print(train[train['City_Category'] == 'C']['Purchase'].mean())

8911.939216084484
9151.300562781986
9719.92099313568


# Marital Status

In [95]:
modelMarriage = ols(formula= 'Purchase ~ C(Marital_Status)', data=train[0:10000]).fit()

aov_table = anova_lm(modelMarriage, type = 2)
print(aov_table)



                       df        sum_sq       mean_sq         F    PR(>F)
C(Marital_Status)     1.0  1.311806e+05  1.311806e+05  0.005504  0.940859
Residual           9998.0  2.382707e+11  2.383183e+07       NaN       NaN


In [96]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal, apply TUKEY-KRAMER's Test")
else:
    print("The means are equal")

The means are equal


# Occupation

In [72]:
from scipy.stats import f_oneway
f_oneway(train["Occupation"],train["Purchase"])

F_onewayResult(statistic=1867731.9230829645, pvalue=0.0)

In [73]:
modelCity = ols(formula= 'Purchase ~ C(Occupation)', data=train).fit()

aov_table = anova_lm(modelCity, type = 2)
print(aov_table)


                     df        sum_sq       mean_sq          F  PR(>F)
C(Occupation)      20.0  4.951671e+10  2.475836e+09  98.473775     0.0
Residual       550047.0  1.382933e+13  2.514208e+07        NaN     NaN


In [74]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")

The means are not equal


# AGE

In [80]:
age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
train["Age"] = train["Age"].apply(lambda x: age_dict[x])

In [81]:
f_oneway(train["Age"],train["Purchase"])

F_onewayResult(statistic=1869987.6901634906, pvalue=0.0)

In [82]:
modelCity = ols(formula= 'Purchase ~ C(Age)', data=train).fit()

aov_table = anova_lm(modelCity, type = 2)
print(aov_table)

                df        sum_sq       mean_sq          F        PR(>F)
C(Age)         6.0  6.140003e+09  1.023334e+09  40.575799  1.053564e-49
Residual  550061.0  1.387270e+13  2.522030e+07        NaN           NaN


In [83]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")

The means are not equal


# Stay_In_Current_City_Years

In [86]:
stay_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4}
train["Stay_In_Current_City_Years"] = train["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x])

In [87]:
f_oneway(train["Stay_In_Current_City_Years"],train["Purchase"])

F_onewayResult(statistic=1870245.3541570134, pvalue=0.0)

In [88]:
modelCity = ols(formula= 'Purchase ~ C(Stay_In_Current_City_Years)', data=train).fit()

aov_table = anova_lm(modelCity, type = 2)
print(aov_table)

                                     df        sum_sq       mean_sq         F  \
C(Stay_In_Current_City_Years)       4.0  9.468844e+08  2.367211e+08  9.382655   
Residual                       550063.0  1.387790e+13  2.522965e+07       NaN   

                                     PR(>F)  
C(Stay_In_Current_City_Years)  1.401144e-07  
Residual                                NaN  


In [89]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")

The means are not equal


# Product_Category_1

In [98]:
f_oneway(train["Product_Category_1"],train["Purchase"])

F_onewayResult(statistic=1868812.6160895713, pvalue=0.0)

In [100]:
modelCity = ols(formula= 'Purchase ~ C(Product_Category_1)', data=train).fit()

aov_table = anova_lm(modelCity)
print(aov_table)

                             df        sum_sq       mean_sq             F  \
C(Product_Category_1)      19.0  8.834630e+12  4.649805e+11  50703.974421   
Residual               550048.0  5.044212e+12  9.170495e+06           NaN   

                       PR(>F)  
C(Product_Category_1)     0.0  
Residual                  NaN  


In [101]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")

The means are not equal


# Product_Category_2

In [104]:
train["Product_Category_2"]=train["Product_Category_2"].fillna(-99)

In [105]:
f_oneway(train["Product_Category_2"],train["Purchase"])

F_onewayResult(statistic=1880719.6534673236, pvalue=0.0)

In [106]:
modelCity = ols(formula= 'Purchase ~ C(Product_Category_2)', data=train).fit()

aov_table = anova_lm(modelCity)
print(aov_table)

                             df        sum_sq       mean_sq            F  \
C(Product_Category_2)      17.0  2.177073e+12  1.280631e+11  6019.698014   
Residual               550050.0  1.170177e+13  2.127401e+07          NaN   

                       PR(>F)  
C(Product_Category_2)     0.0  
Residual                  NaN  


In [107]:
if aov_table['PR(>F)'][0] <0.05:
    print("The means are not equal")
else:
    print("The means are equal")

The means are not equal
