# 라이브러리 및 데이터 불러오기

In [1]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency

In [2]:
df = pd.read_csv("./data/preprocessed_cookie_cat.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25000 non-null  int64 
 1   userid           25000 non-null  int64 
 2   version          25000 non-null  object
 3   sum_gamerounds   25000 non-null  int64 
 4   retention_1      25000 non-null  bool  
 5   retention_7      25000 non-null  bool  
 6   gameround_group  25000 non-null  object
dtypes: bool(2), int64(3), object(2)
memory usage: 1.0+ MB


In [4]:
df

Unnamed: 0.1,Unnamed: 0,userid,version,sum_gamerounds,retention_1,retention_7,gameround_group
0,1,337,gate_30,38,True,False,A
1,2,377,gate_40,165,True,False,N
2,4,488,gate_40,179,True,True,O
3,5,540,gate_40,187,True,True,P
4,8,1574,gate_40,108,True,True,H
...,...,...,...,...,...,...,...
24995,90171,9997757,gate_30,84,True,False,F
24996,90175,9998125,gate_30,30,False,False,A
24997,90177,9998376,gate_40,53,False,False,C
24998,90184,9999441,gate_40,97,True,False,G


# gate_30 / gate_40 - Paired Sample T-Test

$$H_{0}: \mu_{30}-\mu_{40}=0$$
$$H_{a}: \mu_{30}-\mu_{40}\neq0$$

## 등분산 검정

In [5]:
gate_30_rounds = df[df["version"] == "gate_30"]["sum_gamerounds"]
gate_40_rounds = df[df["version"] == "gate_40"]["sum_gamerounds"]

In [6]:
_,p_value_levene = stats.levene(gate_30_rounds, gate_40_rounds)

In [7]:
if p_value_levene > 0.05:
    print(p_value_levene, "등분산 가정 만족")
else:
    print(p_value_levene, "이분산 가정 만족")

0.3510470259494839 등분산 가정 만족


## T-Test 수행

In [8]:
t, p_value = stats.ttest_ind(
    a = gate_30_rounds,
    b = gate_40_rounds,
    alternative = "two-sided",
    equal_var = True
)

In [9]:
print(f"p-value : {p_value}")
print(f"귀무가설 기각 : {p_value < 0.05}")

p-value : 7.396046609212658e-121
귀무가설 기각 : True


# version / retention_1 - Chi Square Test

귀무가설 : version 과 retention_1은 독립이다.  
대립가설 : version 과 retention_1은 독립이 아니다.

## 분할표 및 상대 도수 분할표 그리기

In [10]:
retention_1_c_table = pd.crosstab(df["version"], df["retention_1"],
                     margins = True)

In [14]:
retention_1_c_table

retention_1,False,True,All
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gate_30,3143,10784,13927
gate_40,2158,8915,11073
All,5301,19699,25000


In [12]:
retention_1_rfc_table = pd.crosstab(df["version"], df["retention_1"],
                        margins = True, normalize = True)

In [15]:
retention_1_rfc_table

retention_1,False,True,All
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gate_30,0.12572,0.43136,0.55708
gate_40,0.08632,0.3566,0.44292
All,0.21204,0.78796,1.0


## Chi Square Test 수행

In [16]:
chi2_statistics, p_value, _, _ = chi2_contingency(retention_1_c_table)

In [17]:
print(f"Chi Squeare Statistic : {chi2_statistics}")
print(f"P-Value : {p_value}")
print(f"귀무가설 기각 : {p_value < 0.05}")

Chi Squeare Statistic : 34.99706547056732
P-Value : 4.651800425809452e-07
귀무가설 기각 : True


# version / retention_7 - Chi Square Test

귀무가설 : version 과 retention_7은 독립이다.  
대립가설 : version 과 retention_7은 독립이 아니다.

## 분할표 및 상대 도수 분할표 그리기

In [18]:
retention_7_c_table = pd.crosstab(df["version"], df["retention_7"],
                     margins = True)

In [19]:
retention_7_c_table

retention_7,False,True,All
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gate_30,8922,5005,13927
gate_40,6695,4378,11073
All,15617,9383,25000


In [20]:
retention_7_rfc_table = pd.crosstab(df["version"], df["retention_7"],
                                   margins = True, normalize = True)

In [21]:
retention_7_rfc_table

retention_7,False,True,All
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gate_30,0.35688,0.2002,0.55708
gate_40,0.2678,0.17512,0.44292
All,0.62468,0.37532,1.0


## Chi Square Test 수행

In [22]:
chi2_statistics, p_value, _, _, = chi2_contingency(retention_7_c_table)

In [23]:
print(f"Chi Squeare Statistic : {chi2_statistics}")
print(f"P-Value : {p_value}")
print(f"귀무가설 기각 : {p_value < 0.05}")

Chi Squeare Statistic : 34.102257998318635
P-Value : 7.100568725708325e-07
귀무가설 기각 : True


# retention_1 / retention_7 - Chi Square Test

귀무가설 : retention_1 과 retention_7은 독립이다.  
대립가설 : retention_1 과 retention_7은 독립이 아니다.

## 분할표 및 상대 도수 분할표 그리기

In [24]:
retention_c_table = pd.crosstab(df["retention_1"], df["retention_7"],
                     margins = True)

In [25]:
retention_c_table

retention_7,False,True,All
retention_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,3459,1842,5301
True,12158,7541,19699
All,15617,9383,25000


In [26]:
retention_rfc_table = pd.crosstab(df["retention_1"], df["retention_7"],
                                   margins = True, normalize = True)

In [27]:
retention_rfc_table

retention_7,False,True,All
retention_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.13836,0.07368,0.21204
True,0.48632,0.30164,0.78796
All,0.62468,0.37532,1.0


## Chi Square Test 수행

In [29]:
chi2_statistics, p_value, _, _, = chi2_contingency(retention_c_table)

In [30]:
print(f"Chi Squeare Statistic : {chi2_statistics}")
print(f"P-Value : {p_value}")
print(f"귀무가설 기각 : {p_value < 0.05}")

Chi Squeare Statistic : 22.23732989406514
P-Value : 0.00017975457218262524
귀무가설 기각 : True


# Result

### gate_30 접속율

In [52]:
gate_30_retention_1_t = df[(df["version"] == "gate_30") & (df["retention_1"] == True)]["retention_1"].count()
gate_30_retention_1_f = df[(df["version"] == "gate_30") & (df["retention_1"] == False)]["retention_1"].count()

In [56]:
gate_30_retention_7_t = df[(df["version"] == "gate_30") & (df["retention_7"] == True)]["retention_7"].count()
gate_30_retention_7_f = df[(df["version"] == "gate_30") & (df["retention_7"] == False)]["retention_7"].count()

In [78]:
print(f"1일 뒤 접속율 : {round((gate_30_retention_1_t / (gate_30_retention_1_t + gate_30_retention_1_f)) * 100, 2)}%")
print(f"1일 뒤 미접속율 : {round((gate_30_retention_1_f / (gate_30_retention_1_t + gate_30_retention_1_f)) * 100, 2)}%")
print("----------------------")
print(f"7일 뒤 접속율 : {round((gate_30_retention_7_t / (gate_30_retention_7_t + gate_30_retention_7_f)) * 100, 2)}%")
print(f"7일 뒤 미접속율 : {round((gate_30_retention_7_f / (gate_30_retention_7_t + gate_30_retention_7_f)) * 100, 2)}%")

1일 뒤 접속율 : 77.43%
1일 뒤 미접속율 : 22.57%
----------------------
7일 뒤 접속율 : 35.94%
7일 뒤 미접속율 : 64.06%


### gate_40 접속율

In [58]:
gate_40_retention_1_t = df[(df["version"] == "gate_40") & (df["retention_1"] == True)]["retention_1"].count()
gate_40_retention_1_f = df[(df["version"] == "gate_40") & (df["retention_1"] == False)]["retention_1"].count()

In [60]:
gate_40_retention_7_t = df[(df["version"] == "gate_40") & (df["retention_7"] == True)]["retention_1"].count()
gate_40_retention_7_f = df[(df["version"] == "gate_40") & (df["retention_7"] == False)]["retention_1"].count()

In [79]:
print(f"1일 뒤 접속율 : {round((gate_40_retention_1_t / (gate_40_retention_1_t + gate_40_retention_1_f)) * 100, 2)}%")
print(f"1일 뒤 미접속율 : {round((gate_40_retention_1_f / (gate_40_retention_1_t + gate_40_retention_1_f)) * 100, 2)}%")
print("----------------------")
print(f"7일 뒤 접속율 : {round((gate_40_retention_7_t / (gate_40_retention_7_t + gate_40_retention_7_f)) * 100, 2)}%")
print(f"7일 뒤 미접속율 : {round((gate_40_retention_7_f / (gate_40_retention_7_t + gate_40_retention_7_f)) * 100, 2)}%")

1일 뒤 접속율 : 80.51%
1일 뒤 미접속율 : 19.49%
----------------------
7일 뒤 접속율 : 39.54%
7일 뒤 미접속율 : 60.46%


- version 과 retention 은 독립이 아닌 종속관계임을 카이제곱검정으로 확인.
- gate_40을 설정한 그룹이 gate_30을 설정한 그룹보다 전체적인 미접속율이 낮음.
- 코치님의 조언으로 게임을 실제로 진행해보니 게임 진행 속도가 생각보다 굉장히 빠름을 알 수 있었음.
- 30라운드는 생각 이상으로 빠르게 도달할 수 있으며 그로 인한 짧은 플레이타임으로 게임의 재미를 다 느끼기 전에 막힐 가능성이 상당함.
- gate를 40으로 미뤄 미접속율을 낮추는 것이 도움이 될 것이다.