## Načítanie knižníc a trénovacieho datasetu

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as ss
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import math

In [2]:
df = pd.read_parquet("ml_dataset_train.parquet")

In [3]:
df.head()

Unnamed: 0,account_id,n_ad_reward_claims,n_ad_reward_fails,n_ads_watched,n_battlepass_lvls_finished,masked_feature_0,masked_feature_1,masked_feature_2,masked_feature_3,masked_feature_4,...,battlepass_22.0,battlepass_23.0,battlepass_8008.0,first_login_day_time,first_login_country_is_mfreq,time_to_first_purchase,time_to_last_purchase,time_between_last_purchase_last_login,sum_payments_package_key_ltv,n_payments_package_key_ltv
0,100000042,13.0,,26.0,8.0,175,354.0,354.0,281.0,141.0,...,False,False,False,eve,1,,,,,
1,100000053,,,7.0,1.0,25,17.0,17.0,8.0,8.0,...,False,False,False,morning,1,,,,,
2,100000082,,,,,1,,,,,...,False,False,False,eve,1,,,,,
3,100000112,9.0,,36.0,5.0,72,197.0,197.0,194.0,83.0,...,False,False,False,eve,1,,,,,
4,100000122,,,1.0,3.0,67,108.0,108.0,96.0,38.0,...,False,False,False,eve,1,,,,,


In [4]:
df.loc[:,["account_id","n_ads_watched","first_login_day_time","sum_payments_package_key","sum_payments_package_key_ltv"]].sort_values(by="sum_payments_package_key_ltv",ascending=False).head()

Unnamed: 0,account_id,n_ads_watched,first_login_day_time,sum_payments_package_key,sum_payments_package_key_ltv
2494459,169616730,3.0,early_morning,2854.17,22535.53
2508909,170207391,7.0,night,968.61,14256.66
2368471,165357101,1.0,noon,1446.35,11661.82
921994,127826834,,early_morning,984.62,7641.16
2145973,150404211,105.0,early_morning,63.89,6348.67


## Úprava datasetu a vyhodenie skrytých údajov(jednalo sa o citlivé údaje klientov)

In [5]:
empty_array = []
for i in range(0,42):
    empty_array.append("masked_feature_" +str(i))
print(empty_array)

['masked_feature_0', 'masked_feature_1', 'masked_feature_2', 'masked_feature_3', 'masked_feature_4', 'masked_feature_5', 'masked_feature_6', 'masked_feature_7', 'masked_feature_8', 'masked_feature_9', 'masked_feature_10', 'masked_feature_11', 'masked_feature_12', 'masked_feature_13', 'masked_feature_14', 'masked_feature_15', 'masked_feature_16', 'masked_feature_17', 'masked_feature_18', 'masked_feature_19', 'masked_feature_20', 'masked_feature_21', 'masked_feature_22', 'masked_feature_23', 'masked_feature_24', 'masked_feature_25', 'masked_feature_26', 'masked_feature_27', 'masked_feature_28', 'masked_feature_29', 'masked_feature_30', 'masked_feature_31', 'masked_feature_32', 'masked_feature_33', 'masked_feature_34', 'masked_feature_35', 'masked_feature_36', 'masked_feature_37', 'masked_feature_38', 'masked_feature_39', 'masked_feature_40', 'masked_feature_41']


In [6]:
df.drop(['masked_feature_0', 'masked_feature_1', 'masked_feature_2', 'masked_feature_3', 'masked_feature_4', 'masked_feature_5', 'masked_feature_6', 'masked_feature_7', 'masked_feature_8', 'masked_feature_9','masked_feature_13','masked_feature_14','masked_feature_15', 'masked_feature_16', 'masked_feature_17', 'masked_feature_18', 'masked_feature_20', 'masked_feature_21', 'masked_feature_22', 'masked_feature_23', 'masked_feature_24', 'masked_feature_25', 'masked_feature_26', 'masked_feature_27', 'masked_feature_28', 'masked_feature_29', 'masked_feature_30', 'masked_feature_31', 'masked_feature_32', 'masked_feature_33', 'masked_feature_34', 'masked_feature_35', 'masked_feature_36', 'masked_feature_37', 'masked_feature_38', 'masked_feature_39', 'masked_feature_40', 'masked_feature_41'],axis=1,inplace=True)

## Rozdelenie a úprava datasetu pre ďalší postup a popisná štatistika jednotlivých častí pre lepší prehľad

In [7]:
# rozdelenie datasetu na buyerov a non_buyerov
buyers = df.loc[(df["sum_payments_package_key_ltv"]>0)]
non_buyers = df.loc[(df["n_payments_package_key_ltv"].isna())]
buyers_count = len(buyers.index)
non_buyers_count = len(non_buyers.index)
buyers_non_buyers_ratio = round((buyers_count/non_buyers_count)*100,2)
print(f"Percento buyerov v datasete {buyers_non_buyers_ratio}")

Percento buyerov v datasete 2.55


In [8]:
#oddelenie numerických premenných pre buyerov od tých ostatných nakoľko v regresných modeloch budeme pracovať len s numerickými premennými
numeric_columns_buyers = buyers.dtypes[buyers.dtypes!=np.object_].index.tolist()
numeric_variables_buyers = buyers.loc[:,numeric_columns_buyers].astype("float64")
# nahradenie nan hodnôt priemerom príslušných stĺpcov a zbavenie sa boolean hodnot+následne resetnutie indexu aby sme boli schopní pristupovať k dátam 
# cez index
numeric_variables_buyers.drop(["battlepass_0.0","battlepass_22.0","battlepass_23.0","battlepass_8008.0"],inplace=True,axis=1)

In [9]:
# nahradenie nan hodnôt priemerom príslušných stĺpcov a zbavenie sa boolean hodnot+následne resetnutie indexu aby sme boli schopní pristupovať k dátam 
# cez index
for i in range(numeric_variables_buyers.shape[1]):
    numeric_variables_buyers.iloc[:,i].fillna(numeric_variables_buyers.iloc[:,i].mean(),inplace=True)
numeric_variables_buyers.reset_index(inplace=True,drop=True)

In [10]:
#Vyfiltrovanie textových premenných pre buyerov
string_columns_buyers = buyers.dtypes[buyers.dtypes==np.object_].index.tolist()
string_variables_buyers= buyers.loc[:,string_columns_buyers]

In [11]:
#popisná štatistika textových premenných pre buyerov
string_variables_buyers.describe() 

Unnamed: 0,account_id,device_language,first_device_os,first_time_zone,first_device_model,first_network_type,first_login_country,first_device_manufacturer,is_paid_user,form_factor,most_frequent_country,most_frequent_network_type,first_login_weekday,first_login_day_time
count,69858,69841,69841,69841,69841,69841,69841,69841,69841,31040,69841,69841,69858,69858
unique,69858,43,3,32,2182,4,174,117,2,3,174,5,7,6
top,100000423,en,android,+09:00,"iPhone12,1",WIFI,US,Apple,True,Phone,US,WIFI,Sunday,noon
freq,1,36957,35701,9722,5448,45020,20297,34140,65428,30334,20304,43849,11098,14159


In [12]:
#vyfiltrovanie číselných premenných pre non-buyerov
numeric_columns_non_buyers = non_buyers.dtypes[non_buyers.dtypes!=np.object_].index.tolist()
numeric_variables_non_buyers = non_buyers.loc[:,numeric_columns_non_buyers]

In [13]:
#popisná štatistika číselných premenných pre non-buyerov
numeric_variables_non_buyers.describe()

Unnamed: 0,n_ad_reward_claims,n_ad_reward_fails,n_ads_watched,n_battlepass_lvls_finished,n_instant_awards_claims,n_extra_challenge_buys,connected_fb,connect_fb_attempt,n_friend_installs,n_accept_all_friends,...,screen_height,nunique_countries,nunique_network_types,nunique_iaps_bought,first_login_country_is_mfreq,time_to_first_purchase,time_to_last_purchase,time_between_last_purchase_last_login,sum_payments_package_key_ltv,n_payments_package_key_ltv
count,225719.0,182300.0,924496.0,758512.0,6923.0,10646.0,106847.0,144687.0,1463.0,26906.0,...,1147467.0,2742969.0,2742969.0,1282.0,2742985.0,0.0,0.0,0.0,0.0,0.0
mean,6.681932,1.837926,20.405868,3.516323,1.472916,1.115536,1.106114,1.353515,1.101846,1.580354,...,2004.614,1.010239,1.350424,1.054602,0.9972658,,,,,
std,9.125862,2.49289,34.280043,2.04277,0.864203,0.378111,0.418406,0.89016,0.403325,1.529549,...,372.4043,0.11426,0.5387187,0.355812,0.05221851,,,,,
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,1.0,1.0,1.0,0.0,,,,,
25%,4.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1640.0,1.0,1.0,1.0,1.0,,,,,
50%,5.0,1.0,7.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2009.0,1.0,1.0,1.0,1.0,,,,,
75%,9.0,2.0,22.0,5.0,2.0,1.0,1.0,1.0,1.0,2.0,...,2340.0,1.0,2.0,1.0,1.0,,,,,
max,3038.0,201.0,617.0,59.0,26.0,6.0,22.0,38.0,7.0,71.0,...,3840.0,9.0,5.0,8.0,1.0,,,,,


In [14]:
# Vyfiltrovanie textových premenných pre non-buyerov
string_columns_non_buyers = non_buyers.dtypes[non_buyers.dtypes==np.object_].index.tolist()
string_variables_non_buyers = non_buyers.loc[:,string_columns_non_buyers]

In [15]:
# Popisná štatistika textových premenných pre non-buyerov
string_variables_non_buyers.describe()

Unnamed: 0,account_id,device_language,first_device_os,first_time_zone,first_device_model,first_network_type,first_login_country,first_device_manufacturer,is_paid_user,form_factor,most_frequent_country,most_frequent_network_type,first_login_weekday,first_login_day_time
count,2742985,2742969,2742985,2742985,2742985,2742985,2742979,2742985,2742637,1147467,2742969,2742969,2742985,2742985
unique,2742985,122,3,39,10438,7,230,1180,2,3,231,6,7,6
top,100000042,en,android,+01:00,"iPhone12,1",WIFI,US,Apple,False,Phone,US,WIFI,Sunday,noon
freq,1,1377856,1440797,376185,233898,1928017,605961,1302189,2738109,1099691,605395,1897433,427703,586142


## Načítanie a úprava testovacieho datasetu

In [16]:
df_test = pd.read_parquet("ml_dataset_test.parquet")
df_test.drop(['masked_feature_0', 'masked_feature_1', 'masked_feature_2', 'masked_feature_3', 'masked_feature_4', 'masked_feature_5', 'masked_feature_6', 'masked_feature_7', 'masked_feature_8', 'masked_feature_9','masked_feature_13','masked_feature_14','masked_feature_15', 'masked_feature_16', 'masked_feature_17', 'masked_feature_18', 'masked_feature_20', 'masked_feature_21', 'masked_feature_22', 'masked_feature_23', 'masked_feature_24', 'masked_feature_25', 'masked_feature_26', 'masked_feature_27', 'masked_feature_28', 'masked_feature_29', 'masked_feature_30', 'masked_feature_31', 'masked_feature_32', 'masked_feature_33', 'masked_feature_34', 'masked_feature_35', 'masked_feature_36', 'masked_feature_37', 'masked_feature_38', 'masked_feature_39', 'masked_feature_40', 'masked_feature_41'],axis=1,inplace=True)
df_test.head()

Unnamed: 0,account_id,n_ad_reward_claims,n_ad_reward_fails,n_ads_watched,n_battlepass_lvls_finished,n_instant_awards_claims,n_extra_challenge_buys,connected_fb,connect_fb_attempt,n_friend_installs,...,battlepass_22.0,battlepass_23.0,battlepass_8008.0,first_login_day_time,first_login_country_is_mfreq,time_to_first_purchase,time_to_last_purchase,time_between_last_purchase_last_login,sum_payments_package_key_ltv,n_payments_package_key_ltv
12,100000293,,,,2.0,,,2.0,2.0,,...,False,False,False,morning,1,,,,,
13,100000333,,,5.0,2.0,,,,,,...,False,False,False,morning,1,,,,,
19,100000463,,,,,,,,,,...,False,False,False,morning,1,,,,,
24,100000624,,,,,,,,,,...,False,False,False,eve,1,,,,,
28,100000713,,,,,,,,,,...,False,False,False,morning,1,,,,,


In [17]:
buyers_test = df_test.loc[(df_test["sum_payments_package_key_ltv"].notna())]
non_buyers_test = df_test.loc[(df_test["n_payments_package_key_ltv"].isna())]
buyers_count_test = len(buyers_test.index)
non_buyers_count_test = len(non_buyers_test.index)
buyers_non_buyers_ratio_test = round((buyers_count_test/non_buyers_count_test)*100,2)
buyers_non_buyers_ratio_test

2.55

In [48]:
#vyfiltrovanie numerickych premennych
buyers_test_numeric_columns = buyers_test.dtypes[buyers_test.dtypes!=np.object_].index.tolist()
buyers_test_numeric_variables = buyers_test.loc[:,buyers_test_numeric_columns].astype("float64")
# odstraneie boolean premenných pre zjednodušenie procesu a nahradenie nan honotami nulov + resetnutie indexu
buyers_test_numeric_variables.drop(["battlepass_0.0","battlepass_22.0","battlepass_23.0","battlepass_8008.0"],axis=1,inplace = True)
#buyers_test_numeric_variables.fillna(0,inplace = True)
#buyers_test_numeric_variables.reset_index(inplace=True,drop=True)
#buyers_test_numeric_variables.head() 

In [56]:
# v testovacom boli vsetky hodnoty NaN, preto so vyhodil v oboch datasetoch
buyers_test_numeric_variables.drop(["max_session_end_viplevel"],axis=1,inplace=True)
numeric_variables_buyers.drop(["max_session_end_viplevel"],axis=1,inplace=True)

KeyError: "['max_session_end_viplevel'] not found in axis"

In [57]:
for i in range(buyers_test_numeric_variables.shape[1]):
    buyers_test_numeric_variables.iloc[:,i].fillna(buyers_test_numeric_variables.iloc[:,i].mean(),inplace=True)
buyers_test_numeric_variables.reset_index(inplace=True,drop=True)

In [21]:
buyers_test_numeric_variables.describe()

Unnamed: 0,n_ad_reward_claims,n_ad_reward_fails,n_ads_watched,n_battlepass_lvls_finished,n_instant_awards_claims,n_extra_challenge_buys,connected_fb,connect_fb_attempt,n_friend_installs,n_accept_all_friends,...,screen_height,nunique_countries,nunique_network_types,nunique_iaps_bought,first_login_country_is_mfreq,time_to_first_purchase,time_to_last_purchase,time_between_last_purchase_last_login,sum_payments_package_key_ltv,n_payments_package_key_ltv
count,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,...,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0,12347.0
mean,9.040816,2.302998,39.246882,5.493306,1.795134,1.357815,1.130055,1.385053,1.103448,1.879132,...,2111.355398,1.018711,1.779038,2.162364,0.995869,47933.866898,79916.267889,128851.672309,39.847188,4.815097
std,4.235898,1.452575,46.083714,2.551292,0.252834,0.192188,0.197628,0.385524,0.027563,0.473573,...,190.181631,0.148615,0.605821,2.425169,0.064139,62807.728145,79613.747689,83688.356096,198.594481,10.309769
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,854.0,1.0,1.0,1.0,0.0,13.048,16.568,0.411,0.99,1.0
25%,8.0,2.302998,8.0,4.0,1.795134,1.357815,1.130055,1.385053,1.103448,1.879132,...,2009.0,1.0,1.0,1.0,1.0,3762.4805,7341.4925,54920.0915,1.995,1.0
50%,9.040816,2.302998,26.0,5.493306,1.795134,1.357815,1.130055,1.385053,1.103448,1.879132,...,2111.355398,1.0,2.0,1.0,1.0,14875.246,52190.36,134973.114,4.99,2.0
75%,9.040816,2.302998,47.0,7.0,1.795134,1.357815,1.130055,1.385053,1.103448,1.879132,...,2111.355398,1.0,2.0,2.0,1.0,75674.8745,140535.0695,204529.02,22.97,4.0
max,44.0,39.0,635.0,50.0,10.0,8.0,11.0,13.0,3.0,16.0,...,3200.0,4.0,5.0,47.0,1.0,259119.486,259197.446,258530.907,11701.32,368.0


## Lineárna regresia za pomoci statsmodels

In [22]:
X_train = numeric_variables_buyers.iloc[:,:45]
Y_train = numeric_variables_buyers.iloc[:,45].to_frame()

In [23]:
X_test = buyers_test_numeric_variables.iloc[:,:45]
Y_test = buyers_test_numeric_variables.iloc[:,45].to_frame()

In [24]:
dep_var = Y_train
exp_var = X_train
exp_var = sm.add_constant(exp_var) 

In [25]:
ols = sm.OLS(dep_var.astype(float),exp_var.astype(float))
ols_fit = ols.fit()
ols_fit.summary()

0,1,2,3
Dep. Variable:,sum_payments_package_key_ltv,R-squared:,0.429
Model:,OLS,Adj. R-squared:,0.428
Method:,Least Squares,F-statistic:,1164.0
Date:,"Wed, 18 May 2022",Prob (F-statistic):,0.0
Time:,16:56:44,Log-Likelihood:,-447240.0
No. Observations:,69858,AIC:,894600.0
Df Residuals:,69812,BIC:,895000.0
Df Model:,45,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.9043,60.356,0.611,0.541,-81.392,155.201
n_ad_reward_claims,0.1750,0.149,1.171,0.242,-0.118,0.468
n_ad_reward_fails,0.3286,0.391,0.841,0.401,-0.438,1.095
n_ads_watched,-0.0224,0.015,-1.478,0.139,-0.052,0.007
n_battlepass_lvls_finished,-0.9839,0.366,-2.687,0.007,-1.702,-0.266
n_instant_awards_claims,-5.4216,2.425,-2.236,0.025,-10.174,-0.669
n_extra_challenge_buys,10.5771,2.691,3.930,0.000,5.302,15.852
connected_fb,0.0673,3.185,0.021,0.983,-6.175,6.309
connect_fb_attempt,-2.5284,1.738,-1.455,0.146,-5.934,0.877

0,1,2,3
Omnibus:,185366.472,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16381357685.397
Skew:,31.241,Prob(JB):,0.0
Kurtosis:,2374.493,Cond. No.,5530000000.0


In [26]:
adjusted_X_train = X_train.drop(["n_ad_reward_claims","n_ad_reward_fails","n_ads_watched","connected_fb","connect_fb_attempt","n_friend_installs","n_accept_all_friends","n_clicked_add_friends","n_friend_searches","n_calendar_login_days","n_payments_package_key","rated_game","n_remove_ad_clicks","max_session_end_player_level","min_session_start_viplevel","bigmac_dollar_price","screen_density","screen_width","screen_height","nunique_countries","nunique_network_types","first_login_country_is_mfreq","time_between_last_purchase_last_login"],axis=1)
adjusted_X_test = X_test.drop(["n_ad_reward_claims","n_ad_reward_fails","n_ads_watched","connected_fb","connect_fb_attempt","n_friend_installs","n_accept_all_friends","n_clicked_add_friends","n_friend_searches","n_calendar_login_days","n_payments_package_key","rated_game","n_remove_ad_clicks","max_session_end_player_level","min_session_start_viplevel","bigmac_dollar_price","screen_density","screen_width","screen_height","nunique_countries","nunique_network_types","first_login_country_is_mfreq","time_between_last_purchase_last_login"],axis=1)

In [27]:
adj_dep_var = Y_train
adj_exp_var = adjusted_X_train

In [28]:
adj_ols = sm.OLS(adj_dep_var.astype(float),adj_exp_var.astype(float))
adj_ols_fit = adj_ols.fit()
adj_ols_fit.summary()

0,1,2,3
Dep. Variable:,sum_payments_package_key_ltv,R-squared (uncentered):,0.452
Model:,OLS,Adj. R-squared (uncentered):,0.452
Method:,Least Squares,F-statistic:,2620.0
Date:,"Wed, 18 May 2022",Prob (F-statistic):,0.0
Time:,16:56:49,Log-Likelihood:,-447260.0
No. Observations:,69858,AIC:,894600.0
Df Residuals:,69836,BIC:,894800.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
n_battlepass_lvls_finished,-1.0999,0.351,-3.130,0.002,-1.789,-0.411
n_instant_awards_claims,-8.5511,1.998,-4.281,0.000,-12.467,-4.636
n_extra_challenge_buys,7.5685,2.335,3.242,0.001,2.993,12.144
n_accept_friend,-10.0592,1.583,-6.356,0.000,-13.161,-6.957
n_levels_completed,-0.4843,0.175,-2.760,0.006,-0.828,-0.140
n_missions_completed,0.0231,0.007,3.174,0.002,0.009,0.037
n_package_info_offers_viewed,-0.0052,0.001,-4.340,0.000,-0.008,-0.003
sum_payments_package_key,2.6761,0.018,146.999,0.000,2.640,2.712
n_package_tips_offers_viewed,-0.2028,0.033,-6.139,0.000,-0.267,-0.138

0,1,2,3
Omnibus:,185316.881,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16332356042.243
Skew:,31.219,Prob(JB):,0.0
Kurtosis:,2370.942,Cond. No.,192000000.0


## Lineárna regresia za pomoci scikit learn

In [29]:
regr = linear_model.LinearRegression(fit_intercept=True)
regr.fit(X_train,Y_train)
y_pred = regr.predict(X_test)

In [30]:
#score parameters
print(f"R^2 skóre pre tréningový dataset: {regr.score(X_train,Y_train)}")
print(f"R^2 skóre pre testovací dataet: {regr.score(X_test,Y_test)}")
print(f"MAE: {mean_absolute_error(Y_test,y_pred)}")
print(f"MSE: {mean_squared_error(Y_test,y_pred)}")
print(f"RMSE: {math.sqrt(mean_squared_error(Y_test,y_pred))}")

R^2 skóre pre tréningový dataset: 0.4285749032148223
R^2 skóre pre testovací dataet: 0.31797368247379243
MAE: 38.046701899351845
MSE: 26896.781027734618
RMSE: 164.00238116483132


In [31]:
#lineárna regresia po vyhodení nevýznamných premenných
regr = linear_model.LinearRegression(fit_intercept=True)
regr.fit(adjusted_X_train,Y_train)
y_pred = regr.predict(adjusted_X_test)

In [32]:
print(f"R^2 skóre pre tréningový dataset: {regr.score(adjusted_X_train,Y_train)}")
print(f"R^2 skóre pre testovací dataet: {regr.score(adjusted_X_test,Y_test)}")
print(f"MAE: {mean_absolute_error(Y_test,y_pred)}")
print(f"MSE: {mean_squared_error(Y_test,y_pred)}")
print(f"RMSE: {math.sqrt(mean_squared_error(Y_test,y_pred))}")

R^2 skóre pre tréningový dataset: 0.4283600332826919
R^2 skóre pre testovací dataet: 0.32087190966733226
MAE: 37.177736420424665
MSE: 26782.48487787927
RMSE: 163.6535513757012


## Logaritmická regresia s využitím statsmodels

In [33]:
numeric_variables_buyers_log = []
for i in range(0,len(numeric_variables_buyers)):
    numeric_variables_buyers_log.append(math.log(numeric_variables_buyers.iloc[i,45]))

In [34]:
numeric_variables_buyers_log_test = []
for i in range(0,len(buyers_test_numeric_variables)):
    numeric_variables_buyers_log_test.append(math.log(buyers_test_numeric_variables.iloc[i,45]))

In [35]:
log_y_train = {"sum_payments_package_key_ltv": numeric_variables_buyers_log}
log_y_train = pd.DataFrame(log_y_train)

In [36]:
log_y_test = {"sum_payments_package_key_ltv": numeric_variables_buyers_log_test}
log_y_test = pd.DataFrame(log_y_test)

In [37]:
dep_var = log_y_train
exp_var = X_train
exp_var = sm.add_constant(exp_var) 

In [38]:
ols = sm.OLS(dep_var.astype(float),exp_var.astype(float))
ols_fit = ols.fit()
ols_fit.summary()

0,1,2,3
Dep. Variable:,sum_payments_package_key_ltv,R-squared:,0.491
Model:,OLS,Adj. R-squared:,0.49
Method:,Least Squares,F-statistic:,1495.0
Date:,"Wed, 18 May 2022",Prob (F-statistic):,0.0
Time:,16:57:03,Log-Likelihood:,-109890.0
No. Observations:,69858,AIC:,219900.0
Df Residuals:,69812,BIC:,220300.0
Df Model:,45,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.2044,0.482,-0.424,0.672,-1.150,0.741
n_ad_reward_claims,-0.0050,0.001,-4.145,0.000,-0.007,-0.003
n_ad_reward_fails,-0.0004,0.003,-0.130,0.897,-0.007,0.006
n_ads_watched,-0.0008,0.000,-6.873,0.000,-0.001,-0.001
n_battlepass_lvls_finished,0.0230,0.003,7.852,0.000,0.017,0.029
n_instant_awards_claims,0.0132,0.019,0.683,0.495,-0.025,0.051
n_extra_challenge_buys,-0.3046,0.022,-14.159,0.000,-0.347,-0.262
connected_fb,-0.0465,0.025,-1.827,0.068,-0.096,0.003
connect_fb_attempt,0.0257,0.014,1.848,0.065,-0.002,0.053

0,1,2,3
Omnibus:,5307.832,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13274.354
Skew:,0.456,Prob(JB):,0.0
Kurtosis:,4.931,Cond. No.,5530000000.0


In [39]:
#logaritmická regresia po vyhodení nevýznamných premenných
dep_var = log_y_train
exp_var = X_train.drop(["n_ad_reward_fails","n_instant_awards_claims","connected_fb","connect_fb_attempt","n_accept_all_friends","n_clicked_add_friends","n_friend_searches","n_remove_ad_clicks","n_time_skip_buys","avg_country_spend_30d","nunique_countries","first_login_country_is_mfreq"],axis=1)
exp_var = sm.add_constant(exp_var) #bola aj tak nevyznamna a zvysilo to kvalitu modelu plus umoznilo robit predict cez tuto kniznicu

In [40]:
ols = sm.OLS(dep_var.astype(float),exp_var.astype(float))
ols_fit = ols.fit()
ols_fit.summary()

0,1,2,3
Dep. Variable:,sum_payments_package_key_ltv,R-squared:,0.491
Model:,OLS,Adj. R-squared:,0.49
Method:,Least Squares,F-statistic:,2038.0
Date:,"Wed, 18 May 2022",Prob (F-statistic):,0.0
Time:,16:57:04,Log-Likelihood:,-109900.0
No. Observations:,69858,AIC:,219900.0
Df Residuals:,69824,BIC:,220200.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.1700,0.469,-0.362,0.717,-1.089,0.749
n_ad_reward_claims,-0.0052,0.001,-4.361,0.000,-0.007,-0.003
n_ads_watched,-0.0008,0.000,-6.875,0.000,-0.001,-0.001
n_battlepass_lvls_finished,0.0233,0.003,7.971,0.000,0.018,0.029
n_extra_challenge_buys,-0.3117,0.021,-14.725,0.000,-0.353,-0.270
n_friend_installs,0.2476,0.118,2.100,0.036,0.016,0.479
n_accept_friend,-0.0564,0.014,-4.110,0.000,-0.083,-0.030
n_levels_completed,-0.0154,0.001,-10.566,0.000,-0.018,-0.013
n_calendar_login_days,0.0019,0.001,3.113,0.002,0.001,0.003

0,1,2,3
Omnibus:,5319.066,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13412.367
Skew:,0.454,Prob(JB):,0.0
Kurtosis:,4.945,Cond. No.,5470000000.0


## Logaritmická regresia s využitím scikit learn

In [41]:
log_regr = linear_model.LinearRegression()
log_regr.fit(X_train,log_y_train)
log_y_pred = log_regr.predict(X_test)

In [42]:
print(f"R^2 skóre pre trénovací dataset: {log_regr.score(X_train,log_y_train)}")
print(f"R^2 skóre pre testovací dataset: {log_regr.score(X_test,log_y_test)}")
print(f"MAE: {mean_absolute_error(log_y_test,log_y_pred)}")
print(f"MSE: {mean_squared_error(log_y_test,log_y_pred)}")
print(f"RMSE: {math.sqrt(mean_squared_error(log_y_test,log_y_pred))}")

R^2 skóre pre trénovací dataset: 0.49074346359152077
R^2 skóre pre testovací dataset: 0.4329984884503193
MAE: 0.927477296373004
MSE: 1.5069509691276135
RMSE: 1.2275793127646024


In [43]:
#logaritmicka regresia po vyhodení nevýznamných premenných
adj_for_log_X_test = X_test.drop(["n_ad_reward_fails","n_instant_awards_claims","connected_fb","connect_fb_attempt","n_accept_all_friends","n_clicked_add_friends","n_friend_searches","n_remove_ad_clicks","n_time_skip_buys","avg_country_spend_30d","nunique_countries","first_login_country_is_mfreq"],axis=1)
adj_for_log_X_train = X_train.drop(["n_ad_reward_fails","n_instant_awards_claims","connected_fb","connect_fb_attempt","n_accept_all_friends","n_clicked_add_friends","n_friend_searches","n_remove_ad_clicks","n_time_skip_buys","avg_country_spend_30d","nunique_countries","first_login_country_is_mfreq"],axis=1)
log_regr = linear_model.LinearRegression()
log_regr.fit(adj_for_log_X_train,log_y_train)
log_y_pred = log_regr.predict(adj_for_log_X_test)

In [44]:
print(f"R^2 skóre pre trénovací dataset: {log_regr.score(adj_for_log_X_train,log_y_train)}")
print(f"R^2 skóre pre testovací dataset: {log_regr.score(adj_for_log_X_test,log_y_test)}")
print(f"MAE: {mean_absolute_error(log_y_test,log_y_pred)}")
print(f"MSE: {mean_squared_error(log_y_test,log_y_pred)}")
print(f"RMSE:{math.sqrt(mean_squared_error(log_y_test,log_y_pred))}")

R^2 skóre pre trénovací dataset: 0.4906413889100434
R^2 skóre pre testovací dataset: 0.4325305306110965
MAE: 0.9278944204517513
MSE: 1.5081946863046638
RMSE:1.228085781329897


In [45]:
numeric_variables_buyers.shape

(69858, 47)