In [None]:
%pip install scikit-learn
%pip install pandas
%pip install seaborn
%pip install matplotlib
%pip install pyQt5
%pip install xgboost
# or install tkinter for using TkAgg as matplotlib backend

In [1]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the training registration data; `df` is the base frame used by the plotting and modelling cells.
df = pd.read_csv(filepath_or_buffer='registration_data_training.csv')

In [None]:
# Optional: run this cell if you want to see the plots.
# If Qt5Agg does not work, other backends to try are TkAgg and MacOSX.
matplotlib.use(backend='Qt5Agg')

Note: each plot in the cells below blocks further execution until its window is closed.

In [None]:
# Scatter plot of each remaining column against the target, one figure per column.
target = 'days_active_first_28_days_after_registration'
# The target itself and these columns are not plotted.
scatter_excluded = {target, 'user_id', 'registration_time_utc', 'registration_platform_specific', 'registration_country', 'registration_store'}

# Set the style before any figure is created: it only applies to axes created
# after the call, so setting it inside the loop left the first plot unstyled.
sns.set_style('whitegrid')
for column in df.columns:
    if column in scatter_excluded:
        continue
    sns.scatterplot(x=column, y=target, data=df, alpha=0.7)
    # Name the plotted pair so each figure can be identified on its own.
    plt.title(f'{target} vs {column}')
    plt.xlabel(column)
    plt.ylabel('Target Value')
    plt.show()

In [None]:
# Hexbin plot of each remaining column against the target, coloured on a logarithmic scale.
target = 'days_active_first_28_days_after_registration'
# The target itself and these columns are not plotted.
hexbin_log_excluded = {target, 'registration_device_manufacturer', 'registration_device_type', 'user_id', 'registration_time_utc', 'registration_platform_specific', 'registration_country', 'registration_store', 'registration_channel_detailed'}

for column in df.columns:
    if column in hexbin_log_excluded:
        continue
    log_bins = plt.hexbin(df[column], df[target], gridsize=30, cmap='Blues', alpha=1, norm=mcolors.LogNorm())
    # Bind the colorbar to the hexbin explicitly instead of relying on pyplot's implicit "current image".
    plt.colorbar(log_bins, label='Density')
    plt.title('Log norm scale')
    plt.xlabel(column)
    plt.ylabel('Target Value')
    plt.show()

In [None]:
# Hexbin (power-norm colour scale) with a scatter overlay; noted by the author as the best plot.
target = 'days_active_first_28_days_after_registration'
# The target itself and these columns are not plotted.
hexbin_power_excluded = {target, 'registration_device_manufacturer', 'registration_device_type', 'user_id', 'registration_time_utc', 'registration_platform_specific', 'registration_country', 'registration_store', 'registration_channel_detailed'}

for column in df.columns:
    if column in hexbin_power_excluded:
        continue
    # gamma < 1 stretches smaller values
    power_bins = plt.hexbin(df[column], df[target], gridsize=30, cmap='Reds', alpha=1, norm=mcolors.PowerNorm(gamma=0.4))
    sns.scatterplot(x=column, y=target, data=df, alpha=0.7)
    # Two artists share these axes, so pass the hexbin explicitly: the colorbar
    # must describe the bin colours, not whatever pyplot considers current.
    plt.colorbar(power_bins, label='Density')
    plt.title('Power norm scale')
    plt.xlabel(column)
    plt.ylabel('Target Value')
    plt.show()

In [None]:
# Two-dimensional KDE plot of each remaining column against the target.
target = 'days_active_first_28_days_after_registration'
# The target itself and these columns are not plotted.
kde_excluded = {target, 'registration_device_manufacturer', 'registration_device_type', 'user_id', 'registration_time_utc', 'registration_platform_specific', 'registration_country', 'registration_store', 'registration_channel_detailed'}

# Set the style before any figure is created so that it also applies to the first plot.
sns.set_style('whitegrid')
for column in df.columns:
    if column in kde_excluded:
        continue
    try:
        sns.kdeplot(x=df[column], y=df[target], gridsize=30, cmap='Blues', fill=True)
    except Exception as exc:
        # Best effort: report the column that could not be plotted and move on.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt working,
        # so the loop can still be stopped by hand.
        print(f'failed here {column}: {exc!r}')
        # Discard any half-drawn figure so it does not leak into the next plot.
        plt.close()
        continue
    plt.title(f'KDE of {target} vs {column}')
    plt.xlabel(column)
    plt.ylabel('Target Value')
    plt.show()

In [2]:
# Historical data: collapse the previous-lives records to one row per user.

# How each column is combined across a user's rows (also reused for the test data).
aggregation_dict = dict(
    is_payer_lifetime='any',
    is_rewarded_video_watcher_lifetime='any',
    ratio='mean',
    days_active_lifetime='mean',
    transaction_count_iap_lifetime='sum',
)

df_hist = (
    pd.read_csv('previous_lives_training_data.csv')
    # ratio of days played and the days left in the season
    .assign(ratio=lambda lives: lives['days_active_lifetime'] / (28 - lives['registration_season_day'] % 28))
    .groupby('user_id')
    .agg(aggregation_dict)
)

In [3]:
# Columns removed from the registration data before modelling.
to_drop = ['registration_time_utc', 'registration_platform_specific', 'registration_country', 'registration_store', 'registration_channel_detailed', 'registration_device_type',
           'registration_device_manufacturer']
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a second
# run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=to_drop, errors='ignore')
# Inner join: only users that also have aggregated history rows are kept.
df_full_data = pd.merge(df, df_hist, on='user_id', how='inner')

# The 'any' aggregation yields booleans; store them as 0/1 integers.
flag_columns = ['is_rewarded_video_watcher_lifetime', 'is_payer_lifetime']
df_full_data[flag_columns] = df_full_data[flag_columns].astype(int)
df_full_data = df_full_data.fillna(-1) # average age can be NaN in some cases

In [4]:
import xgboost as xgb

label_column = 'days_active_first_28_days_after_registration'

# Model inputs: every column except the label and the user identifier,
# with three columns rescaled and two spent/(spent + stash) shares added.
features = (
    df_full_data
    .drop(columns=[label_column, 'user_id'])
    .assign(
        playtime=lambda frame: frame['playtime'] / (1000*60*60),
        money_stash=lambda frame: frame['money_stash'] / 1e9,
        # fold the season day into a 28-day cycle
        registration_season_day=lambda frame: frame['registration_season_day'] % 28,
        tokens=lambda frame: frame['tokens_spent'] / (frame['tokens_spent'] + frame['tokens_stash']),
        rests=lambda frame: frame['rests_spent'] / (frame['rests_spent'] + frame['rests_stash']),
    )
    # 0/0 in the two share columns produces NaN; replace it with 0
    .fillna(0)
)

labels = df_full_data[label_column]

# Train the gradient-boosted regressor on the full training set.
xg_regressor = xgb.XGBRegressor(learning_rate=0.005, n_estimators=1500)
xg_regressor.fit(features, labels);



In [5]:
# Build the test features with the same steps used for training, predict, and write the submission file.
df_test = pd.read_csv('registration_data_test.csv')
df_hist_test = pd.read_csv('previous_lives_test_data.csv')

df_hist_test['ratio'] = df_hist_test['days_active_lifetime'] / (28 - df_hist_test['registration_season_day']%28) # ratio of days played and the days left in the season
df_hist_test = df_hist_test.groupby('user_id').agg(aggregation_dict)

to_drop = ['registration_time_utc', 'registration_platform_specific', 'registration_country', 'registration_store', 'registration_channel_detailed', 'registration_device_type',
           'registration_device_manufacturer']
df_test = df_test.drop(columns=to_drop)
# Inner join: test users without any history row are dropped here.
df_full_data_test = pd.merge(df_test, df_hist_test, on='user_id', how='inner')

# Make a silent loss of users visible instead of discovering it in the submission.
users_without_history = int((~df_test['user_id'].isin(df_full_data_test['user_id'])).sum())
if users_without_history:
    print(f'warning: {users_without_history} test users have no history rows and get no prediction')

df_full_data_test[['is_rewarded_video_watcher_lifetime', 'is_payer_lifetime']] = df_full_data_test[['is_rewarded_video_watcher_lifetime', 'is_payer_lifetime']].astype(int)
df_full_data_test = df_full_data_test.fillna(-1)

features_test = df_full_data_test.drop(columns=['user_id'])
features_test['playtime'] /= (1000*60*60)
features_test['money_stash'] /= 1e9
features_test['registration_season_day'] %= 28 # fold the season day into a 28-day cycle, as for the training features
features_test['tokens'] = features_test['tokens_spent'] / (features_test['tokens_spent'] + features_test['tokens_stash'])
features_test['rests'] = features_test['rests_spent'] / (features_test['rests_spent'] + features_test['rests_stash'])
features_test = features_test.fillna(0)

y_pred = xg_regressor.predict(features_test)

# Round to the nearest whole day (int() alone truncates, biasing every prediction
# downward), then clamp to the valid 0..28 range.
y_pred = [max(0, min(int(round(float(pred))), 28)) for pred in y_pred]

# Take the ids from the merged frame: predictions are one per row of
# df_full_data_test, which can have fewer rows than df_test after the inner join.
user_ids = df_full_data_test['user_id']

df_submission = pd.DataFrame({
    'user_id': user_ids,
    'predicted_days_active_first_28_days_after_registration': y_pred
})

df_submission.to_csv('days_active_first_28_days_after_registration_predictions.csv', index=False)
