In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

## Prepare the data

In [2]:
players_df = pd.read_csv("data/players.csv")
sessions_df = pd.read_csv("data/sessions.csv")
tournaments_df = pd.read_csv("data/tournaments.csv")
transactions_df = pd.read_csv("data/transactions.csv")

# Convert date columns to datetime
players_df['signup_date'] = pd.to_datetime(players_df['signup_date'])

sessions_df['start_ts'] = pd.to_datetime(sessions_df['start_ts'])
sessions_df['end_ts'] = pd.to_datetime(sessions_df['end_ts'])

tournaments_df['start_ts'] = pd.to_datetime(tournaments_df['start_ts'])

transactions_df['txn_ts'] = pd.to_datetime(transactions_df['txn_ts'])

In [3]:
deposits_mask = transactions_df['txn_type'] == "deposit"
deposits_df = transactions_df[deposits_mask][['player_id', 'txn_ts', 'amount']].copy()
deposits_df.rename(columns={'txn_ts': 'ts', 'amount': 'deposit'}, inplace=True)

##  Step 0: Baseline model

### predict the avrage 30 days deposites per player

In [4]:
# Step 1: Calculate the average deposit per player over the last 30 days (baseline)
# Sort the deposits by player_id and timestamp
deposits_df = deposits_df.sort_values(by=['player_id', 'ts'])

# Create a rolling window feature: average deposit over the last 30 days per player
deposits_df['30_day_avg_deposit'] = deposits_df.groupby('player_id')['deposit'].rolling(window=30, min_periods=1).mean().reset_index(0, drop=True)

# Step 2: Use the 30-day rolling average as the target variable for our baseline model
baseline_df = deposits_df[['player_id', 'ts', '30_day_avg_deposit']].dropna()

# Step 3: Aggregate the data by player to create a single row per player with the baseline target
baseline_df = baseline_df.groupby('player_id').agg({'30_day_avg_deposit': 'mean'}).reset_index()

# Split into train and test sets
X = baseline_df[['player_id']]  # In a real model, we might add more features here, but for the baseline, we'll use player_id
y = baseline_df['30_day_avg_deposit']

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Use a simple model - we will use the mean of the training data as our prediction
# For baseline, the prediction for all players is simply the mean of the training set
baseline_prediction = y_train.mean()

# Step 6: Evaluate the baseline model using mean squared error
y_pred = [baseline_prediction] * len(y_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Baseline model MSE: {mse}")

Baseline model MSE: 597.3587036513112


In [29]:
deposits_df.sort_values(by=['player_id', 'ts'])

Unnamed: 0,player_id,ts,deposit,30_day_avg_deposit
2,1,2024-12-21 02:28:26,90.73,90.730000
7,2,2025-03-16 01:39:12,20.36,20.360000
10,3,2025-04-27 13:11:03,85.79,85.790000
9,3,2025-04-28 14:45:03,86.54,86.165000
14,4,2025-02-15 19:29:37,56.15,56.150000
...,...,...,...,...
49880,19997,2025-03-25 07:41:12,40.22,56.490000
49879,19997,2025-03-26 11:44:12,26.57,46.516667
49881,19998,2025-04-12 14:48:08,19.12,19.120000
49885,20000,2025-01-18 20:25:07,44.41,44.410000


## Model 1: Time series model

### Feature Engineering

### Create Fact table per user

In [20]:
# 1. Aggregate transaction data (total deposits per player)
player_deposits = transactions_df.groupby('player_id').agg({'amount': 'sum'}).reset_index()
player_deposits.rename(columns={'amount': 'total_deposits'}, inplace=True)

# 2. Player-level features (country, acquisition_channel, signup_date)
players_df['signup_date'] = pd.to_datetime(players_df['signup_date'])
players_df['days_since_signup'] = (pd.to_datetime('today') - players_df['signup_date']).dt.days
player_features = players_df[['player_id', 'country', 'acquisition_channel', 'days_since_signup']]

# 3. Session-related features
# Calculate average session length per player (in seconds)
sessions_df['session_length'] = (pd.to_datetime(sessions_df['end_ts']) - pd.to_datetime(sessions_df['start_ts'])).dt.total_seconds()
avg_session_length = sessions_df.groupby('player_id').agg({'session_length': 'mean'}).reset_index()
avg_session_length.rename(columns={'session_length': 'avg_session_length'}, inplace=True)

# Calculate most active device per player
most_active_device = sessions_df.groupby(['player_id', 'device']).size().reset_index(name='device_count')
most_active_device = most_active_device.loc[most_active_device.groupby('player_id')['device_count'].idxmax()][['player_id', 'device']]
most_active_device.rename(columns={'device': 'most_active_device'}, inplace=True)

# 4. Merge all features into a single fact table
fact_table = player_features.merge(player_deposits, on='player_id', how='left') \
                            .merge(avg_session_length, on='player_id', how='left') \
                            .merge(most_active_device, on='player_id', how='left')

In [25]:
# Players that only deposited and weren't active
fact_table['visitor'] = fact_table['most_active_device'].isna().replace({True: 'yes', False: 'no'})

In [26]:
fact_table

Unnamed: 0,player_id,country,acquisition_channel,days_since_signup,total_deposits,avg_session_length,most_active_device,visitor
0,1,CA,Facebook,195,160.32,2550.000000,Android,no
1,2,ZA,Organic,133,82.34,1573.333333,iOS,no
2,3,CA,Google,143,180.33,2208.000000,Android,no
3,4,GB,Referral,163,61.15,735.000000,iOS,no
4,5,FR,Facebook,164,10.00,,,yes
...,...,...,...,...,...,...,...,...
19995,19996,FR,Facebook,175,115.39,2220.000000,iOS,no
19996,19997,FR,Organic,168,141.55,2280.000000,Android,no
19997,19998,DE,Organic,189,19.12,2208.000000,Android,no
19998,19999,DE,Apple Search Ads,170,5.00,,,yes


In [34]:
deposits_df = deposits_df.sort_values(by=['player_id', 'ts'])
deposits_df

Unnamed: 0,player_id,ts,deposit,30_day_avg_deposit
2,1,2024-12-21 02:28:26,90.73,90.730000
7,2,2025-03-16 01:39:12,20.36,20.360000
10,3,2025-04-27 13:11:03,85.79,85.790000
9,3,2025-04-28 14:45:03,86.54,86.165000
14,4,2025-02-15 19:29:37,56.15,56.150000
...,...,...,...,...
49880,19997,2025-03-25 07:41:12,40.22,56.490000
49879,19997,2025-03-26 11:44:12,26.57,46.516667
49881,19998,2025-04-12 14:48:08,19.12,19.120000
49885,20000,2025-01-18 20:25:07,44.41,44.410000


In [35]:
# Ensure deposits_df is sorted by player_id and timestamp
deposits_df = deposits_df.sort_values(by=['player_id', 'ts'])

# Create a rolling window for 30-day deposits per player
deposits_df['30_day_deposit'] = deposits_df.groupby('player_id')['deposit'].rolling(window='30D', on='ts').sum().reset_index(0, drop=True)

ValueError: invalid on specified as ts, must be a column (of DataFrame), an Index or None

In [6]:
transactions_df.groupby('player_id').agg({'amount': 'sum'}).reset_index()

Unnamed: 0,player_id,amount
0,1,160.32
1,2,82.34
2,3,180.33
3,4,61.15
4,5,10.00
...,...,...
16633,19996,115.39
16634,19997,141.55
16635,19998,19.12
16636,19999,5.00


In [27]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Feature Engineering

# 1. Aggregate transaction data (total deposits in the last 30 days per player)
# We don't use `ts`, but we can still aggregate transactions by player
transactions_df['txn_date'] = pd.to_datetime(transactions_df['txn_ts']).dt.date
transactions_df['txn_month'] = pd.to_datetime(transactions_df['txn_ts']).dt.month
player_deposits = transactions_df.groupby('player_id').agg({'amount': 'sum'}).reset_index()
player_deposits.rename(columns={'amount': 'total_deposits_30d'}, inplace=True)

# 2. Session data (total sessions per player)
session_features = sessions_df.groupby('player_id').agg({'session_id': 'count'}).reset_index()
session_features.rename(columns={'session_id': 'session_count'}, inplace=True)

# 3. Player-level features (country, acquisition_channel, signup_date)
# Convert signup_date to days since signup for the model
players_df['signup_date'] = pd.to_datetime(players_df['signup_date'])
players_df['days_since_signup'] = (pd.to_datetime('today') - players_df['signup_date']).dt.days

# Merge all features together
final_features = players_df[['player_id', 'country', 'acquisition_channel', 'days_since_signup']].merge(
    player_deposits, on='player_id', how='left'
).merge(
    session_features, on='player_id', how='left'
)

# Handle categorical variables (e.g., 'country', 'acquisition_channel') using one-hot encoding
final_features = pd.get_dummies(final_features, columns=['country', 'acquisition_channel'], drop_first=True)

# Define X (features) and y (target variable)
# Assuming you want to predict 'total_deposits_30d' (the expected deposit amount)
X = final_features.drop(columns=['player_id', 'total_deposits_30d'])
y = final_features['total_deposits_30d']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Random Forest model MSE: {mse}")


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [28]:
X_train

Unnamed: 0,days_since_signup,session_count,country_BR,country_CA,country_DE,country_FR,country_GB,country_IL,country_IN,country_US,country_ZA,acquisition_channel_Facebook,acquisition_channel_Google,acquisition_channel_Organic,acquisition_channel_Referral
5894,194,9.0,0,1,0,0,0,0,0,0,0,1,0,0,0
3728,174,7.0,0,1,0,0,0,0,0,0,0,1,0,0,0
8958,138,10.0,0,0,1,0,0,0,0,0,0,0,0,0,1
7671,177,2.0,1,0,0,0,0,0,0,0,0,0,0,1,0
5999,135,5.0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,128,5.0,0,0,0,1,0,0,0,0,0,0,0,0,1
11964,166,6.0,0,0,0,0,0,1,0,0,0,1,0,0,0
5390,118,10.0,0,1,0,0,0,0,0,0,0,0,1,0,0
860,119,6.0,0,1,0,0,0,0,0,0,0,1,0,0,0
