In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import snowflake.snowpark.functions as F 
from snowflake.ml.modeling.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import plotly.graph_objects as go
from datetime import timedelta
from snowflake.ml.registry import Registry
import xgboost as xgb

# Snowpark: fetch the session that is currently active for this notebook.
from snowflake.snowpark.context import get_active_session

session = get_active_session()

# Attach a query tag to the session (origin, name, version and attributes).
session.query_tag = dict(
    origin="sf_sit-is",
    name="Player_360",
    version=dict(major=1, minor=0),
    attributes=dict(is_quickstart=1),
)

# Churn Prediction Model

In [None]:
# Build the modelling dataset with a single SELECT that inner-joins the
# retention, demographics, user-rankings, achievements and ad-engagement
# tables on user_id, then pull the result into a pandas DataFrame.
# NOTE(review): ACHIEVEMENTS_PERCENTAGE sums 10 achievement flags but divides
# by 11.0 -- confirm whether an eleventh achievement column is missing from the
# sum or the divisor should be 10.0.
features_query = """
SELECT 
r.user_id, 
r.total_logins,
r.logged_in_after_1_day,
r.logged_in_after_7_days,
r.logged_in_after_30_days,
r.logged_in_in_last_30_days,
r.days_since_last_login,
d.age,
d.gender,
d.location,
d.average_sessions_per_active_week,
d.average_session_duration,
d.player_type,
d.total_ads,
d.avg_purchase_amount_per_ad,
d.has_support_ticket,
ur.total_points,
ur.rank_name,
( 
        CASE WHEN VICTORY_ROYALE THEN 1 ELSE 0 END +
        CASE WHEN ELIMINATION_MILESTONES THEN 1 ELSE 0 END +
        CASE WHEN SURVIVAL_ACHIEVEMENTS THEN 1 ELSE 0 END +
        CASE WHEN BUILDING_RESOURCES THEN 1 ELSE 0 END +
        CASE WHEN EXPLORATION_TRAVEL THEN 1 ELSE 0 END +
        CASE WHEN WEAPON_USAGE THEN 1 ELSE 0 END +
        CASE WHEN ASSIST_TEAMMATES THEN 1 ELSE 0 END +
        CASE WHEN EVENT_CHALLENGES THEN 1 ELSE 0 END +
        CASE WHEN CREATIVE_MODE THEN 1 ELSE 0 END +
        CASE WHEN SOCIAL_ACHIEVEMENTS THEN 1 ELSE 0 END
    ) / 11.0 AS ACHIEVEMENTS_PERCENTAGE,
ae.total_purchases,
ae.proportion_purchased,
ae.average_purchase_amount,
ae.average_ad_engagement_time,
r.churned
FROM DANIEL_PLAYER360_PROD.ANALYTIC.RETENTION r 
JOIN DANIEL_PLAYER360_PROD.ANALYTIC.DEMOGRAPHICS d ON r.user_id = d.user_id 
JOIN DANIEL_PLAYER360_PROD.ANALYTIC.USER_RANKINGS ur ON r.user_id = ur.user_id
JOIN DANIEL_PLAYER360_PROD.RAW.ACHIEVEMENTS a ON r.user_id = a.user_id
JOIN DANIEL_PLAYER360_PROD.ANALYTIC.AD_ENGAGEMENT ae ON r.user_id = ae.user_id
"""
features_df = session.sql(features_query).to_pandas()

# Preview the first 100 rows of the joined frame.
features_df.head(100)

# Exploratory Data Analysis

In [None]:
# Display pandas' summary statistics for the feature frame.
features_df.describe()

In [None]:
# Histogram (20 bins) of the AGE column across the player base.
age_axes = features_df["AGE"].hist(bins=20)
age_axes.set_title("Age Distribution")
age_axes.set_xlabel("Age")
age_axes.set_ylabel("Frequency")
plt.show()

In [None]:
# Pairwise correlations between the columns that describe() summarises,
# rendered as an annotated heatmap.
described_columns = features_df.describe().columns.tolist()
correlation_matrix = features_df[described_columns].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Bucket players into age bands and plot the churn rate (%) of each band.
#
# The bins are left-closed -- [0, 12), [12, 18), [18, 25), ... [65, inf) -- so
# every edge agrees with its label. With pd.cut's default right-closed bins a
# 12-year-old was labelled '0_11' and an 18-year-old '12_17', while age 0 and
# ages above 100 fell outside every bin and became NaN.
age_bin_edges = [0, 12, 18, 25, 35, 45, 55, 65, np.inf]
age_bin_labels = ['0_11', '12_17', '18_24', '25_34', '35_44', '45_54', '55_64', '65+']
features_df['AGE_GROUP'] = pd.cut(
    features_df['AGE'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,
)

# observed=False keeps empty age bands on the chart (and makes the groupby
# behaviour on a categorical key explicit).
churn_by_age = features_df.groupby('AGE_GROUP', observed=False)['CHURNED'].mean() * 100
churn_by_age.plot(kind='bar', title="Churn Rate by Age Group")
plt.ylabel("Churn Rate (%)")
plt.show()

In [None]:
# Churn rate (%) per LOCATION, shown as a bar chart.
location_groups = features_df.groupby('LOCATION')
churn_by_location = location_groups['CHURNED'].mean().mul(100)
churn_by_location.plot(title="Churn Rate by Location", kind='bar')
plt.ylabel("Churn Rate (%)")
plt.show()

In [None]:
# Churn rate (%) per PLAYER_TYPE, shown as a bar chart.
player_type_groups = features_df.groupby('PLAYER_TYPE')
churn_by_player_type = player_type_groups['CHURNED'].mean().mul(100)
churn_by_player_type.plot(title="Churn Rate by Player Type", kind='bar')
plt.ylabel("Churn Rate (%)")
plt.show()

In [None]:
# Pie chart of how many players fall under each GENDER value.
gender_counts = features_df['GENDER'].value_counts()
gender_counts.plot(kind='pie', title="Gender Distribution", autopct='%1.1f%%')
plt.show()

In [None]:
# Pair plot of the purchase / ad-engagement columns, coloured by CHURNED.
purchase_ad_columns = [
    'PROPORTION_PURCHASED',
    'AVERAGE_PURCHASE_AMOUNT',
    'AVERAGE_AD_ENGAGEMENT_TIME',
    'CHURNED',
]
sns.pairplot(
    features_df[purchase_ad_columns],
    hue="CHURNED",
    palette='husl',
    diag_kind='kde',
)
plt.suptitle('Pair Plot of Purchases and Ad Engagement Information by Churn', y=1.02, fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Pair plot of the per-player totals (purchases, ads, logins, points),
# coloured by CHURNED.
totals_columns = ['TOTAL_PURCHASES', 'TOTAL_ADS', 'TOTAL_LOGINS', 'TOTAL_POINTS', 'CHURNED']
sns.pairplot(features_df[totals_columns], diag_kind='kde', hue="CHURNED", palette='husl')
# The title describes the columns actually plotted here; it previously repeated
# the purchases / ad-engagement title of the preceding pair plot.
plt.suptitle('Pair Plot of Total Purchases, Ads, Logins and Points by Churn', y=1.02, fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Split the feature frame by churn label and record the size of each class.
is_churned = features_df["CHURNED"] == 1
is_not_churned = features_df["CHURNED"] == 0

churned_data = features_df[is_churned]
non_churned_data = features_df[is_not_churned]

one_class_count = churned_data.shape[0]
zero_class_count = non_churned_data.shape[0]

In [None]:
# Interactive 3D scatter of TOTAL_LOGINS vs TOTAL_POINTS vs TOTAL_PURCHASES,
# with one trace per churn class.
fig = go.Figure()

# (source frame, legend name, marker styling) -- churned players are drawn as
# red 'x' markers, non-churned players as blue markers.
trace_specs = [
    (churned_data, 'Churned', dict(size=6, color='red', symbol='x')),
    (non_churned_data, 'Non-Churned', dict(size=6, color='blue')),
]
for class_frame, class_name, marker_style in trace_specs:
    fig.add_trace(
        go.Scatter3d(
            x=class_frame["TOTAL_LOGINS"],
            y=class_frame["TOTAL_POINTS"],
            z=class_frame["TOTAL_PURCHASES"],
            mode='markers',
            marker=marker_style,
            name=class_name,
        )
    )

# Title, axis labels and legend position.
axis_titles = dict(
    xaxis_title='Total Logins',
    yaxis_title='Total Points',
    zaxis_title='Total Purchases',
)
fig.update_layout(
    title='Interactive 3D Plot of Total Logins, Total Points, and Total Purchases',
    scene=axis_titles,
    legend=dict(x=0.1, y=0.9),
)

# Leaving the figure as the last expression renders it as the cell output.
fig

# Feature Engineering

In [None]:
# Ordinal-encode RANK_NAME, PLAYER_TYPE and GENDER using an explicit category
# order per column (e.g. PLAYER_TYPE: Casual -> 0, Hardcore -> 1;
# GENDER: Female -> 0, Male -> 1). Encoded values go to new *_OE columns.
categories = {
    "RANK_NAME": ["Bronze", "Silver", "Gold", "Platinum", "Diamond", "Elite", "Champion", "Unreal"],
    "PLAYER_TYPE": ["Casual", "Hardcore"],
    "GENDER": ["Female", "Male"],
}
ordinal_input_cols = ["RANK_NAME", "PLAYER_TYPE", "GENDER"]
ordinal_output_cols = [f"{column}_OE" for column in ordinal_input_cols]

snowml_oe = OrdinalEncoder(
    input_cols=ordinal_input_cols,
    output_cols=ordinal_output_cols,
    categories=categories,
)
snowml_oe.fit(features_df)
encoded_feature_df = snowml_oe.transform(features_df)
encoded_feature_df.head(10)

In [None]:
# Replace AGE_GROUP and LOCATION with their integer category codes.
for categorical_column in ("AGE_GROUP", "LOCATION"):
    as_category = pd.Categorical(encoded_feature_df[categorical_column], ordered=False)
    encoded_feature_df[categorical_column] = as_category.codes

# Cast the support-ticket flag to an integer column.
encoded_feature_df['HAS_SUPPORT_TICKET'] = encoded_feature_df['HAS_SUPPORT_TICKET'].astype(int)

# Upper-case every column name and strip any surrounding double quotes.
encoded_feature_df.columns = [name.upper().strip('"') for name in encoded_feature_df.columns]
encoded_feature_df.columns

In [None]:
# Keep the identifier, the model inputs and the CHURNED label, in this order,
# then compute their pairwise correlations rounded to two decimals.
model_columns = [
    "USER_ID",
    "AGE_GROUP",
    "LOCATION",
    "AVERAGE_SESSIONS_PER_ACTIVE_WEEK",
    "AVERAGE_SESSION_DURATION",
    "HAS_SUPPORT_TICKET",
    "ACHIEVEMENTS_PERCENTAGE",
    "PROPORTION_PURCHASED",
    "AVERAGE_PURCHASE_AMOUNT",
    "AVERAGE_AD_ENGAGEMENT_TIME",
    "RANK_NAME_OE",
    "PLAYER_TYPE_OE",
    "GENDER_OE",
    "CHURNED",
]
final_features_df = encoded_feature_df[model_columns]

described_feature_columns = final_features_df.describe().columns.tolist()
final_features_corr_df = final_features_df[described_feature_columns].corr().round(2)
final_features_corr_df

In [None]:
# Boolean mask that hides the upper triangle (diagonal included) so each
# pairwise correlation is drawn only once.
mask = np.triu(np.ones(final_features_corr_df.shape, dtype=bool))

# Annotated heatmap of the feature correlations on a fixed [-1, 1] colour scale.
plt.figure(figsize=(18, 14))
heatmap = sns.heatmap(
    final_features_corr_df,
    mask=mask,
    annot=True,
    cmap="YlGnBu",
    vmin=-1,
    vmax=1,
)

# Model Training

In [None]:
# Write the engineered features to DANIEL_PLAYER360_PROD.APP.CHURN_FEATURES
# (auto-creating the table and overwriting existing contents), then read the
# table back into pandas under the same variable name.
churn_features_table = session.write_pandas(
    df=final_features_df,
    table_name="CHURN_FEATURES",
    database="DANIEL_PLAYER360_PROD",
    schema="APP",
    quote_identifiers=False,
    auto_create_table=True,
    overwrite=True,
)
final_features_df = churn_features_table.to_pandas()

In [None]:
# Separate the label from the model inputs and hold out 20% of rows for testing.
# USER_ID is an identifier and CHURNED is the label, so neither is a feature.
# (The original cell referenced an undefined `final_features` name here, which
# raised NameError on a fresh run.)
y = final_features_df['CHURNED']
X = final_features_df.drop(columns=['USER_ID', 'CHURNED'])

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit an XGBoost classifier, constructed with no explicit hyperparameters,
# on the training split; the fitted estimator is displayed as the cell output.
model = xgb.XGBClassifier().fit(X_train, y_train)
model

In [None]:
# Score the hold-out split; column 1 of predict_proba's output is compared
# against a 0.4 cut-off to produce the 1/0 label for each row.
predictions = model.predict_proba(X_test)
y_pred = [int(class_probabilities[1] >= .4) for class_probabilities in predictions]

# Confusion matrix of true vs predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Text classification report from scikit-learn for the same predictions.
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

# Model Registry

In [None]:
-- Make APP the session's current schema for the cells that follow.
USE SCHEMA APP;

In [None]:
# Register the trained classifier in the Snowflake Model Registry.
# `Registry` is imported once in the notebook's import cell, so it is not
# re-imported here.
reg = Registry(session=session)

MODEL_NAME = "Player360_Churn_classifier"
MODEL_VERSION = "v1"

# Log the fitted model under MODEL_NAME / MODEL_VERSION, passing the hold-out
# features as sample input data.
mv = reg.log_model(
    model,
    model_name=MODEL_NAME,
    version_name=MODEL_VERSION,
    sample_input_data=X_test,
    options={
        'relax_version': False,
        "case_sensitive": True,
    },
)

# List the models known to the registry as the cell output.
reg.show_models()

In [None]:
# Look the registered model up by name and select the requested version.
model_name = "PLAYER360_CHURN_CLASSIFIER"
model_version = "v1"

reg = Registry(session=session)
registered_model = reg.get_model(model_name)
mv = registered_model.version(model_version)

In [None]:
# Rebind `mv` from the registry model-version handle to the object returned by load().
# NOTE(review): because `mv` is overwritten, re-running this cell without first
# re-running the previous one calls load() on the already-loaded object -- confirm
# this is acceptable or use a separate name.
mv = mv.load()

In [None]:
# Explain the loaded model's predictions on the hold-out features with SHAP.
# Build a tree explainer around the object currently bound to `mv`.
explainer = shap.TreeExplainer(mv)

# Compute SHAP values for every row of X_test.
shap_values = explainer(X_test)

# Draw the SHAP summary plot for those values against the X_test features.
shap.summary_plot(shap_values, X_test)