In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.cluster import KMeans
import plotly.offline as pyoff
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Using original features
## Training set modeling

In [28]:
# Load the pre-split train / validation / test subscription tables.
# NOTE(review): these are absolute, machine-specific paths — consider a
# configurable data directory so the notebook runs on other machines.
sub_train, sub_val, sub_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/vivianyan/Desktop/NYU_Capstone_Project/Data/sub_train.csv',
        '/Users/vivianyan/Desktop/NYU_Capstone_Project/Data/sub_val.csv',
        '/Users/vivianyan/Desktop/NYU_Capstone_Project/Data/sub_test.csv',
    )
)

### Convert categorical features into numerical

In [29]:
# One-hot encode BRAND and CURRENT_TERM as integer 0/1 columns, then drop the
# saved index column, the raw subscription dates, and the weekly-term dummy.
# NOTE(review): the reason for dropping CURRENT_TERM_weekly is not stated in
# the notebook — presumably to keep a reference level; confirm.
sub_train = (
    pd.get_dummies(sub_train, columns=['BRAND', 'CURRENT_TERM'], dtype='int')
    .drop(columns=['Unnamed: 0', 'SUB_START', 'SUB_END', 'CURRENT_TERM_weekly'])
)
sub_train.head()

Unnamed: 0,Capstone_id,RESUBSCRIBER,recency,frequency,monetary_1,recency_cluster,frequency_cluster,monetary_cluster,segment,monetary_2,multi_sub,churned,LTV_cluster,BRAND_CHILLSTREAM,BRAND_CINEQUEST,BRAND_LIMELIGHT,BRAND_PULSE,BRAND_RETROREEL,CURRENT_TERM_annual,CURRENT_TERM_monthly
0,TIA3987582,0,150,1,45.43,2,2,1,3,37.56,1,0,1,0,0,1,0,0,1,0
1,PNY8037927,0,60,1,19.98,0,2,0,1,0.0,0,1,0,0,0,0,0,1,0,1
2,SRA6677487,0,7,1,7.99,0,2,0,1,0.0,0,1,0,0,0,0,1,0,0,1
3,QIO2081907,0,16,1,9.99,0,2,0,1,0.0,0,1,0,0,0,0,0,1,0,1
4,UYU7131656,0,90,1,36.58,1,2,1,2,59.34,0,0,2,0,0,0,1,0,0,1


### Correlation matrix across features

In [30]:
# Pairwise correlation over the numeric columns only (the string id column is
# skipped), then rank every feature by its correlation with the target.
corr_matrix = sub_train.corr(numeric_only=True)
ltv_corr = corr_matrix['LTV_cluster']
ltv_corr.sort_values(ascending=False)

LTV_cluster             1.000000
monetary_2              0.945474
recency                 0.537319
recency_cluster         0.535151
segment                 0.291884
monetary_cluster        0.199706
monetary_1              0.108973
RESUBSCRIBER            0.106478
frequency               0.099369
BRAND_LIMELIGHT         0.089391
BRAND_RETROREEL         0.062957
multi_sub               0.042789
CURRENT_TERM_annual     0.001236
CURRENT_TERM_monthly   -0.001292
BRAND_PULSE            -0.023149
BRAND_CINEQUEST        -0.066818
BRAND_CHILLSTREAM      -0.093127
frequency_cluster      -0.103375
churned                -0.788162
Name: LTV_cluster, dtype: float64

### XGBoost with 5-fold cross-validation

In [31]:
# Features: everything except the row identifier, the target itself, and
# `monetary_2`, which correlates ~0.95 with the target in the correlation
# output (presumably excluded to avoid target leakage — confirm).
X_train = sub_train.drop(['Capstone_id','monetary_2', 'LTV_cluster'], axis=1)
y_train = sub_train['LTV_cluster']

# 2 x 3 x 2 = 12 hyper-parameter candidates.
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300]
}

# LTV_cluster has three classes (0, 1, 2), so this is a multiclass problem.
# The previous 'binary:logistic' setting only worked because the scikit-learn
# wrapper silently switches to a multiclass objective when it sees more than
# two classes; state the objective that is actually used.
ltv_xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    n_jobs=-1,
    min_child_weight=1,
    random_state=42
)

# Exhaustive search over the grid, selecting on mean accuracy across 5 folds.
grid_search = GridSearchCV(
    estimator=ltv_xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,      # 5-fold cross-validation
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}
Best cross-validation score:  0.9755989352262645


## Evaluate best model on the test set

In [32]:
# Encode the validation and test splits the same way as the training split.
# The training frame additionally dropped CURRENT_TERM_weekly, and a split may
# lack a dummy column if a category never occurs in it, so the feature
# matrices are re-indexed to the training columns: extra columns are
# discarded, missing dummy columns are added as all-zero, and the column
# order matches what the model was fitted on.
sub_val = pd.get_dummies(sub_val, columns=['BRAND', 'CURRENT_TERM'], dtype='int')
sub_val = sub_val.drop(columns=['Unnamed: 0', 'SUB_START', 'SUB_END'])
X_val = (
    sub_val
    .drop(['Capstone_id','monetary_2', 'LTV_cluster'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)
y_val = sub_val['LTV_cluster']

sub_test = pd.get_dummies(sub_test, columns=['BRAND', 'CURRENT_TERM'], dtype='int')
sub_test = sub_test.drop(columns=['Unnamed: 0', 'SUB_START', 'SUB_END'])
X_test = (
    sub_test
    .drop(['Capstone_id','monetary_2', 'LTV_cluster'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)
y_test = sub_test['LTV_cluster']

### Results

In [35]:
# Estimator refitted on the full training set with the best grid parameters.
best_model = grid_search.best_estimator_

# Select the training columns (in training order) once and use the same
# matrix for both scoring and prediction. Previously only `score` was given
# the aligned frame while `predict` received the raw X_test, which fails or
# mis-scores if the test frame carries extra or re-ordered columns.
X_test_aligned = X_test[X_train.columns]

train_acc = best_model.score(X_train, y_train)
test_acc = best_model.score(X_test_aligned, y_test)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_acc))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(test_acc))

# Per-class precision / recall / F1 on the held-out test split.
y_pred = best_model.predict(X_test_aligned)
print(classification_report(y_test, y_pred))

Accuracy of XGB classifier on training set: 0.98
Accuracy of XGB classifier on test set: 0.97
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8287
           1       0.96      0.94      0.95      3311
           2       0.93      0.97      0.95      2893

    accuracy                           0.97     14491
   macro avg       0.96      0.97      0.97     14491
weighted avg       0.97      0.97      0.97     14491



# Using partial features
## Training set modeling

In [36]:
# Reload the raw splits so the partial-feature experiment starts from
# unmodified data frames.
# NOTE(review): these are absolute, machine-specific paths — consider a
# configurable data directory so the notebook runs on other machines.
sub_train, sub_val, sub_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/vivianyan/Desktop/NYU_Capstone_Project/Data/sub_train.csv',
        '/Users/vivianyan/Desktop/NYU_Capstone_Project/Data/sub_val.csv',
        '/Users/vivianyan/Desktop/NYU_Capstone_Project/Data/sub_test.csv',
    )
)

### Convert categorical features into numerical

In [37]:
# Show the raw column names before any encoding or feature removal.
sub_train.columns

Index(['Unnamed: 0', 'Capstone_id', 'SUB_START', 'SUB_END', 'BRAND',
       'CURRENT_TERM', 'RESUBSCRIBER', 'recency', 'frequency', 'monetary_1',
       'recency_cluster', 'frequency_cluster', 'monetary_cluster', 'segment',
       'monetary_2', 'multi_sub', 'churned', 'LTV_cluster'],
      dtype='object')

In [38]:
# One-hot encode BRAND and CURRENT_TERM as integer 0/1 columns. On top of the
# bookkeeping columns, this variant also removes the raw recency / frequency /
# monetary_1 values so only their *_cluster counterparts remain as features.
partial_feature_drops = [
    'Unnamed: 0', 'SUB_START', 'SUB_END', 'CURRENT_TERM_weekly',
    'recency', 'frequency', 'monetary_1',
]
sub_train = (
    pd.get_dummies(sub_train, columns=['BRAND', 'CURRENT_TERM'], dtype='int')
    .drop(columns=partial_feature_drops)
)
sub_train.head()

Unnamed: 0,Capstone_id,RESUBSCRIBER,recency_cluster,frequency_cluster,monetary_cluster,segment,monetary_2,multi_sub,churned,LTV_cluster,BRAND_CHILLSTREAM,BRAND_CINEQUEST,BRAND_LIMELIGHT,BRAND_PULSE,BRAND_RETROREEL,CURRENT_TERM_annual,CURRENT_TERM_monthly
0,TIA3987582,0,2,2,1,3,37.56,1,0,1,0,0,1,0,0,1,0
1,PNY8037927,0,0,2,0,1,0.0,0,1,0,0,0,0,0,1,0,1
2,SRA6677487,0,0,2,0,1,0.0,0,1,0,0,0,0,1,0,0,1
3,QIO2081907,0,0,2,0,1,0.0,0,1,0,0,0,0,0,1,0,1
4,UYU7131656,0,1,2,1,2,59.34,0,0,2,0,0,0,1,0,0,1


### Correlation matrix across features

In [39]:
# Pairwise correlation over the numeric columns of the reduced feature set,
# then rank every remaining feature by its correlation with the target.
corr_matrix = sub_train.corr(numeric_only=True)
ltv_corr = corr_matrix['LTV_cluster']
ltv_corr.sort_values(ascending=False)

LTV_cluster             1.000000
monetary_2              0.945474
recency_cluster         0.535151
segment                 0.291884
monetary_cluster        0.199706
RESUBSCRIBER            0.106478
BRAND_LIMELIGHT         0.089391
BRAND_RETROREEL         0.062957
multi_sub               0.042789
CURRENT_TERM_annual     0.001236
CURRENT_TERM_monthly   -0.001292
BRAND_PULSE            -0.023149
BRAND_CINEQUEST        -0.066818
BRAND_CHILLSTREAM      -0.093127
frequency_cluster      -0.103375
churned                -0.788162
Name: LTV_cluster, dtype: float64

### XGBoost with 5-fold cross-validation

In [40]:
# Features: everything except the row identifier, the target itself, and
# `monetary_2`, which correlates ~0.95 with the target in the correlation
# output (presumably excluded to avoid target leakage — confirm).
X_train = sub_train.drop(['Capstone_id','monetary_2', 'LTV_cluster'], axis=1)
y_train = sub_train['LTV_cluster']

# 2 x 3 x 2 = 12 hyper-parameter candidates.
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300]
}

# LTV_cluster has three classes (0, 1, 2), so this is a multiclass problem.
# The previous 'binary:logistic' setting only worked because the scikit-learn
# wrapper silently switches to a multiclass objective when it sees more than
# two classes; state the objective that is actually used.
ltv_xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    n_jobs=-1,
    min_child_weight=1,
    random_state=42
)

# Exhaustive search over the grid, selecting on mean accuracy across 5 folds.
grid_search = GridSearchCV(
    estimator=ltv_xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,      # 5-fold cross-validation
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Best cross-validation score:  0.8890121265897664


## Evaluate best model on the test set

In [44]:
# Encode the validation and test splits the same way as the training split
# and remove the raw recency / frequency / monetary_1 columns.
# The training frame additionally dropped CURRENT_TERM_weekly, and a split may
# lack a dummy column if a category never occurs in it, so the feature
# matrices are re-indexed to the training columns: extra columns are
# discarded, missing dummy columns are added as all-zero, and the column
# order matches what the model was fitted on.
sub_val = pd.get_dummies(sub_val, columns=['BRAND', 'CURRENT_TERM'], dtype='int')
sub_val = sub_val.drop(
    columns=['Unnamed: 0', 'SUB_START', 'SUB_END',
    'recency', 'frequency', 'monetary_1']
)
X_val = (
    sub_val
    .drop(['Capstone_id','monetary_2', 'LTV_cluster'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)
y_val = sub_val['LTV_cluster']

sub_test = pd.get_dummies(sub_test, columns=['BRAND', 'CURRENT_TERM'], dtype='int')
sub_test = sub_test.drop(
    columns=['Unnamed: 0', 'SUB_START', 'SUB_END',
    'recency', 'frequency', 'monetary_1']
)
X_test = (
    sub_test
    .drop(['Capstone_id','monetary_2', 'LTV_cluster'], axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)
y_test = sub_test['LTV_cluster']

### Results

In [45]:
# Estimator refitted on the full training set with the best grid parameters.
best_model = grid_search.best_estimator_

# Select the training columns (in training order) once and use the same
# matrix for both scoring and prediction. Previously only `score` was given
# the aligned frame while `predict` received the raw X_test, which fails or
# mis-scores if the test frame carries extra or re-ordered columns.
X_test_aligned = X_test[X_train.columns]

train_acc = best_model.score(X_train, y_train)
test_acc = best_model.score(X_test_aligned, y_test)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_acc))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(test_acc))

# Per-class precision / recall / F1 on the held-out test split.
y_pred = best_model.predict(X_test_aligned)
print(classification_report(y_test, y_pred))

Accuracy of XGB classifier on training set: 0.89
Accuracy of XGB classifier on test set: 0.89
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      8287
           1       0.87      0.80      0.83      3311
           2       0.71      0.94      0.81      2893

    accuracy                           0.89     14491
   macro avg       0.86      0.88      0.86     14491
weighted avg       0.91      0.89      0.89     14491

