In [0]:
import data_model as dm
import importlib

import mlflow.statsmodels

import pickle
from datetime import datetime, timedelta
import numpy as np

import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import mlflow

from pyspark.sql.window import Window

import datetime as dt
from dateutil.relativedelta import relativedelta

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import pyspark.sql.functions as F

import plotly.graph_objs as go
import plotly.io as pio

In [0]:
# Predicate selecting the rows whose isSelfCureMonth flag equals 0.
_not_self_cure_month = F.col("isSelfCureMonth") == 0

# Build the design matrix through the project's data_model module and keep
# only the rows matching the predicate above.
design_matrix = dm.main(spark).where(_not_self_cure_month)

# Collect the filtered frame into pandas for the scikit-learn steps below.
design_matrix_pd = design_matrix.toPandas()

In [0]:
def plot(predicted, title, Y_test, plt, axis_lim=None):
  """Draw a 2x2 diagnostic figure for a binary classifier's predicted scores.

  The four panels are: ROC curve, precision-recall curve, a per-class kernel
  density estimate of the scores, and a per-class raw score histogram.

  Args:
    predicted: Scores for the positive class, one per entry of ``Y_test``.
    title: Text used as the figure's super-title.
    Y_test: True labels. They are cast with ``.astype(int)``, so the object
      must provide ``astype`` (for example a pandas Series or NumPy array).
    plt: The ``matplotlib.pyplot`` module; its ``subplots``, ``suptitle`` and
      ``tight_layout`` functions are used.
    axis_lim: Optional y-axis limits for the histogram panel. Applied only
      when truthy.

  Returns:
    The matplotlib figure holding the four panels.
  """
  Y_test = Y_test.astype(int)
  fig, axes = plt.subplots(2, 2, figsize=(20, 12))
  axes = axes.ravel()

  RocCurveDisplay.from_predictions(Y_test, predicted, ax=axes[0])
  axes[0].set_title("ROC curve")

  # Precision-recall curve, labelled with the average precision rounded to
  # two decimals.
  precision, recall, _ = precision_recall_curve(Y_test, predicted)
  ap_score = round(average_precision_score(y_true=Y_test, y_score=predicted), 2)
  axes[1].plot(recall, precision, color='purple', label=f"Classifier (AP={ap_score})")
  axes[1].set_title('Precision-Recall Curve')
  axes[1].set_ylabel('Precision')
  axes[1].set_xlabel('Recall')
  axes[1].legend()

  # One row per observation: true class and predicted score, both as floats.
  data = pd.DataFrame({
    "class": np.asarray(Y_test, dtype=float),
    "prob": np.asarray(predicted, dtype=float),
  })
  for label, scores in data.groupby('class'):
    # A kernel density estimate needs at least two distinct values; skip it
    # for a degenerate class instead of failing the whole figure.
    if scores["prob"].nunique() > 1:
      scores["prob"].plot.kde(label=label, ax=axes[2])
    scores["prob"].hist(alpha=0.4, label=label, ax=axes[3], bins=25)
  axes[2].legend()
  axes[3].legend()
  axes[2].set_title("Probability Density Function Estimation")
  axes[3].set_title("Raw Probability Histogram")
  if axis_lim:
    axes[3].set_ylim(axis_lim)

  plt.suptitle(title, fontsize=20)
  plt.tight_layout()
  return fig


def plot_pr_curve(precision, recall, thresholds, ap_score):
    """Build an interactive Plotly precision-recall curve.

    The last element of ``precision`` and ``recall`` is dropped before
    plotting, and ``thresholds`` is attached as hover data. This matches the
    arrays that the calling cell obtains from ``precision_recall_curve``.

    Args:
        precision: Sequence of precision values.
        recall: Sequence of recall values.
        thresholds: Sequence of decision thresholds shown on hover.
        ap_score: Average precision, rendered in an annotation at (1, 1).

    Returns:
        A ``go.Figure`` containing a single lines+markers trace.
    """
    hover_text = (
        '<b>Recall:</b> %{x}<br>'
        '<b>Precision:</b> %{y}<br>'
        '<b>threshold:</b> %{customdata}<extra></extra>'
    )

    # Curve itself: recall on x, precision on y, threshold carried as
    # customdata so the hover template can display it.
    curve = go.Scatter(
        x=recall[:-1],
        y=precision[:-1],
        mode='lines+markers',
        hovertemplate=hover_text,
        customdata=thresholds,
    )

    # Text note with the average precision, pinned to data coordinates (1, 1).
    ap_note = {
        "xref": 'x',
        "yref": 'y',
        "x": 1,
        "y": 1,
        "text": f"average precision: {ap_score}",
        "showarrow": False,
    }

    figure_layout = go.Layout(
        title='Precision-Recall Curve',
        xaxis={"title": 'recall'},
        yaxis={"title": 'precision'},
        width=800,
        height=600,
        annotations=[ap_note],
    )

    return go.Figure(data=[curve], layout=figure_layout)

In [0]:
# Window boundaries as "YYYY-MM-DD" strings. Each literal is round-tripped
# through date.fromisoformat, so a malformed date fails here rather than
# inside a later comparison.
out_of_time_start, in_time_start, in_time_stop, out_of_time_stop = (
    dt.date.fromisoformat(iso_day).strftime("%Y-%m-%d")
    for iso_day in ("2022-09-01", "2023-01-01", "2023-11-01", "2024-03-01")
)

# Columns that are one-hot encoded with pd.get_dummies when the splits are built.
categorical = ["brand", "creditScoreBucket", "creditFileBucket", "employmentStatus"]

In [0]:
# Row filters over the collected design matrix. Boolean indexing is used so
# that rows are actually removed; DataFrame.where would only blank them to NaN
# and upcast column dtypes along the way.
_month_end = design_matrix_pd["lastDayOfMonth"]
_in_window = _month_end.between(out_of_time_start, out_of_time_stop)
_eligible = design_matrix_pd["isSelfCureMonth"] == 0

# Every eligible row inside the overall modelling window.
_full_dm = design_matrix_pd.loc[_in_window & _eligible]

# In-time rows train the evaluation model; everything else inside the window
# is out-of-time. Defining out-of-time as the complement keeps a row that sits
# exactly on in_time_start / in_time_stop from landing in both sets.
_is_in_time = _full_dm["lastDayOfMonth"].between(in_time_start, in_time_stop)
_train_dm = _full_dm.loc[_is_in_time]
_oot_dm = _full_dm.loc[~_is_in_time]

# One-hot encode ONCE and slice afterwards. Encoding each split separately
# (with drop_first=True) can yield different dummy columns, or a different
# dropped reference level, whenever a category is missing from one split,
# which would break or silently mis-score predictions on the other split.
full_dm = pd.get_dummies(_full_dm, columns=categorical, drop_first=True, dummy_na=True)
train_dm = full_dm.loc[_is_in_time]
oot_dm = full_dm.loc[~_is_in_time]

In [0]:
# One seed for the train/test split and both forests, so a rerun reproduces
# the same models and metrics.
SEED = 2024

# Label, bookkeeping and identifier columns that must not be used as features.
to_drop = ["isSelfCureMonth", "isSelfCureNextMonth", "lastDayOfMonth", "creditAccountId"]

# Missing next-month labels are treated as 0.
X = train_dm.drop(to_drop, axis=1)
Y = train_dm["isSelfCureNextMonth"].fillna(0)

oot_X = oot_dm.drop(to_drop, axis=1)
oot_Y = oot_dm["isSelfCureNextMonth"].fillna(0)

full_X = full_dm.drop(to_drop, axis=1)
full_Y = full_dm["isSelfCureNextMonth"].fillna(0)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=SEED
)

# Inclusive (low, high) ranges of delinquencyDaysPastDue evaluated separately.
dpds = [(0, 180), (0, 90), (90, 180)]

with mlflow.start_run() as run:
    evaluations_model = RandomForestClassifier(random_state=SEED).fit(X_train, Y_train.astype(int))

    # Held-out features with the label re-attached, so each DPD slice keeps
    # features and labels aligned.
    test_dm = X_test.copy(deep=True)
    test_dm["isSelfCureNextMonth"] = Y_test

    # Collect (title, interactive artifact name, features, labels) for every
    # evaluation. The slices get their own names: X_test / Y_test keep the
    # full held-out split instead of being overwritten by the last slice.
    evaluation_cases = []
    for dpd in dpds:
        test_data = test_dm.loc[test_dm["delinquencyDaysPastDue"].between(*dpd)]
        if test_data.empty:
            # predict_proba cannot score zero rows; skip rather than abort the run.
            print(f"No held-out rows with delinquencyDaysPastDue in {dpd}; skipping")
            continue
        evaluation_cases.append((
            f"In Time Train Test Split {dpd}",
            f"{dpd}_precision_recall_interactive.html",
            test_data.drop("isSelfCureNextMonth", axis=1),
            test_data["isSelfCureNextMonth"],
        ))
    evaluation_cases.append(
        ("Out Of Time Test", "oot_precision_recall_interactive.html", oot_X, oot_Y)
    )

    # Score each case and log the static and the interactive figure.
    for title, interactive_name, eval_X, eval_Y in evaluation_cases:
        predicted = evaluations_model.predict_proba(eval_X)[:, 1]
        precision, recall, thresholds = precision_recall_curve(eval_Y, predicted)
        ap_score = round(average_precision_score(eval_Y, predicted), 2)
        fig1 = plot(predicted, title, eval_Y, plt, axis_lim=None)
        fig2 = plot_pr_curve(precision, recall, thresholds, ap_score)

        mlflow.log_figure(fig1, f'{title}.png')
        mlflow.log_figure(fig2, interactive_name)

    # The evaluation model is no longer needed once the figures are logged.
    del evaluations_model

    # Final model: refit on the whole window with sklearn autologging enabled.
    # NOTE(review): autolog may already log the fitted model, so the explicit
    # log_model call below could be a second copy; confirm which one is wanted.
    mlflow.sklearn.autolog(log_model_signatures=True, log_input_examples=True, silent=True)
    final_model = RandomForestClassifier(random_state=SEED).fit(full_X, full_Y.astype(int))
    print("\nRandom forest model fitted")
    mlflow.sklearn.log_model(final_model, "model")
    print(run.info.run_name)