In [1]:
# Standard library
import base64
import datetime
import gc
import logging
import os
import pickle as p
import time
from functools import reduce

# Third-party
import matplotlib
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import pandas as pd
import pandas.core.algorithms as algos
from dateutil.relativedelta import relativedelta
from matplotlib.ticker import FuncFormatter
from sklearn import metrics
from sklearn.model_selection import train_test_split
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and later
# releases dropped the old names, so use whichever name this installation provides.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

import warnings
# NOTE(review): this silences every warning for the whole notebook, including pandas
# deprecation notices; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
# Run a full garbage-collection pass; the value displayed by the cell is the
# number of unreachable objects the collector found.
gc.collect()

51

In [4]:
# Client & TPA

# Load the TPA lookup sheet and the client master sheet.
TPA = pd.read_excel('../data/Copy_of_TPA_cash.xlsx', sheet_name='Sheet1')
Client = pd.read_excel('../data/Client_info_20191118.xlsx', sheet_name='Sheet1')

# 3 clients have duplicate records; to inspect them:
# Client[Client['Client ID'].isin([1500,7601,2449])]
# The redundant rows are removed by their position in the sheet.
# NOTE(review): positions 0, 1 and 497 are tied to this exact file version.
Client = Client.drop(index=Client.index[[0, 1, 497]])

# Attach the TPA lookup; both frames carry a 'TPA' column, so the merge yields
# 'TPA_x' (client sheet) and 'TPA_y' (lookup sheet).
Client = Client.merge(TPA, how='left', left_on='Client ID', right_on='HOST ID')

# Prefer the client sheet's TPA and fall back to the lookup value where it is missing,
# then normalise the name to upper case (missing values stay NaN).
Client['TPA'] = Client['TPA_x'].fillna(Client['TPA_y'])
Client['TPA'] = Client['TPA'].map(lambda name: name.upper() if pd.notnull(name) else np.nan)

# The merge helper columns are no longer needed.
Client = Client.drop(columns=['TPA_x', 'TPA_y', 'HOST ID'])

In [None]:
### 1. Customer Dataset

# Two extracts of the customer data request.
Customer_old = pd.read_csv('../data/Customer_Data_Request_20191121.csv')
Customer = pd.read_csv('../data/Customer_Data_Request_20191217.csv')

# Stack the two extracts into one frame. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat gives the same result on every version.
Customer = pd.concat([Customer_old, Customer], ignore_index=True)

# 2 customers have duplicate records; ClientID 2049 is kept as their real client,
# but this is just an arbitrary choice. To inspect them:
# Customer[Customer['Unique_Customer_ID'].isin([100541023,100696306])]
# The duplicate rows are dropped by position in the concatenated frame.
# NOTE(review): positions 72898 and 51730 are tied to these exact file versions.
Customer = Customer.drop(Customer.index[[72898, 51730]])


# Optional cleaning (currently disabled): eliminate customers where
# Customer_Salary is NaN, 0 or '.'; Assigned_Spending_Limit is 0; Customer_Tenure is NaN.

# Customer = Customer[(Customer['Assigned_Spending_Limit'] != 0) &
#                     (~Customer['Customer_Salary'].isin([0, '.'])) &
#                     (Customer['Customer_Salary'].notnull())&
#                     (Customer['Customer_Tenure'].notnull())]


# Format conversion: upper-case the free-text columns (missing values stay NaN).
for text_col in ['Street1', 'Street2', 'City', 'Status']:
    Customer[text_col] = Customer[text_col].apply(
        lambda x: x.upper() if pd.notnull(x) else np.nan)

# '.' is used as a missing-value placeholder in the raw files.
Customer.loc[Customer['Customer_Salary'] == '.', 'Customer_Salary'] = np.nan
Customer['Customer_Salary'] = Customer['Customer_Salary'].astype(float)

Customer['Enrollment_Date'] = pd.to_datetime(
    Customer['Enrollment_Date'].astype(str), format='%m/%d/%Y')
Customer.loc[Customer['Term_Date'] == '.', 'Term_Date'] = np.nan
Customer['Term_Date'] = pd.to_datetime(
    Customer['Term_Date'].astype(str), format='%m/%d/%Y')

# Attach client attributes (including TPA) to every customer.
Customer = Customer.merge(Client, how='left', left_on='Unique_Company_ID', right_on='Client ID')
Customer.head()

In [None]:
### 2. Performance Dataset

Perf_11 = pd.read_csv('../data/Performance_Data_Request_20191217.csv')
Perf = pd.read_csv('../data/Performance_Data_Request_20191204.csv')

# The two extracts use different column names; harmonise them before stacking.
Perf_11 = Perf_11.rename(columns={' Unique_Order_ID': 'Unique_Order_ID', 'Order_date': 'Order_Date'})
Perf = Perf.rename(columns={'items_ordered': 'Items_Ordered', 'EMPSTATUS': 'Customer_Status'})

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat gives
# the same result on every version.
Perf = pd.concat([Perf, Perf_11], ignore_index=True)

# Make sure the rows are ordered by OrderID and, within an order, chronologically.
# The cumulative Ever_Bad flag below relies on this ordering.
Perf['Year_and_Month'] = pd.to_datetime(Perf['Year_and_Month'].astype(str), format='%Y%m')
Perf = Perf.sort_values(['Unique_Order_ID', 'Year_and_Month'], ascending=[False, True])

# Create new columns - YQ (calendar quarter) and Year
Perf['YQ'] = pd.PeriodIndex(Perf['Year_and_Month'], freq='Q')
Perf['Year'] = Perf['YQ'].dt.year

# '.' is used as a missing-value placeholder in the raw files.
Perf.loc[Perf['Date_account_went_delinquent'] == '.', 'Date_account_went_delinquent'] = np.nan
Perf['Date_account_went_delinquent'] = pd.to_datetime(
    Perf['Date_account_went_delinquent'].astype(str), format='%m/%d/%Y')


# Assign Payment Order as a new column: 1, 2, 3, ... by month within each order.
Perf['Payment_Order'] = Perf.groupby('Unique_Order_ID')['Year_and_Month'].rank(method='first')

# Treat 151+ and Chargeoff as BAD
Perf['Bad'] = 0
Perf.loc[Perf['Default_Status'].isin(['151+', 'Chargeoff']), 'Bad'] = 1
# Ever_Bad is the running count of bad months per order, so the first bad month
# has Ever_Bad == 1 and later bad months have Ever_Bad > 1.
Perf['Ever_Bad'] = Perf.groupby('Unique_Order_ID')['Bad'].cumsum()

# Create 2os Loss, using Losses for Chargeoff orders and Current_Balance for 151+ orders.
# Start from a float column so the assignments below do not upcast an int column.
Perf['2os_loss'] = 0.0
is_chargeoff = Perf['Default_Status'] == 'Chargeoff'
is_151_plus = Perf['Default_Status'] == '151+'
Perf.loc[is_chargeoff, '2os_loss'] = Perf.loc[is_chargeoff, 'Losses']
Perf.loc[is_151_plus, '2os_loss'] = Perf.loc[is_151_plus, 'Current_Balance']


# Perf['Enrollment_Date'] = pd.to_datetime(Perf['Enrollment_Date'].astype(str), format = '%m/%d/%Y')

# Some cleaning here: convert '.' placeholders to NaN and cast to float.
Perf.loc[Perf['Order_Amount'] == '.', 'Order_Amount'] = np.nan
Perf['Order_Amount'] = Perf['Order_Amount'].astype(float)

# Keep only rows with a known, non-zero order amount.
Perf = Perf[(Perf['Order_Amount'].notnull()) & (Perf['Order_Amount'] != 0)]

Perf.loc[Perf['Payments_Due'] == '.', 'Payments_Due'] = np.nan
Perf['Payments_Due'] = Perf['Payments_Due'].astype(float)


# Attach customer (and client) attributes to every performance row.
Perf = Perf.merge(Customer, how='left', on='Unique_Customer_ID')

In [None]:
# Select performance data from 2015 onward. This has to run before the '2os_bal'
# column is built, and the copy makes df_15 independent of Perf so the column
# assignments below do not write into a slice of Perf.
df_15 = Perf[Perf['Year'] >= 2015].copy()

# '2os_bal' defaults to the current balance, with these overrides:
#   - Chargeoff rows use Losses,
#   - Chargeoff rows whose Losses is 0 fall back to Current_Balance,
#   - the first payment row of every order uses Order_Amount (applied last).
df_15['2os_bal'] = df_15['Current_Balance']

is_chargeoff = df_15['Default_Status'] == 'Chargeoff'
df_15.loc[is_chargeoff, '2os_bal'] = df_15.loc[is_chargeoff, 'Losses']

# Use the same mask on both sides rather than relying on index alignment.
chargeoff_no_loss = is_chargeoff & (df_15['Losses'] == 0)
df_15.loc[chargeoff_no_loss, '2os_bal'] = df_15.loc[chargeoff_no_loss, 'Current_Balance']

is_first_payment = df_15['Payment_Order'] == 1
df_15.loc[is_first_payment, '2os_bal'] = df_15.loc[is_first_payment, 'Order_Amount']

In [65]:
# Monthly flow-in loss: total '2os_loss' per month over rows that are bad and
# whose running bad count is at most 1 (i.e. the first bad month of an order).
first_time_bad = (df_15['Bad'] == 1) & (df_15['Ever_Bad'] <= 1)
d1 = df_15[first_time_bad].groupby('Year_and_Month', as_index=False)['2os_loss'].sum()

In [66]:
# Monthly balance: sum of Current_Balance over all rows of each month.
d2 = df_15.groupby('Year_and_Month', as_index=False)['Current_Balance'].sum()

In [68]:
# Put each month's flow-in loss next to that month's total balance,
# joining on `Year_and_Month` and keeping every month present in d1.
d3 = pd.merge(d1, d2, how='left', on='Year_and_Month')

In [71]:
# Monthly loss divided by the month's balance, scaled by 12 to annualise,
# plus the calendar year of each month for the yearly roll-up.
d3 = d3.assign(
    loss_rate=d3['2os_loss'].div(d3['Current_Balance']).mul(12),
    Year=d3['Year_and_Month'].dt.year,
)
d3

Unnamed: 0,Year_and_Month,2os_loss,Current_Balance,loss_rate
0,2015-01-01,1534316.62,196056600.0,0.093911
1,2015-02-01,1318131.52,187183800.0,0.084503
2,2015-03-01,1532162.89,181737300.0,0.101168
3,2015-04-01,1309643.19,176590200.0,0.088995
4,2015-05-01,1365530.15,172102100.0,0.095213
5,2015-06-01,1600372.25,169404200.0,0.113365
6,2015-07-01,1976183.22,168165100.0,0.141017
7,2015-08-01,1994903.7,170876400.0,0.140095
8,2015-09-01,2014090.87,167677500.0,0.14414
9,2015-10-01,1888123.44,164854500.0,0.137439


In [None]:
# Unweighted average of the monthly annualised loss rates within each year.
d3.groupby('Year', as_index=False)['loss_rate'].mean()

In [None]:
# Inspect the performance rows of a single quarter.
# NOTE(review): the original cell was left unfinished (`Perf[Perf['YQ']==]`), which is a
# SyntaxError and stops Run All. The quarter below is a placeholder -- set it to the
# quarter of interest.
inspect_YQ = pd.Period('2015Q1', freq='Q')
Perf[Perf['YQ'] == inspect_YQ].head()

In [None]:
def vintage_dollar_view(df, YQ, vintage, n_months=30):
    """Build the cumulative bad-balance curve for one vintage quarter.

    Orders belonging to the vintage are the ``Unique_Order_ID`` values of the
    rows in ``vintage`` whose ``YQ`` equals ``YQ``; the denominator is the sum
    of ``Current_Balance`` over those same rows. For each such order, the
    earliest row in ``df`` with ``Default_Status`` of '151+' or 'Chargeoff'
    contributes its ``Current_Balance`` to the month it occurred in.

    Parameters
    ----------
    df : pandas.DataFrame
        Monthly performance rows with columns ``Unique_Order_ID``,
        ``Year_and_Month`` (datetime), ``Default_Status`` and ``Current_Balance``.
    YQ : pandas.Period
        Vintage quarter. It is compared against ``vintage['YQ']``, its
        ``to_timestamp()`` gives the first month of the window, and ``str(YQ)``
        names the rate column.
    vintage : pandas.DataFrame
        Rows defining vintages, with columns ``YQ``, ``Unique_Order_ID`` and
        ``Current_Balance``.
    n_months : int, default 30
        Number of consecutive months (month starts) in the output window.

    Returns
    -------
    pandas.DataFrame
        One row per month with columns ``month_order`` (1..n_months),
        ``Year_and_Month``, ``Current_Balance`` (balance turning bad for the
        first time in that month, 0 when none), ``DQ_Balance`` (running total)
        and ``'<YQ>_Vintage'`` (``DQ_Balance`` divided by the vintage balance).
        First-bad events outside the window are not included. If the vintage
        balance is 0 the rate column is inf/NaN rather than an error.
    """
    # Filter the vintage once and derive both the order list and the denominator from it.
    in_vintage = vintage[vintage['YQ'] == YQ]
    order_list = in_vintage['Unique_Order_ID'].unique()
    total_balance = in_vintage['Current_Balance'].sum()

    df_YQ = df[df['Unique_Order_ID'].isin(order_list)]
    df_DQ = df_YQ[df_YQ['Default_Status'].isin(['151+', 'Chargeoff'])]

    # Keep only the earliest bad row per order. A stable sort followed by
    # drop_duplicates does not depend on index labels, so frames with a
    # non-unique index are not double counted (a label-based .loc[idxmin]
    # lookup would return every row sharing the label).
    df_DQ = (df_DQ.sort_values('Year_and_Month', kind='mergesort')
                  .drop_duplicates(subset='Unique_Order_ID', keep='first'))

    view = df_DQ.groupby('Year_and_Month').agg({'Current_Balance': 'sum'}).reset_index()

    # Month scaffold starting at the first month of the vintage quarter, so
    # months without any first-time-bad balance still appear in the output.
    temp = pd.DataFrame({
        'month_order': np.arange(1, n_months + 1),
        'Year_and_Month': pd.date_range(start=YQ.to_timestamp(), periods=n_months, freq='MS'),
    })
    view = temp.merge(view, how='left', on='Year_and_Month')
    view['Current_Balance'] = view['Current_Balance'].fillna(0)

    view['DQ_Balance'] = view['Current_Balance'].cumsum()
    view['DQ_Rate'] = view['DQ_Balance'] / total_balance

    view = view.rename(columns={'DQ_Rate': str(YQ) + '_Vintage'})
    return view