# Rolling Window Reformating
### $Time$ $Series$ $3rd$ $Test$

$Vasco$ $Mergulhão$ $-$ $Jan$ $2023$

### Version 1:
 - Takes in Select IDs list and Filterd FullSet.
 - Porcess a Sub-Sample individually
 - Outputs Rolling Window DFs for respective Sample.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby
from datetime import timedelta, date
from pathlib import Path

import time
import datetime
import copy

---
## Data Import New Data

### Credit History

In [5]:
CH_FullSet = pd.read_csv('Data/FilteredIDs/TS_FilIDsSet_CredHist_V1.csv')

In [9]:
CH_FullSet#.head()

Unnamed: 0,short_ID,date,total_ontime_left
0,1,2018-01-25,479700.0
1,1,2018-01-26,393300.0
2,1,2018-01-27,306900.0
3,1,2018-01-28,220500.0
4,1,2018-01-29,134100.0
...,...,...,...
179592704,219089,2023-01-16,4035070.0
179592705,219089,2023-01-17,3948670.0
179592706,219089,2023-01-18,3862270.0
179592707,219089,2023-01-19,3775870.0


In [7]:
CH_FullSet[CH_FullSet.isnull()]

Unnamed: 0,short_ID,date,total_ontime_left
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
179592704,,,
179592705,,,
179592706,,,
179592707,,,


### ID List

In [4]:
ID_List = pd.read_csv('Data/FilteredIDs/Reference_ID_List.csv')

In [5]:
ID_List.head(3)

Unnamed: 0,short_ID,customer_id,Kenya_10k,Rwanda_10k
0,1,689aa5a01c216d8b16ed0250cebdc702,0,0
1,2,21a06929d23550339ee18d98b6e05fc2,0,2
2,3,ea8df9172e5ad7ce79b77bada66d3e3e,0,0


---
## Sample Selection

In [112]:
# Variable defining chosen sub-sample
sample_name = 'Kenya_10k_Set_1'

# Dictonary with all Sub-sample names and Respective Data Column pairs
sample_col_name = {
    'Rwanda_10k_Set_1' : 'Rwanda_10k',
    'Rwanda_10k_Set_2' : 'Rwanda_10k',
    'Rwanda_10k_Set_3' : 'Rwanda_10k',
    'Kenya_10k_Set_1' : 'Kenya_10k',
    'Kenya_10k_Set_2' : 'Kenya_10k',
    'Kenya_10k_Set_3' : 'Kenya_10k'
}
# And this pairs name with correct value
sample_col_valeu = {
    'Rwanda_10k_Set_1' : 1,
    'Rwanda_10k_Set_2' : 2,
    'Rwanda_10k_Set_3' : 3,
    'Kenya_10k_Set_1' : 1,
    'Kenya_10k_Set_2' : 2,
    'Kenya_10k_Set_3' : 3
}

In [113]:
sample_IDs = ID_List[ID_List[sample_col_name[sample_name]] == sample_col_valeu[sample_name]].short_ID.values
CH_SubSample = CH_FullSet[CH_FullSet['short_ID'].isin(sample_IDs)].reset_index(drop=True)

In [114]:
CH_SubSample.head()

Unnamed: 0,short_ID,date,total_ontime_left
0,347,2018-01-19,1332480.0
1,347,2018-01-20,1246080.0
2,347,2018-01-21,1159680.0
3,347,2018-01-22,1073280.0
4,347,2018-01-23,986880.0


---
## Checks and Fixes

### Dataset Bounds

In [115]:
earliest_date = CH_SubSample.date.min()
latest_date = CH_SubSample.date.max() 
print(f'First date on record: {earliest_date}')
print(f'Last date on record: {latest_date}')

First date on record: 2018-01-10
Last date on record: 2023-01-20


### Checking for NaNs and Negative Credits

In [116]:
NaN_values = len(CH_SubSample[CH_SubSample.total_ontime_left.isnull()])
Neg_values = len(CH_SubSample[CH_SubSample['total_ontime_left'] < 0].total_ontime_left)

print(f'There are {NaN_values} NaNs \nThere are {Neg_values} Negative Values')

There are 7041 NaNs 
There are 0 Negative Values


#### Fixing NaNs
For all 10k Sub-Samples:<br>
All NaNs are on 2022-12-12 (most have been a technical issue).<br>
In this case a simple foward fill policy should cause minimal impact of the model performance.<br>

In [117]:
CH_SubSample[CH_SubSample.total_ontime_left.isnull()].date.value_counts()

2022-12-12    7041
Name: date, dtype: int64

In [118]:
if NaN_values > 0:
    CH_SubSample.fillna(method="ffill", inplace=True)
    NaN_values = len(CH_SubSample[CH_SubSample.total_ontime_left.isnull()])
    print('Applied Foward Fill')
else:
    print('Not Applied')

Applied Foward Fill


#### Fixing Negative Values

In [119]:
if Neg_values > 0:
    # Fix Neg values
    # ASSUMPTION: Neg values are a bug, where the sign should simply be possitive
    # ACTION: Take absolute valeu
    CH_SubSample['total_ontime_left'] = abs(CH_SubSample['total_ontime_left'])
    Neg_values = len(CH_SubSample[CH_SubSample['total_ontime_left'] < 0].total_ontime_left)

else:
    print('Not Applied')



Not Applied


---
## Negative Credit Penalty
Current Policy:<br>
All values at zero (i.e., No Credit) are transformed into -7 days of credit.<br>
-7 because of the most commonly applied Bboxx penalty for late payments.

In [120]:
# Penalty is applied in seconds, so -7 days needs to get converted into seconds
late_penalty_seconds = -7 * 24 * 60 * 60 

In [121]:
CH_SubSample['TCredits_seconds'] = np.where(CH_SubSample['total_ontime_left'] == 0, late_penalty_seconds , CH_SubSample['total_ontime_left'])

---
## Adjusting Columns

### Days of Credit

In [122]:
CH_SubSample['TCredits_days'] = CH_SubSample['TCredits_seconds'] / (60 * 60 * 24)

### Re-ordering columns

In [123]:
CH_SubSample = CH_SubSample[['short_ID', 'date', 'TCredits_days', 'TCredits_seconds']]
CH_SubSample.head()

Unnamed: 0,short_ID,date,TCredits_days,TCredits_seconds
0,347,2018-01-19,15.422222,1332480.0
1,347,2018-01-20,14.422222,1246080.0
2,347,2018-01-21,13.422222,1159680.0
3,347,2018-01-22,12.422222,1073280.0
4,347,2018-01-23,11.422222,986880.0


---
# Rolling Window Structure

In [134]:
def window_slicer (df, w_len = 30, stride = None, min_windows = 3, 
                   value_var = 'TCredits_days', time_var =  'date', id_col = 'short_ID',
                   add_col = ['window_start_date']):
    
    start_time = time.time()
    
    total_users = len(df[id_col].unique())
    
    ## Dealing with the step
    if not stride:
        stride = copy.deepcopy(w_len)
        
    # Minimum number of windows
    # Converts the absolute bound (in days) to a bound relative to the given w_len and stride    
    #min_windows = np.floor(((min_days - w_len) / stride) + 1)
    
    # Converts number of windows into min bound in days
    min_days = min_windows * w_len
    
    
    ########################################################################################    
    #######################
    #Creating auxiliary df column names
    col_names = [None] * (w_len + 1)
    col_names[0] = add_col[0]
    for p in range(int(w_len)):
        col_names[p+1] = 'd' + str(p+1)

       
    ##########################################################
    # Creating the Index Structure
    # Lowerbound of Number of Windows for each User
    print(f'Creating Index Structure')
    IDs = df[id_col].unique()
    too_small = []
    user_index = []
    windows_index = []
    i = 1
    for user in IDs: 
        print( f'Working on user ({i} / {total_users})', end='\r')
        i += 1

        lenght_user = df[df[id_col] == user][value_var].shape[0]
        n_w = np.floor(((lenght_user - w_len) / stride) + 1) # formulat with stride https://stackoverflow.com/questions/53796545/number-of-overlapping-windows-of-a-given-length
        if n_w >= min_windows:          
            for k in range(int(n_w)):
                user_index.append(user)
                windows_index.append(k)      
        else:
            too_small.append(user)
                       
    # Warning of delected IDs
    if len(too_small) < 20:
        print(f'\nThese {len(too_small)} IDs had less then {min_days} days in record:')
        print(too_small)
        print('')
    elif len(too_small) >= 20:
        print(f'\n{len(too_small)} IDs had less then {min_days} days in record.')
        print('')

    ##########################################################
    ## Place holder for output df
    double_index = [np.array(user_index),
                    np.array(windows_index)]
   
    df_sliding = pd.DataFrame(index=double_index, columns = col_names)     
    
    ###########################################
    # Filling out DF
    print(f'Filling in Dataframe')
    j=0
    user_list = set(user_index)
    for user, user_df in df.groupby(id_col):
        j += 1
        print( f'Working on user ({j} / {total_users})', end='\r')
        
        if user in user_list:
            n_windows = df_sliding.loc[user].shape[0]      
            for window in range(n_windows): 

                #Start date of given window  
                df_sliding.loc[(user, window) , add_col[0]] = user_df.iloc[window * stride][time_var]
                #Values for given window
                df_sliding.loc[(user, window) , col_names[-w_len:]] = user_df.iloc[window * stride : (window * stride) + w_len][value_var].values
    
    #Convert index back into short_ID and new window_ID columns
    df_sliding.reset_index(inplace=True, names=['short_ID', 'window_ID'])
    
    print(f'\nTotal process time: {np.round(time.time() - start_time,2)}[s]') 
    return df_sliding 


In [136]:
window_length = 30
CH_SubSample_w30 = window_slicer(CH_SubSample, w_len = window_length, min_windows = 6, stride = None)

Creating Index Structure
Working on user (10000 / 10000)
375 IDs had less then 180 days in record.

Filling in Dataframe
Working on user (10000 / 10000)
Total process time: 348.49[s]


In [137]:
window_length = 60
CH_SubSample_w60 = window_slicer(CH_SubSample, w_len = window_length, min_windows = 3, stride = None)

Creating Index Structure
Working on user (10000 / 10000)
375 IDs had less then 180 days in record.

Filling in Dataframe
Working on user (10000 / 10000)
Total process time: 195.43[s]


In [138]:
window_length = 90
CH_SubSample_w90 = window_slicer(CH_SubSample, w_len = window_length, min_windows = 2, stride = None)

Creating Index Structure
Working on user (10000 / 10000)
375 IDs had less then 180 days in record.

Filling in Dataframe
Working on user (10000 / 10000)
Total process time: 159.47[s]


In [139]:
window_length = 180
CH_SubSample_w180 = window_slicer(CH_SubSample, w_len = window_length, min_windows = 1, stride = None)

Creating Index Structure
Working on user (10000 / 10000)
375 IDs had less then 180 days in record.

Filling in Dataframe
Working on user (10000 / 10000)
Total process time: 119.7[s]


# Save Processed Data

In [142]:
#Create dedicated folder for this sub-sample
Path(f"Data/{sample_name}").mkdir(parents=True, exist_ok=True)

In [143]:
CH_SubSample_w30.to_csv(f"Data/{sample_name}/{sample_name + '_w30.csv'}", index = False)

In [144]:
CH_SubSample_w60.to_csv(f"Data/{sample_name}/{sample_name + '_w60.csv'}", index = False)

In [145]:
CH_SubSample_w90.to_csv(f"Data/{sample_name}/{sample_name + '_w90.csv'}", index = False)

In [146]:
CH_SubSample_w180.to_csv(f"Data/{sample_name}/{sample_name + '_w180.csv'}", index = False)