In [1]:
import pandas as pd
import numpy as np
from time import time

In [2]:
class Transformation:
    """
    Encapsulation for individual computation logic used for feature creation.

    Applies a transformation function to one or more pandas Series (or to a
    DataFrame built from them) in order to produce one or more features.

    Args:
        on_col: String or sequence of strings, the column names to use when
            creating features
        f: The function to use to transform the input
        name: The name of the transformed feature. For a DataFrame result a
            list/tuple supplies the column names and a string is used as a
            prefix; None leaves the result's own labels untouched.
        args: dictionary of additional arguments of f
        transform_type: Whether the transform is applied on series
            ('series') or on a dataframe object ('dataframe')

    Attributes:
        transformation_applied: False until ``apply`` has completed once.
    """

    def __init__(self, on_col, f, name, args=None, transform_type='series'):
        self.on_col = on_col
        self.transformation_f = f
        self.name = name
        self.transform_type = transform_type

        # Additional arguments for transformation
        self.transformation_args = args

        self.transformation_applied = False

    def __repr__(self):
        f_name = getattr(self.transformation_f, '__name__', repr(self.transformation_f))
        return '{}(name={!r}, on_col={!r}, f={})'.format(
            type(self).__name__, self.name, self.on_col, f_name)

    def _call_transformation(self, args):
        """
        Call the transformation function with ``args``.

        The arguments are passed by keyword when the function's signature
        accepts those keywords and positionally (in dict order) otherwise.
        The choice is made up front from the signature, so a TypeError raised
        inside the function itself propagates instead of triggering a second,
        positional call.
        """
        import inspect

        func = self.transformation_f
        try:
            signature = inspect.signature(func)
        except (TypeError, ValueError):
            # Some callables expose no introspectable signature; keep the
            # try-keywords-then-positional behaviour for those.
            try:
                return func(**args)
            except TypeError:
                return func(*args.values())

        try:
            signature.bind(**args)
        except TypeError:
            return func(*args.values())
        return func(**args)

    def apply(self, args):
        """
        Run the transformation and return its result with a fresh 0..n-1 index.

        Args:
            args: dict mapping argument names to values for the function.

        Returns:
            The transformation output after ``reset_index(drop=True)``. When
            ``name`` is set, a Series result becomes a one-column DataFrame
            named ``name`` and a DataFrame result has its columns renamed.
        """
        tdf = self._call_transformation(args)

        if self.name is not None:
            if isinstance(tdf, pd.DataFrame):
                # Rename on a shallow copy so the frame handed back by the
                # transformation function is not relabelled behind its back.
                tdf = tdf.copy(deep=False)
                if isinstance(self.name, (list, tuple)):
                    tdf.columns = list(self.name)
                else:
                    tdf.columns = ['{}_{}'.format(self.name, col) for col in tdf.columns]
            elif isinstance(tdf, pd.Series):
                tdf = pd.DataFrame(tdf)
                tdf.columns = [self.name]

        tdf = tdf.reset_index(drop=True)

        self.transformation_applied = True

        return tdf
        
    
    
class FeaturePipeline:
    """
    Create a feature pipeline for a raw data source.

    Transformations are registered with ``add_transformation`` and executed in
    registration order by ``apply_transformations``. The features produced by
    the last run are cached on the instance so that later runs only compute
    transformations that have not been applied yet.

    Args:
        logger: Optional object stored on the pipeline and attached to each
            registered transformation function as its ``logger`` attribute.

    Attributes:
        transformations: The feature transformations for the pipeline
        transformed_df: Features from the last ``apply_transformations`` run,
            or None if nothing has been computed yet
        logger: The logger passed at construction
    """

    def __init__(self, logger=None):
        self.transformations = []
        self.transformed_df = None
        self.logger = logger

    @staticmethod
    def _as_list(cols):
        """Return ``cols`` as a list, treating a bare string as one column name."""
        return [cols] if isinstance(cols, str) else list(cols)

    def _is_registered(self, on_cols, transformation_f, transformation_args, name, transform_type):
        """Return True if an identical transformation is already in the pipeline."""
        wanted_cols = self._as_list(on_cols)
        for existing in self.transformations:
            if existing.transformation_f is not transformation_f:
                continue
            if self._as_list(existing.on_col) != wanted_cols:
                continue
            if existing.name != name or existing.transform_type != transform_type:
                continue
            try:
                same_args = bool(existing.transformation_args == transformation_args)
            except (TypeError, ValueError):
                # Argument values without a plain boolean equality (e.g.
                # arrays) cannot be proven identical; treat as different.
                same_args = False
            if same_args:
                return True
        return False

    def add_transformation(self, on_cols, transformation_f, transformation_args=None, name=None, transform_type='series'):
        """
        Add a feature.

        The same function may be registered several times as long as the
        columns, name, type or arguments differ; an exact repeat of an
        existing registration is skipped.

        Args:
            on_cols: String or sequence of strings, the input column names
            transformation_f: The function to use to transform the input
            transformation_args: Additional arguments for transformation
            name: Transformation name
            transform_type: Transformation type ('series' or 'dataframe')

        Returns:
            self
        """
        if isinstance(on_cols, str):
            on_cols = [on_cols]

        if self._is_registered(on_cols, transformation_f, transformation_args, name, transform_type):
            print('Transformation: {} already added, skipping...'.format(transformation_f))
            return self

        try:
            transformation_f.logger = self.logger
        except AttributeError:
            # Built-ins and bound methods do not accept new attributes; the
            # transformation itself is still usable without a logger.
            pass

        self.transformations.append(
            Transformation(on_col=on_cols,
                           f=transformation_f,
                           args=transformation_args,
                           name=name,
                           transform_type=transform_type
                          ))

        return self

    def get_args(self, t, df, transformed_df):
        """
        Creates the argument dict for a transformation.

        Each column in ``t.on_col`` is taken from ``df`` when present there
        and from ``transformed_df`` otherwise.

        Args:
            t: Transformation whose ``on_col``, ``transform_type`` and
                ``transformation_args`` are read
            df: Input DataFrame
            transformed_df: Features computed so far, or None

        Returns:
            dict with keys 'ser1', 'ser2', ... for a 'series' transform or a
            single 'df' key for a 'dataframe' transform, updated with
            ``t.transformation_args`` when those are set.

        Raises:
            KeyError: a column is in neither ``df`` nor ``transformed_df``
            AssertionError: the concatenated frame's row count differs from
                the first series
            ValueError: ``t.transform_type`` is not recognised
        """
        on_col_list = [t.on_col] if isinstance(t.on_col, str) else t.on_col

        list_of_series = []
        for c in on_col_list:
            if c in df.columns:
                list_of_series.append(df[c])
                continue
            try:
                list_of_series.append(transformed_df[c])
            except (KeyError, TypeError, IndexError) as err:
                # TypeError covers transformed_df being None.
                raise KeyError('Column {} not found in input dataframes'.format(c)) from err

        if t.transform_type == 'series':
            args = {'ser{}'.format(i): s for i, s in enumerate(list_of_series, start=1)}

        elif t.transform_type == 'dataframe':
            arg_df = pd.concat(list_of_series, axis=1)
            # Raised explicitly (not via `assert`) so the check survives -O.
            if arg_df.shape[0] != list_of_series[0].shape[0]:
                raise AssertionError('Argument dataframe has different shape than concatenated series')
            args = {'df': arg_df}
        else:
            raise ValueError('Unknown transform type: {!r}'.format(t.transform_type))

        if t.transformation_args:
            args.update(t.transformation_args)

        return args

    @staticmethod
    def _index_by_columns(df, index_cols):
        """
        Return ``df`` deduplicated and indexed by the concatenated key columns.

        The key is the string form of each ``index_cols`` value joined without
        a separator; only the first row of each key is kept.
        """
        if isinstance(index_cols, str):
            index_cols = [index_cols]

        key = df[index_cols[0]].astype(str)
        for col in index_cols[1:]:
            key = key + df[col].astype(str)

        keep = ~key.duplicated().values
        df = df.loc[keep]
        df.index = key[keep].tolist()
        return df

    def apply_transformations(self, df, index_cols=None, reapply_all=False):
        """
        Apply feature pipeline on raw data.

        Args:
            df: Input DataFrame
            index_cols: Optional column name(s) whose concatenated values
                become the row index; rows with a repeated key are dropped
            reapply_all: Recompute every transformation from scratch instead
                of reusing the cached features

        Returns:
            Features DataFrame (None if nothing has ever been computed)

        Note:
            Cached features are reused only when their index matches the
            index of the current input; otherwise everything is recomputed.
            Input with the same index but different values is not detected —
            pass ``reapply_all=True`` in that case.
        """
        # Deduplicate the input dataframe on columns
        df = df.loc[:, ~df.columns.duplicated()].copy()

        if index_cols:
            df = self._index_by_columns(df, index_cols)
            index_list = df.index.tolist()
        else:
            index_list = df.index

        cached = self.transformed_df
        if cached is not None and not cached.index.equals(pd.Index(index_list)):
            # The cache belongs to different rows; reusing it would return
            # stale features or misalign the concat below.
            reapply_all = True

        # A full re-run starts from an empty result so columns are not
        # appended a second time to the cached frame.
        transformed_df = None if reapply_all else cached

        for t in self.transformations:
            if t.transformation_applied and (not reapply_all):
                print('Transformation: {} applied, skipping...'.format(t))
                continue
            args = self.get_args(t=t, df=df, transformed_df=transformed_df)

            tdf = t.apply(args)
            tdf.index = index_list
            if transformed_df is None:
                transformed_df = tdf
            else:
                transformed_df = pd.concat([transformed_df, tdf], axis=1)

            self.transformed_df = transformed_df
        return transformed_df

        

### 1. Read Raw Data

In [3]:
# Load the raw sample data from the Excel workbook into a DataFrame.
raw_data = pd.read_excel("../data/sample_raw_data.xlsx")

In [4]:
# Cast every column to str so describe() reports count/unique/top/freq for all of them.
raw_data.astype(str).describe()

Unnamed: 0,ClaimNumber,LossCountry,ReportDate,LossDate,Hospital_Start_Date,Hospital_End_Date,DepartureDate,ReturnDate,Claimed_Amount,POL_Eff_Date
count,5000,5000,5000,5000,5000,5000,5000.0,5000.0,5000.0,5000
unique,5000,75,1622,1901,128,125,913.0,766.0,455.0,2286
top,Claim_2231,Thailand,2020-03-18,2018-09-16,NaT,NaT,,,0.0,2014-04-01
freq,1,1852,14,19,4867,4869,3739.0,4004.0,2817.0,310


In [5]:
# List the column names available to the transformations below.
raw_data.columns

Index(['ClaimNumber', 'LossCountry', 'ReportDate', 'LossDate',
       'Hospital_Start_Date', 'Hospital_End_Date', 'DepartureDate',
       'ReturnDate', 'Claimed_Amount', 'POL_Eff_Date'],
      dtype='object')

### 2. Define transform functions


In [6]:
# Computation logic defined on series

def transforms_above_threshold(ser1, threshold):
    """
    Flag the entries of a series that are strictly above a threshold.

    ser1: Series of values to compare
    threshold: value an entry has to exceed to be flagged

    Returns a Series of 1/0 indicators that carries the index of ser1.
    """
    exceeds = ser1 > threshold
    flags = np.where(exceeds, 1, 0)
    return pd.Series(flags, index=ser1.index)


def transforms_days_between(ser1, ser2):
    """
    Number of whole days from one date series to another.

    ser1: 'From' Date Series
    ser2: 'To' Date Series

    Both inputs are parsed with pd.to_datetime. Negative differences
    ('To' earlier than 'From') are raised to 0.
    """
    start = pd.to_datetime(ser1)
    end = pd.to_datetime(ser2)
    elapsed_days = (end - start).dt.days

    # Enforce To Date >= From Date by flooring the gap at zero days
    return elapsed_days.clip(lower=0)


### 3. Create the feature pipeline

In [7]:
# Start with an empty pipeline (no logger); features are registered in the cells below.
ff = FeaturePipeline()

#### 3.1. Fraudulent claims generally have high claimed amounts

In [8]:
# HighClaimedAmount: 1 when Claimed_Amount is strictly above 500, else 0.
ff.add_transformation(
    on_cols='Claimed_Amount',
    transformation_f=transforms_above_threshold,
    transformation_args={'threshold': 500},
    name='HighClaimedAmount',
)

<__main__.FeaturePipeline at 0x24801671e50>

#### 3.2. Tracking travel duration could help identify suspicious claims

In [9]:
# TravelLength: days from DepartureDate to ReturnDate.
ff.add_transformation(
    on_cols=('DepartureDate', 'ReturnDate'),
    transformation_f=transforms_days_between,
    name='TravelLength',
)

<__main__.FeaturePipeline at 0x24801671e50>

#### 3.3. Tracking the duration between loss date & report date could help identify suspicious claims

In [10]:
# ReportLag: days from LossDate to ReportDate.
ff.add_transformation(
    on_cols=('LossDate', 'ReportDate'),
    transformation_f=transforms_days_between,
    name='ReportLag',
)

<__main__.FeaturePipeline at 0x24801671e50>

#### 3.4. Tracking the duration between hospital admission date & hospital discharge date could help identify suspicious claims

In [11]:
# HospitalStayLength: days from Hospital_Start_Date to Hospital_End_Date.
ff.add_transformation(
    on_cols=('Hospital_Start_Date', 'Hospital_End_Date'),
    transformation_f=transforms_days_between,
    name='HospitalStayLength',
)

<__main__.FeaturePipeline at 0x24801671e50>

#### 3.5. Tracking the duration between Policy Effective Date & Loss Date could help identify suspicious claims

In [12]:
# LossDurationSincePolicyEffective: days from POL_Eff_Date to LossDate.
ff.add_transformation(
    on_cols=('POL_Eff_Date', 'LossDate'),
    transformation_f=transforms_days_between,
    name='LossDurationSincePolicyEffective',
)

<__main__.FeaturePipeline at 0x24801671e50>

In [13]:
# Run every registered transformation; the resulting rows are indexed by ClaimNumber.
features = ff.apply_transformations(df=raw_data, index_cols=['ClaimNumber'])

In [14]:
# Summary statistics of the generated feature columns.
features.describe()

Unnamed: 0,HighClaimedAmount,TravelLength,ReportLag,HospitalStayLength,LossDurationSincePolicyEffective
count,5000.0,982.0,5000.0,131.0,5000.0
mean,0.0786,10.178208,43.5796,4.022901,1177.416
std,0.26914,28.931812,85.770203,7.125756,1570.842842
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,10.0,1.0,91.0
50%,0.0,4.0,22.0,2.0,454.5
75%,0.0,9.0,51.0,4.0,1770.25
max,1.0,364.0,2224.0,66.0,10833.0


In [15]:
# Preview the first rows of the feature table.
features.head()

Unnamed: 0,HighClaimedAmount,TravelLength,ReportLag,HospitalStayLength,LossDurationSincePolicyEffective
Claim_0,0,,4,,4
Claim_1,0,2.0,11,,9
Claim_2,0,,11,,4904
Claim_3,0,,24,,199
Claim_4,0,,16,,1340
