In [7]:
# Core numeric / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
# Raise the pandas display.max_rows option to 100.
pd.options.display.max_rows = 100
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [8]:
# helper functions
def col_rename(df, dict_rules:dict, ip=True):
    ''' rename the columns of df following the old -> new mapping in dict_rules

    with ip=True (the default) df is modified in place and None is returned;
    with ip=False df is left untouched and the renamed copy is returned
    '''
    renamed = df.rename(columns=dict_rules, inplace=ip)
    return renamed

def print_unique(df, col:str):
    ''' return the distinct values found in df[col] (nothing is printed) '''
    column = df[col]
    return column.unique()

def fill_nans_with_means(df, col:str):
    ''' fill nans in df[col] with that column's mean, then round to whole numbers

    returns a new Series; df itself is not modified
    '''
    column = df[col]
    # use the mean of the column being filled, not of the global c_df.income,
    # so the helper works for any frame and any column
    return column.fillna(column.mean()).round()

def make_lower(df, col:str):
    ''' return df[col] with every string converted to lower case '''
    text_column = df[col]
    return text_column.str.lower()

def lower_case_column_names(df) ->pd.DataFrame:
    ''' lower-case every column header of df in place and hand the same df back '''
    lowered = []
    for header in df.columns:
        lowered.append(header.lower())
    df.columns = lowered
    return df

def strip_char(df, col:str, char:"single character to strip"):
    ''' convert the entries of df[col] to rounded numbers and return them as a list

    string entries have char stripped from both ends, are parsed as float,
    divided by 100 and rounded to 0 decimals; any other entry is only rounded
    '''
    cleaned = []
    for value in df[col]:
        if type(value) == str:
            cleaned.append(round(float(value.strip(char)) / 100, 0))
        else:
            cleaned.append(round(value, 0))
    return cleaned

def record_str_replace(df, col:str, rules:dict):
    ''' return df[col] with its values substituted according to the rules mapping '''
    target = df[col]
    return target.replace(rules)

def get_between_slash_with_split(df,col:str, place):
    ''' split each entry of df[col] on "/" and return the piece at index place

    entries whose split result is a float (nan) are skipped, so the returned
    list can be shorter than the column
    '''
    pieces = []
    for parts in df[col].str.split('/'):
        if type(parts) == float:
            continue
        pieces.append(parts[place])
    return pieces


def get_between_slash(df, col: str, place):
    ''' return, for each entry of df[col], the character at position place

    for a string such as "a/b/cc" with a one-character middle field, place=2
    yields "b"; non-string entries (e.g. nans) are passed through unchanged
    '''
    picked = []
    for entry in df[col]:
        picked.append(entry[place] if type(entry) == str else entry)
    return picked


def get_between_slash_and_join(df, col):
    ''' return the third character of each string in df[col] converted to int

    e.g. "1/5/00" -> 5; non-string entries (e.g. nans) are passed through unchanged
    '''
    converted = []
    for entry in df[col]:
        if type(entry) == str:
            converted.append(int(entry[2]))
        else:
            converted.append(entry)
    return converted


def state(old_names, new_names):
    ''' return the 'st' column of the global c_df with old_names replaced by new_names '''
    st_column = c_df['st']
    return st_column.replace(old_names, new_names)

In [9]:
# Load the cleaned customer analysis CSV into c_df and display it.
c_df = pd.read_csv('Data/customer_analysis_data_cleaned.csv')
c_df

Unnamed: 0,state,gender,education,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,policy_type,vehicle_class,total_claim_amount
0,east,u,master,37707,37707,1000,0.0,personal auto,four-door car,3
1,central,f,bachelor,6980,37707,94,0.0,personal auto,four-door car,1131
2,central,f,bachelor,12887,48767,108,0.0,personal auto,two-door car,566
3,west,m,bachelor,7646,37707,106,0.0,corporate auto,suv,530
4,east,m,high school or below,5363,36357,68,0.0,personal auto,four-door car,17
...,...,...,...,...,...,...,...,...,...,...
7064,west,f,college,4100,47761,104,0.0,personal auto,four-door car,541
7065,west,m,bachelor,23406,71941,73,0.0,personal auto,four-door car,198
7067,west,m,bachelor,8164,37707,85,3.0,corporate auto,four-door car,791
7068,west,m,college,7524,21941,96,0.0,personal auto,four-door car,691


In [10]:
# Rename 'state' to 'region'; col_rename defaults to ip=True, so c_df is changed in place.
col_rename(c_df, {'state':'region'})

In [11]:
# Sanity check: smallest value in the income column.
c_df.income.min()

10037

In [12]:
# Distinct vehicle classes before the luxury categories are merged.
c_df.vehicle_class.unique()

array(['four-door car', 'two-door car', 'suv', 'luxury suv', 'sports car',
       'luxury car'], dtype=object)

In [13]:
# Merge the two luxury categories into one. The result is assigned back to the
# column instead of calling replace(inplace=True) on c_df.vehicle_class: under
# pandas Copy-on-Write the attribute access yields a temporary object, so the
# in-place form would leave c_df unchanged.
c_df['vehicle_class'] = c_df['vehicle_class'].replace({'luxury suv': 'luxury vehicle', 'luxury car':'luxury vehicle'})
c_df.vehicle_class.unique()

array(['four-door car', 'two-door car', 'suv', 'luxury vehicle',
       'sports car'], dtype=object)

In [14]:
from scipy.stats import iqr

In [15]:
# Print the docstring of scipy.stats.iqr for reference.
print(iqr.__doc__)


    Compute the interquartile range of the data along the specified axis.

    The interquartile range (IQR) is the difference between the 75th and
    25th percentile of the data. It is a measure of the dispersion
    similar to standard deviation or variance, but is much more robust
    against outliers [2]_.

    The ``rng`` parameter allows this function to compute other
    percentile ranges than the actual IQR. For example, setting
    ``rng=(0, 100)`` is equivalent to `numpy.ptp`.

    The IQR of an empty array is `np.nan`.

    .. versionadded:: 0.18.0

    Parameters
    ----------
    x : array_like
        Input array or object that can be converted to an array.
    axis : int or sequence of int, optional
        Axis along which the range is computed. The default is to
        compute the IQR for the entire array.
    rng : Two-element sequence containing floats in range of [0,100] optional
        Percentiles over which to compute the range. Each must be
        between 0

In [16]:
# Column dtypes, non-null counts and memory usage of c_df.
c_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8706 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   region                     8706 non-null   object 
 1   gender                     8706 non-null   object 
 2   education                  8706 non-null   object 
 3   customer_lifetime_value    8706 non-null   int64  
 4   income                     8706 non-null   int64  
 5   monthly_premium_auto       8706 non-null   int64  
 6   number_of_open_complaints  8706 non-null   float64
 7   policy_type                8706 non-null   object 
 8   vehicle_class              8706 non-null   object 
 9   total_claim_amount         8706 non-null   int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 748.2+ KB


In [17]:
# Split c_df by dtype: numeric columns in one frame, object columns in the other.
numerics_df   = c_df.select_dtypes(include=np.number)
cats_df       = c_df.select_dtypes(include=object)

In [18]:
# Print the docstring of DataFrame.quantile for reference.
print(c_df.quantile.__doc__)


        Return values at the given quantile over requested axis.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value between 0 <= q <= 1, the quantile(s) to compute.
        axis : {0, 1, 'index', 'columns'}, default 0
            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        numeric_only : bool, default True
            If False, the quantile of datetime and timedelta data will be
            computed as well.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

            * linear: `i + (j - i) * fraction`, where `fraction` is the
              fractional part of the index surrounded by `i` and `j`.
            * lower: `i`.
            * higher: `j`.
            * nearest: `i` or `j` whichever is nearest.
   

In [19]:
# First quartile of each numeric column. numeric_only=True is passed explicitly
# because c_df also holds object columns, and pandas >= 2.0 no longer skips
# them by default.
c_df.quantile(.25, numeric_only=True)

customer_lifetime_value       3982.75
income                       34856.75
monthly_premium_auto            68.00
number_of_open_complaints        0.00
total_claim_amount             265.00
Name: 0.25, dtype: float64

In [25]:
def outlier_mask(df, k: float = 1.5) -> pd.DataFrame:
    """ flag the values of df that lie inside the k*IQR fences

    Parameters
    ----------
    df : DataFrame (a Series also works) of numeric values
    k  : fence width as a multiple of the interquartile range (default 1.5)

    Returns
    -------
    boolean object shaped like df: True where
    q1 - k*IQR <= value <= q3 + k*IQR for the value's column, False for
    outliers. nans compare False against both fences, so they come back True.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    # called spread rather than iqr so the scipy.stats.iqr import is not shadowed
    spread = q3 - q1
    lower_fence = q1 - k * spread
    upper_fence = q3 + k * spread
    return ~((df < lower_fence) | (df > upper_fence))

In [21]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
numerics_df.describe()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,total_claim_amount
count,8706.0,8706.0,8706.0,8706.0,8706.0
mean,8011.788307,47396.068573,110.594762,0.38686,431.266942
std,6888.027238,21648.270739,591.626545,0.911164,291.519072
min,1898.0,10037.0,61.0,0.0,0.0
25%,3982.75,34856.75,68.0,0.0,265.0
50%,5789.5,37707.0,83.0,0.0,378.0
75%,8935.0,62373.75,109.0,0.0,547.0
max,83325.0,99981.0,35354.0,5.0,2893.0


In [28]:
# Names of the numeric columns.
numerics_df.columns

Index(['customer_lifetime_value', 'income', 'monthly_premium_auto',
       'number_of_open_complaints', 'total_claim_amount'],
      dtype='object')

In [29]:
# First rows of the numeric columns.
numerics_df.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,number_of_open_complaints,total_claim_amount
1,6980,37707,94,0.0,1131
2,12887,48767,108,0.0,566
3,7646,37707,106,0.0,530
4,5363,36357,68,0.0,17
5,8256,62902,69,0.0,159


In [26]:
cols_to_trim = list(numerics_df.drop(['total_claim_amount'],axis=1).columns)
cols_to_trim
# boolean index of "inliers", always computed from c_df rather than from
# numerics_df, so re-running this cell does not trim an already-trimmed table
# a second time with freshly recomputed quartiles
# NOTE(review): number_of_open_complaints has q1 == q3 == 0 in describe(), so
# its fences collapse to 0 and every row with an open complaint is dropped --
# confirm that is intended
inliers_idx = outlier_mask(c_df[cols_to_trim]).all(axis='columns')
# rebuild both subsets from c_df using the same row mask
numerics_df = c_df[inliers_idx].select_dtypes(include=np.number)
cats_df = c_df[inliers_idx].select_dtypes(include=object)

['customer_lifetime_value',
 'income',
 'monthly_premium_auto',
 'number_of_open_complaints']

In [27]:
# Inspect the row mask: True marks rows kept as inliers.
inliers_idx

1       True
2       True
3       True
4       True
5       True
        ... 
7062    True
7063    True
7064    True
7068    True
7069    True
Length: 6047, dtype: bool

In [23]:
# Confirm that reducing the mask across columns yields a Series.
# (A stray leading backtick made this cell a SyntaxError; it is removed here.)
type(outlier_mask(numerics_df[cols_to_trim]).all(axis='columns'))

pandas.core.series.Series

In [24]:
# Print the docstring of DataFrame.all for reference.
print(c_df.all.__doc__)


Return whether all elements are True, potentially over an axis.

Returns True unless there at least one element within a series or
along a Dataframe axis that is False or equivalent (e.g. zero or
empty).

Parameters
----------
axis : {0 or 'index', 1 or 'columns', None}, default 0
    Indicate which axis or axes should be reduced.

    * 0 / 'index' : reduce the index, return a Series whose index is the
      original column labels.
    * 1 / 'columns' : reduce the columns, return a Series whose index is the
      original index.
    * None : reduce all axes, return a scalar.

bool_only : bool, default None
    Include only boolean columns. If None, will attempt to use everything,
    then use only boolean data. Not implemented for Series.
skipna : bool, default True
    Exclude NA/null values. If the entire row/column is NA and skipna is
    True, then the result will be True, as for an empty row/column.
    If skipna is False, then NA are treated as True, because these are not
    e

In [19]:
# Put the categorical and numeric columns back side by side; this rebinds c_df.
c_df = pd.concat([cats_df, numerics_df],axis=1)

In [20]:
# Save the result; index_label=False writes the index values without a header field for them.
c_df.to_csv('Data/customer_analysis_data_cleaned_trimmed.csv', index_label=False)