In [3]:
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Utility functions for cleaning

In [4]:

def col_rename(df, dict_rules:dict, ip=True):
    return  df.rename(columns = dict_rules, inplace=ip)

def print_unique(df, col:str):
    return df[col].unique()

def fill_nans_with_means(df, col:str):
    ''' fills nans in column with median '''
    return df[col].fillna(c_df.income.mean()).round()

def make_lower(df, col:str):
    return df[col].str.lower()

def lower_case_column_names(df) ->pd.DataFrame:
    ''' make column headers lower case '''
    df.columns = [i.lower() for i in df.columns]
    return df

def strip_char(df, col:str, char:"single character to strip"):
    return list(map(lambda x: round(float(x.strip(char))/100,0)  \
        if type(x)==str else round(x,0), df[col]))

def record_str_replace(df, col:str, rules:dict):
    return df[col].replace(rules) # additional replacement

def get_between_slash_with_split(df,col:str, place):
    return [x[place] for x in df[col].str.split('/') if type(x) != float]


def get_between_slash(df, col: str, place):
    ''' returns middle entry in a string formatted by "a/b/cc" and avoid nans '''
    return list(map(lambda x: x[place] if type(x) == str else x, df[col]))


def get_between_slash_and_join(df, col):
    ''' returns middle entry in a string formatted by "a/b/cc" and avoid nans '''
    return list(map(lambda x: int(x[2]) if type(x) == str else x, df[col]))


def state(old_names, new_names):
    return c_df['st'].replace(old_names, new_names)

In [5]:
# read the data
df_1 = pd.read_csv('Data/file1.csv')
df_2 = pd.read_csv('Data/file2.csv')
df_3 = pd.read_csv('Data/file3.csv')

In [6]:
df_1.columns
df_2.columns
df_3.columns

Index(['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Vehicle Class', 'Total Claim Amount'],
      dtype='object')

Index(['Customer', 'ST', 'GENDER', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Total Claim Amount', 'Policy Type', 'Vehicle Class'],
      dtype='object')

Index(['Customer', 'State', 'Customer Lifetime Value', 'Education', 'Gender',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Total Claim Amount', 'Vehicle Class'],
      dtype='object')

In [7]:
col_rename(df_3, {'State': 'ST'}) # rename State to ST

In [8]:
dfs = [df_1, df_2, df_3]
dfs = list(map(lower_case_column_names,dfs)) # make the headers lowercase
c_df = pd.concat(dfs)                          # concat the data into a pandas frame
c_df = c_df.drop(labels=['customer'], axis=1)    # drop customer label
c_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12074 entries, 0 to 7069
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   st                         9137 non-null   object 
 1   gender                     9015 non-null   object 
 2   education                  9137 non-null   object 
 3   customer lifetime value    9130 non-null   object 
 4   income                     9137 non-null   float64
 5   monthly premium auto       9137 non-null   float64
 6   number of open complaints  9137 non-null   object 
 7   policy type                9137 non-null   object 
 8   vehicle class              9137 non-null   object 
 9   total claim amount         9137 non-null   float64
dtypes: float64(3), object(7)
memory usage: 1.0+ MB


In [9]:
c_df['education'].isna().value_counts()

False    9137
True     2937
Name: education, dtype: int64

In [10]:
gender_old = print_unique(c_df,'gender')
gender_old = list(gender_old)
gender_old

[nan, 'F', 'M', 'Femal', 'Male', 'female']

In [11]:
gender_new           = ['u','f','m','f','m','f']
gender_replace_rules = dict(zip(gender_old,gender_new))
c_df['gender']       = record_str_replace(c_df,'gender',gender_replace_rules)
print_unique(c_df,'gender')

array(['u', 'f', 'm'], dtype=object)

In [12]:
state_old = print_unique(c_df,'st')
state_old = list(state_old)
state_old

['Washington',
 'Arizona',
 'Nevada',
 'California',
 'Oregon',
 'Cali',
 'AZ',
 'WA',
 nan]

In [13]:
state_new = ['Washington',
 'Arizona',
 'Nevada',
 'California',
 'Oregon',
 'California',
 'Arizona',
 'Washington', 'Unknown']
state_replace_rules = dict(zip(state_old,state_new))
c_df['st']          = record_str_replace(c_df,'st',state_replace_rules)
print_unique(c_df,'st')

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon',
       'Unknown'], dtype=object)

In [14]:
c_df['number of open complaints'].isna().value_counts()

False    9137
True     2937
Name: number of open complaints, dtype: int64

In [15]:
#c_df.dropna(subset=['number of open complaints'],inplace=True)

In [16]:
c_df['number of open complaints'] = get_between_slash_and_join(c_df,'number of open complaints')
print_unique(c_df,'number of open complaints')

array([ 0.,  2.,  1.,  3.,  5.,  4., nan])

In [19]:
c_df['customer lifetime value'].value_counts()

16468.220790    6
5246.278375     6
22332.439460    6
4270.034394     6
5107.163002     6
               ..
7477.176362     1
15700.284360    1
2968.077571     1
5452.171237     1
2611.836866     1
Name: customer lifetime value, Length: 8211, dtype: int64

In [59]:
c_df['customer lifetime value'] = strip_char(c_df,'customer lifetime value','%')

In [60]:
c_df['customer lifetime value'].sort_values()

3973    1898.0
146     1899.0
707     1904.0
5019    1918.0
4647    1941.0
         ...  
4007       NaN
325        NaN
372        NaN
580        NaN
861        NaN
Name: customer lifetime value, Length: 12074, dtype: float64

In [62]:
c_df.shape[0]
c_df = c_df.drop_duplicates() # remove duplicates

8713

In [18]:
c_df.education = make_lower(c_df, 'education')
c_df.education.unique()

array(['master', 'bachelor', 'high school or below', 'college',
       'bachelors', 'doctor', nan], dtype=object)

In [19]:
c_df.education.replace({'bachelors':'bachelor'},inplace=True)
c_df.education.unique()

array(['master', 'bachelor', 'high school or below', 'college', 'doctor',
       nan], dtype=object)

In [20]:
print(c_df.dropna.__doc__)


        Remove missing values.

        See the :ref:`User Guide <missing_data>` for more on which values are
        considered missing, and how to work with missing data.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine if rows or columns which contain missing values are
            removed.

            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing value.

            .. versionchanged:: 1.0.0

               Pass tuple or list to drop on multiple axes.
               Only a single axis is allowed.

        how : {'any', 'all'}, default 'any'
            Determine if row or column is removed from DataFrame, when we have
            at least one NA or all NA.

            * 'any' : If any NA values are present, drop that row or column.
            * 'all' : If all values are NA, drop that row or column.

        thresh : int, optional
  

In [21]:
c_df.dropna(subset=['education','gender'],inplace=True)

In [22]:
c_df.education.unique()

array(['master', 'bachelor', 'high school or below', 'college', 'doctor'],
      dtype=object)

In [23]:
c_df[['vehicle class','policy type']].value_counts()

vehicle class  policy type   
Four-Door Car  Personal Auto     3269
Two-Door Car   Personal Auto     1335
SUV            Personal Auto     1247
Four-Door Car  Corporate Auto     969
Two-Door Car   Corporate Auto     395
SUV            Corporate Auto     379
Sports Car     Personal Auto      322
Four-Door Car  Special Auto       196
Luxury SUV     Personal Auto      136
Luxury Car     Personal Auto      112
Sports Car     Corporate Auto     103
SUV            Special Auto        71
Two-Door Car   Special Auto        68
Luxury Car     Corporate Auto      40
Luxury SUV     Corporate Auto      32
Sports Car     Special Auto        25
Luxury Car     Special Auto         7
Luxury SUV     Special Auto         6
dtype: int64

In [24]:
c_df['customer lifetime value'] = fill_nans_with_means(c_df,'customer lifetime value').apply(round)
c_df['total claim amount']      = fill_nans_with_means(c_df,'total claim amount').apply(round)
c_df['monthly premium auto']    = fill_nans_with_means(c_df,'monthly premium auto').apply(round)
c_df['income']                  = fill_nans_with_means(c_df,'income').apply(round)

In [25]:
c_df.income = c_df.income.replace(0, c_df.income.mean()).round(0).astype(int)

In [42]:
c_df.shape[0]

8706

In [44]:
c_df = c_df.drop_duplicates(inplace=True) # remove duplicates
c_df.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_df = c_df.drop_duplicates(inplace=True) # remove duplicates


AttributeError: 'NoneType' object has no attribute 'shape'

In [28]:
old_st     = ['California', 'Washington', 'Oregon', 'Arizona', 'Nevada']
new_st     = ['West', 'East', 'Northeast', 'Central', 'Central']
c_df['st'] = state(old_st,new_st)

In [29]:
cols_to_lower = ['st','gender','policy type', 'vehicle class', 'education']

In [30]:
for col in cols_to_lower:
    c_df.loc[:,col] = make_lower(c_df,col)

In [31]:
c_df.head(10)

Unnamed: 0,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,east,u,master,37707,37707,1000,0.0,personal auto,four-door car,3
1,central,f,bachelor,6980,37707,94,0.0,personal auto,four-door car,1131
2,central,f,bachelor,12887,48767,108,0.0,personal auto,two-door car,566
3,west,m,bachelor,7646,37707,106,0.0,corporate auto,suv,530
4,east,m,high school or below,5363,36357,68,0.0,personal auto,four-door car,17
5,northeast,f,bachelor,8256,62902,69,0.0,personal auto,two-door car,159
6,northeast,f,college,5381,55350,67,0.0,corporate auto,four-door car,322
7,central,m,master,7216,37707,101,0.0,corporate auto,four-door car,363
8,northeast,m,bachelor,24128,14072,71,0.0,corporate auto,four-door car,511
9,northeast,f,college,7388,28812,93,0.0,special auto,four-door car,426


In [32]:
col_rename(c_df,{'st':'state'})

In [33]:
c_df

Unnamed: 0,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,east,u,master,37707,37707,1000,0.0,personal auto,four-door car,3
1,central,f,bachelor,6980,37707,94,0.0,personal auto,four-door car,1131
2,central,f,bachelor,12887,48767,108,0.0,personal auto,two-door car,566
3,west,m,bachelor,7646,37707,106,0.0,corporate auto,suv,530
4,east,m,high school or below,5363,36357,68,0.0,personal auto,four-door car,17
...,...,...,...,...,...,...,...,...,...,...
7064,west,f,college,4100,47761,104,0.0,personal auto,four-door car,541
7065,west,m,bachelor,23406,71941,73,0.0,personal auto,four-door car,198
7067,west,m,bachelor,8164,37707,85,3.0,corporate auto,four-door car,791
7068,west,m,college,7524,21941,96,0.0,personal auto,four-door car,691


In [34]:
new_col_rules = dict(zip(list(c_df.columns), list(c_df.columns.str.replace('\s+','_', regex=True))))
new_col_rules

{'state': 'state',
 'gender': 'gender',
 'education': 'education',
 'customer lifetime value': 'customer_lifetime_value',
 'income': 'income',
 'monthly premium auto': 'monthly_premium_auto',
 'number of open complaints': 'number_of_open_complaints',
 'policy type': 'policy_type',
 'vehicle class': 'vehicle_class',
 'total claim amount': 'total_claim_amount'}

In [35]:
col_rename(c_df,new_col_rules)

In [36]:
c_df.policy_type

0        personal auto
1        personal auto
2        personal auto
3       corporate auto
4        personal auto
             ...      
7064     personal auto
7065     personal auto
7067    corporate auto
7068     personal auto
7069    corporate auto
Name: policy_type, Length: 8706, dtype: object

In [37]:
c_df.to_csv('Data/customer_analysis_data_cleaned.csv',index_label=False)

In [38]:
print(c_df.to_csv.__doc__)


Write object to a comma-separated values (csv) file.

Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
    String, path object (implementing os.PathLike[str]), or file-like
    object implementing a write() function. If None, the result is
    returned as a string. If a non-binary file object is passed, it should
    be opened with `newline=''`, disabling universal newlines. If a binary
    file object is passed, `mode` might need to contain a `'b'`.

    .. versionchanged:: 1.2.0

       Support for binary file objects was introduced.

sep : str, default ','
    String of length 1. Field delimiter for the output file.
na_rep : str, default ''
    Missing data representation.
float_format : str, default None
    Format string for floating point numbers.
columns : sequence, optional
    Columns to write.
header : bool or list of str, default True
    Write out the column names. If a list of strings is given it is
    assumed to be aliases fo

In [39]:
file4 = pd.read_csv('Data/Data_Marketing_Customer_Analysis_Round2.csv')

In [40]:
file4.head()

Unnamed: 0.1,Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Vehicle Type
0,0,DK49336,Arizona,4809.21696,No,Basic,College,2/18/11,Employed,M,...,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.8,Four-Door Car,Medsize,
1,1,KX64629,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,...,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,2,LZ68649,Washington,14947.9173,No,Basic,Bachelor,2/10/11,Employed,M,...,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.0,SUV,Medsize,A
3,3,XL78013,Oregon,22332.43946,Yes,Extended,College,1/11/11,Employed,M,...,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,...,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,


In [41]:
c_df['customer lifetime value']

KeyError: 'customer lifetime value'

In [None]:
dates = list(print_unique(file4,'Effective To Date'))
date_ps = pd.Series(dates)
date_ps_mon = list(set([x[0] for x in date_ps.str.split('/')]))
date_ps_mon.sort()
date_ps_mon

In [None]:
months = ['January','February']

In [None]:
dates_dict = dict(zip(date_ps_mon,months))
dates_dict

In [None]:
file4['Effective To Date'] = [x[0] for x in file4['Effective To Date'].str.split('/')]
file4['Effective To Date'] = record_str_replace(file4, 'Effective To Date', dates_dict)
file4['Effective To Date']


In [None]:
c_df['income'].mean