In [340]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
# Let pandas show up to 100 rows before it truncates displayed output.
pd.options.display.max_rows = 100

In [341]:
def convert_cols_to_lower(df):
    """Lower-case every column label of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten. Each label must provide a
        ``lower()`` method (i.e. be a string).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now with lower-cased labels.
    """
    lowered = []
    for label in df.columns:
        lowered.append(label.lower())
    df.columns = lowered
    return df

In [342]:
# Canonical column names, kept in alphabetical order because
# load_original_data() assigns them positionally to alphabetically sorted columns.
column_lst = sorted(['Customer', 'State', 'Gender', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Vehicle Class', 'Total Claim Amount'])

def load_original_data():
    """Read the three source CSV files and stack them into one DataFrame.

    Each file goes through the same steps: its headers are lower-cased, its
    columns are sorted alphabetically, and the sorted columns are then renamed
    positionally to ``column_lst``.

    The positional rename assumes that every file's sorted headers line up
    one-to-one with ``column_lst``. pandas raises ``ValueError`` if the number
    of columns differs, but a file whose headers sort into a different order
    would be mislabelled silently.
    NOTE(review): presumably the files differ only in header case and in
    spelling the state column "st" vs "state" -- confirm against the CSVs.

    Returns
    -------
    pandas.DataFrame
        Rows of file1, file2 and file3 in that order, with columns
        ``column_lst``. The per-file row index is kept, so index labels repeat
        across files.

    Raises
    ------
    FileNotFoundError
        If one of the ``Data/file*.csv`` paths (relative to the working
        directory) does not exist.
    """
    frames = []
    for path in ('Data/file1.csv', 'Data/file2.csv', 'Data/file3.csv'):
        frame = pd.read_csv(path)
        convert_cols_to_lower(frame)
        # Sort the columns themselves so data and labels move together ...
        frame = frame.sort_index(axis=1)
        # ... then swap in the canonical names by position.
        frame.columns = column_lst
        frames.append(frame)

    return pd.concat(frames, axis=0)

In [343]:
# Kept for later use. Sorting has to move the data together with the labels:
# assigning sorted(labels) alone would attach names to the wrong columns.
def replace_str_in_columns(df, str_to_replace, str_to_replace_with):
    """Replace a substring in every column label of ``df`` and sort the columns.

    The frame is modified in place: labels are rewritten first, then the
    columns (labels and their data together) are reordered alphabetically.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its column labels must be strings.
    str_to_replace : str
        Text to look for in each label. It is matched literally, not as a
        regular expression.
    str_to_replace_with : str
        Replacement text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # regex=False pins literal matching; the default differs between pandas versions.
    df.columns = df.columns.str.replace(str_to_replace, str_to_replace_with, regex=False)
    # In-place sort keeps the "mutate and return the same object" contract.
    df.sort_index(axis=1, inplace=True)
    return df

In [344]:
# Load the three source CSVs as one frame: rows from all files stacked,
# columns renamed to the canonical names in column_lst order.
full_dataset = load_original_data()

In [345]:
# The Customer column is not needed from here on; remove it.
full_dataset = full_dataset.drop(columns="Customer")

In [346]:
# Keep only the first occurrence of each fully identical row.
full_dataset = full_dataset[~full_dataset.duplicated(keep="first")]

In [347]:
# Drop rows that have no Customer Lifetime Value. The column is passed as a
# list because only pandas 1.5+ accepts a bare label for `subset`.
full_dataset = full_dataset.dropna(subset=["Customer Lifetime Value"])

In [348]:
# Renumber the rows 0..n-1 after the row drops above.
# NOTE(review): without drop=True the previous index is kept as a new "index"
# column (it appears in the info() output below) -- confirm that is intended.
full_dataset = full_dataset.reset_index()

In [349]:
# Sanity check: selecting a single column yields a pandas Series.
type(full_dataset["Customer Lifetime Value"])

pandas.core.series.Series

In [350]:
# Replace the percent sign in string entries with a space; entries that are
# not plain strings pass through untouched.
full_dataset["Customer Lifetime Value"] = full_dataset["Customer Lifetime Value"].apply(
    lambda raw: raw if type(raw) != str else raw.replace("%", " ")
)

In [354]:
# Inspect dtypes and non-null counts before converting Customer Lifetime Value.
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8874 entries, 0 to 8873
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      8874 non-null   int64  
 1   Customer Lifetime Value    8874 non-null   object 
 2   Education                  8874 non-null   object 
 3   Gender                     8754 non-null   object 
 4   Income                     8874 non-null   float64
 5   Monthly Premium Auto       8874 non-null   float64
 6   Number of Open Complaints  8874 non-null   object 
 7   Policy Type                8874 non-null   object 
 8   State                      8874 non-null   object 
 9   Total Claim Amount         8874 non-null   float64
 10  Vehicle Class              8874 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 762.7+ KB


In [359]:
# Parse Customer Lifetime Value as float64, then cast to int64. The integer
# cast discards the fractional part.
full_dataset["Customer Lifetime Value"] = (
    full_dataset["Customer Lifetime Value"]
    .astype('float64')
    .astype('int64')
)

In [360]:
# Confirm that Customer Lifetime Value is now stored as int64.
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8874 entries, 0 to 8873
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      8874 non-null   int64  
 1   Customer Lifetime Value    8874 non-null   int64  
 2   Education                  8874 non-null   object 
 3   Gender                     8754 non-null   object 
 4   Income                     8874 non-null   float64
 5   Monthly Premium Auto       8874 non-null   float64
 6   Number of Open Complaints  8874 non-null   object 
 7   Policy Type                8874 non-null   object 
 8   State                      8874 non-null   object 
 9   Total Claim Amount         8874 non-null   float64
 10  Vehicle Class              8874 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 762.7+ KB


In [None]:
# Split every string entry of Number of Open Complaints on "/" into a list of
# parts. The column has dtype object, so it may also hold non-string values;
# those are passed through unchanged instead of raising AttributeError.
split_complaints = full_dataset["Number of Open Complaints"].apply(
    lambda x: x.split("/") if isinstance(x, str) else x
)
split_complaints