In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import math
from scipy.stats import boxcox
pd.options.display.max_rows = 100

In [2]:
def convert_cols_to_lower(df):
    df.columns = [ i.lower() for i in df.columns ]
    return df

In [47]:
column_lst = sorted(['Customer', 'State', 'Gender', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Vehicle Class', 'Total Claim Amount'])

def load_original_data():
    file1 = pd.read_csv('Data/file1.csv')
    file2 = pd.read_csv('Data/file2.csv')
    file3 = pd.read_csv('Data/file3.csv')
    
    convert_cols_to_lower(file1)
    convert_cols_to_lower(file2)
    convert_cols_to_lower(file3)
    
    file1 = file1.sort_index(axis=1)
    file2 = file2.sort_index(axis=1)
    file3 = file3.sort_index(axis=1)
    
    file1.columns = column_lst
    file2.columns = column_lst
    file3.columns = file3.columns.str.replace("state", "st")
    file3.columns = column_lst
    
    df = pd.concat([file1, file2, file3], axis=0)
    
    return df

In [48]:
#save this only for late, might be useful, here it doesn't work
def replace_str_in_columns(df, str_to_replace, str_to_replace_with):
    df.columns = sorted(df.columns.str.replace(str_to_replace, str_to_replace_with))
    return df

In [125]:
#load the full dataset, correctly concatenated and in the desired order
full_dataset = load_original_data()

In [126]:
#drop Customer column
full_dataset = full_dataset.drop("Customer", axis=1)

In [127]:
#remove duplicates
full_dataset = full_dataset.drop_duplicates()

In [128]:
#remove rows with NaN elements in the column Customer Lifetime Value
#full_dataset = full_dataset.dropna(subset="Customer Lifetime Value")

In [129]:
#reset the index to match the amout of rows
full_dataset = full_dataset.reset_index()

In [130]:
#remove "%" signs from Customer Lifetime Value where present
full_dataset["Customer Lifetime Value"] = (
    full_dataset["Customer Lifetime Value"].apply(lambda x: x.replace("%", " ") if type(x) == str else x))

In [131]:
#convert Customer Lifetime Value into integer by first converting it into float
full_dataset["Customer Lifetime Value"] = pd.to_numeric(full_dataset["Customer Lifetime Value"], errors = "coerce")
full_dataset["Customer Lifetime Value"] = full_dataset["Customer Lifetime Value"].apply(lambda x: math.trunc(x) if np.isnan(x) != True else 0)


#full_dataset["Customer Lifetime Value"] = full_dataset["Customer Lifetime Value"].astype('float64')
#full_dataset["Customer Lifetime Value"] = full_dataset["Customer Lifetime Value"].astype('int64')

In [132]:
#full_dataset

In [133]:
#function that checks if input is a string and splits it at "/"
#returning the 2nd element of the resulting list or returns x if it's not a string
def split(x):
    if isinstance(x, str):
        return x.split("/")[1]
    return x

In [134]:
full_dataset["Number of Open Complaints"] = full_dataset["Number of Open Complaints"].apply(lambda x: split(x))
full_dataset["Number of Open Complaints"] = pd.to_numeric(full_dataset["Number of Open Complaints"], errors = "coerce")
#full_dataset["Number of Open Complaints"] = full_dataset["Number of Open Complaints"].fillna(0).astype("int64")


In [148]:
full_dataset.loc[full_dataset["Number of Open Complaints"].isna()]

Unnamed: 0,index,Customer Lifetime Value,Education,Gender,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,State,Total Claim Amount,Vehicle Class
1070,1071,0,,,,,,,,,


In [139]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8882 entries, 0 to 8881
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      8882 non-null   int64  
 1   Customer Lifetime Value    8882 non-null   int64  
 2   Education                  8881 non-null   object 
 3   Gender                     8759 non-null   object 
 4   Income                     8881 non-null   float64
 5   Monthly Premium Auto       8881 non-null   float64
 6   Number of Open Complaints  8881 non-null   float64
 7   Policy Type                8881 non-null   object 
 8   State                      8881 non-null   object 
 9   Total Claim Amount         8881 non-null   float64
 10  Vehicle Class              8881 non-null   object 
dtypes: float64(4), int64(2), object(5)
memory usage: 763.4+ KB


In [208]:
full_dataset = full_dataset.drop([1071])

In [209]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8881 entries, 0 to 8881
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      8881 non-null   int64  
 1   Customer Lifetime Value    8881 non-null   int64  
 2   Education                  8880 non-null   object 
 3   Gender                     8881 non-null   object 
 4   Income                     8880 non-null   float64
 5   Monthly Premium Auto       8880 non-null   float64
 6   Number of Open Complaints  8880 non-null   float64
 7   Policy Type                8880 non-null   object 
 8   State                      8880 non-null   object 
 9   Total Claim Amount         8880 non-null   float64
 10  Vehicle Class              8880 non-null   object 
dtypes: float64(4), int64(2), object(5)
memory usage: 832.6+ KB


In [162]:
def clean_gender(string):
    m_lst = ['M', 'Male']
    f_lst = ['F', 'Femal', 'female']
    if isinstance(string, str) and string in m_lst:
        return "male"
    elif isinstance(string, str) and string in f_lst:
        return "female"
    else:
        return "error"

In [163]:
full_dataset['Gender'] = list(map(clean_gender, full_dataset['Gender']))

In [173]:
full_dataset
#full_dataset.info()

Unnamed: 0,index,Customer Lifetime Value,Education,Gender,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,State,Total Claim Amount,Vehicle Class
0,0,0,Master,error,0.0,1000.0,0.0,Personal Auto,Washington,2.704934,Four-Door Car
1,1,697953,Bachelor,female,0.0,94.0,0.0,Personal Auto,Arizona,1131.464935,Four-Door Car
2,2,1288743,Bachelor,female,48767.0,108.0,0.0,Personal Auto,Nevada,566.472247,Two-Door Car
3,3,764586,Bachelor,male,0.0,106.0,0.0,Corporate Auto,California,529.881344,SUV
4,4,536307,High School or Below,male,36357.0,68.0,0.0,Personal Auto,Washington,17.269323,Four-Door Car
...,...,...,...,...,...,...,...,...,...,...,...
8877,7065,23405,Bachelor,male,71941.0,73.0,0.0,Personal Auto,California,198.234764,Four-Door Car
8878,7066,3096,College,female,21604.0,79.0,0.0,Corporate Auto,California,379.200000,Four-Door Car
8879,7067,8163,Bachelor,male,0.0,85.0,3.0,Corporate Auto,California,790.784983,Four-Door Car
8880,7068,7524,College,male,21941.0,96.0,0.0,Personal Auto,California,691.200000,Four-Door Car


## Day 2

In [215]:
#replace 0s with np.nan
#full_dataset["Income"] = full_dataset["Income"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["Income"] = full_dataset["Income"].replace(to_replace=np.nan, value=full_dataset["Income"].mean())

In [217]:
#replace 0s with np.nan
full_dataset["Monthly Premium Auto"] = full_dataset["Monthly Premium Auto"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["Monthly Premium Auto"] = (
    full_dataset["Monthly Premium Auto"].replace(to_replace=np.nan, value=full_dataset["Monthly Premium Auto"].mean()))

In [219]:
#replace 0s with np.nan
full_dataset["Total Claim Amount"] = full_dataset["Total Claim Amount"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["Total Claim Amount"] = (
    full_dataset["Total Claim Amount"].replace(to_replace=np.nan, value=full_dataset["Total Claim Amount"].mean()))

Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central

In [222]:
region_lst = ['California', 'Oregon', "Washington", "Arizona", "Nevada"]
def bucket(string, lst):
    if string == lst[0]:
        return "West Region"
    elif string == lst[1]:
        return "North West"
    elif string == lst[2]:
        return "East"
    elif string == (lst[3] or lst[4]):
        return "Central"

In [223]:
full_dataset["State"] = full_dataset["State"].apply(lambda x: bucket(x,region_lst))

In [224]:
full_dataset

Unnamed: 0,index,Customer Lifetime Value,Education,Gender,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,State,Total Claim Amount,Vehicle Class
0,0,0,Master,error,50704.717802,1000.0,0.0,Personal Auto,East,2.704934,Four-Door Car
1,1,697953,Bachelor,female,50704.717802,94.0,0.0,Personal Auto,Central,1131.464935,Four-Door Car
2,2,1288743,Bachelor,female,48767.000000,108.0,0.0,Personal Auto,,566.472247,Two-Door Car
3,3,764586,Bachelor,male,50704.717802,106.0,0.0,Corporate Auto,West Region,529.881344,SUV
4,4,536307,High School or Below,male,36357.000000,68.0,0.0,Personal Auto,East,17.269323,Four-Door Car
...,...,...,...,...,...,...,...,...,...,...,...
8877,7065,23405,Bachelor,male,71941.000000,73.0,0.0,Personal Auto,West Region,198.234764,Four-Door Car
8878,7066,3096,College,female,21604.000000,79.0,0.0,Corporate Auto,West Region,379.200000,Four-Door Car
8879,7067,8163,Bachelor,male,50704.717802,85.0,3.0,Corporate Auto,West Region,790.784983,Four-Door Car
8880,7068,7524,College,male,21941.000000,96.0,0.0,Personal Auto,West Region,691.200000,Four-Door Car
