In [673]:
import numpy as np
import pandas as pd
import math
import statistics as st
from scipy.stats import zscore

In [674]:
def convert_cols_to_lower(df):
    df.columns = [ i.lower() for i in df.columns ]
    return df

In [675]:
column_lst = sorted(['Customer', 'State', 'Gender', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Vehicle Class', 'Total Claim Amount'])

def load_original_data():
    file1 = pd.read_csv('Data/file1.csv')
    file2 = pd.read_csv('Data/file2.csv')
    file3 = pd.read_csv('Data/file3.csv')
    
    convert_cols_to_lower(file1)
    convert_cols_to_lower(file2)
    convert_cols_to_lower(file3)
    
    file1 = file1.sort_index(axis=1)
    file2 = file2.sort_index(axis=1)
    file3 = file3.sort_index(axis=1)
    
    file1.columns = column_lst
    file2.columns = column_lst
    file3.columns = file3.columns.str.replace("state", "st")
    file3.columns = column_lst
    
    df = pd.concat([file1, file2, file3], axis=0)
    
    return df

In [676]:
#save this only for late, might be useful, here it doesn't work
def replace_str_in_columns(df, str_to_replace, str_to_replace_with):
    df.columns = sorted(df.columns.str.replace(str_to_replace, str_to_replace_with))
    return df

In [677]:
#load the full dataset, correctly concatenated and in the desired order
full_dataset = load_original_data()

In [678]:
full_dataset.columns = [ i for i in full_dataset.columns.str.replace(" ","_").str.lower()]

In [679]:
full_dataset["state"].unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',
       'AZ', 'WA', nan], dtype=object)

In [680]:
#drop Customer column
full_dataset = full_dataset.drop("customer", axis=1)

In [681]:
#remove duplicates
full_dataset = full_dataset.drop_duplicates()

In [682]:
#remove rows with NaN elements in the column Customer Lifetime Value
#full_dataset = full_dataset.dropna(subset="Customer Lifetime Value")

In [683]:
#reset the index to match the amout of rows
full_dataset = full_dataset.reset_index()

In [684]:
#remove "%" signs from Customer Lifetime Value where present
full_dataset["customer_lifetime_value"] = (
    full_dataset["customer_lifetime_value"].apply(lambda x: x.replace("%", " ") if type(x) == str else x))

In [685]:
#convert Customer Lifetime Value into integer by first converting it into float
full_dataset["customer_lifetime_value"] = pd.to_numeric(full_dataset["customer_lifetime_value"], errors = "coerce")
full_dataset["customer_lifetime_value"] = full_dataset["customer_lifetime_value"].apply(lambda x: math.trunc(x) if np.isnan(x) != True else 0)

In [686]:
#full_dataset

In [687]:
#function that checks if input is a string and splits it at "/"
#returning the 2nd element of the resulting list or returns x if it's not a string
def split(x):
    if isinstance(x, str):
        return x.split("/")[1]
    return x

In [688]:
full_dataset["number_of_open_complaints"] = full_dataset["number_of_open_complaints"].apply(lambda x: split(x))
full_dataset["number_of_open_complaints"] = pd.to_numeric(full_dataset["number_of_open_complaints"], errors = "coerce")

In [689]:
full_dataset.loc[full_dataset["number_of_open_complaints"].isna()]

Unnamed: 0,index,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
1070,1071,0,,,,,,,,,


In [691]:
full_dataset = full_dataset.drop([1071])

In [693]:
def clean_gender(string):
    m_lst = ['M', 'Male']
    f_lst = ['F', 'Femal', 'female']
    if isinstance(string, str) and string in m_lst:
        return "male"
    elif isinstance(string, str) and string in f_lst:
        return "female"
    else:
        return "error"

In [694]:
full_dataset['gender'] = list(map(clean_gender, full_dataset['gender']))

In [695]:
full_dataset.tail()
#full_dataset.info()

Unnamed: 0,index,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
8877,7065,23405,Bachelor,male,71941.0,73.0,0.0,Personal Auto,California,198.234764,Four-Door Car
8878,7066,3096,College,female,21604.0,79.0,0.0,Corporate Auto,California,379.2,Four-Door Car
8879,7067,8163,Bachelor,male,0.0,85.0,3.0,Corporate Auto,California,790.784983,Four-Door Car
8880,7068,7524,College,male,21941.0,96.0,0.0,Personal Auto,California,691.2,Four-Door Car
8881,7069,2611,College,male,0.0,77.0,0.0,Corporate Auto,California,369.6,Two-Door Car


## Day 2

In [696]:
#replace 0s with np.nan
full_dataset["income"] = full_dataset["income"].replace(to_replace=0, value=np.nan)

In [697]:
#fill mean of column to elements with NaN
full_dataset["income"] = full_dataset["income"].replace(to_replace=np.nan, value=full_dataset["income"].mean())

In [698]:
#replace 0s with np.nan
full_dataset["monthly_premium_auto"] = full_dataset["monthly_premium_auto"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["monthly_premium_auto"] = (
    full_dataset["monthly_premium_auto"].replace(to_replace=np.nan, value=full_dataset["monthly_premium_auto"].mean()))

In [699]:
#replace 0s with np.nan
full_dataset["total_claim_amount"] = full_dataset["total_claim_amount"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["total_claim_amount"] = (
    full_dataset["total_claim_amount"].replace(to_replace=np.nan, value=full_dataset["total_claim_amount"].mean()))

Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central

In [700]:
region_lst = ['California', 'Cali', 'Oregon', "Washington", 'WA', "Arizona", 'AZ', "Nevada"]
def bucket(string, lst):
    if string == (lst[0] or lst[1]):
        return "west_region"
    elif string == lst[2]:
        return "north_west"
    elif string == (lst[3] or lst[4]):
        return "east"
    elif string == (lst[5] or lst[6] or lst[7]):
        return "central"
    else:
        return np.nan

In [701]:
full_dataset["state"] = full_dataset["state"].apply(lambda x: bucket(x,region_lst))

In [702]:
full_dataset = full_dataset.drop("index", axis=1)

In [703]:
full_dataset.tail()

Unnamed: 0,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
8877,23405,Bachelor,male,71941.0,73.0,0.0,Personal Auto,west_region,198.234764,Four-Door Car
8878,3096,College,female,21604.0,79.0,0.0,Corporate Auto,west_region,379.2,Four-Door Car
8879,8163,Bachelor,male,50704.717802,85.0,3.0,Corporate Auto,west_region,790.784983,Four-Door Car
8880,7524,College,male,21941.0,96.0,0.0,Personal Auto,west_region,691.2,Four-Door Car
8881,2611,College,male,50704.717802,77.0,0.0,Corporate Auto,west_region,369.6,Two-Door Car


In [704]:
full_dataset["education"] = full_dataset["education"].str.lower()
full_dataset["policy_type"] = full_dataset["policy_type"].str.lower()
full_dataset["vehicle_class"] = full_dataset["vehicle_class"].str.lower()

In [705]:
full_dataset["vehicle_class"].isna().sum()

1

In [706]:
full_dataset["state"].unique()

array(['east', 'central', nan, 'west_region', 'north_west'], dtype=object)

In [707]:
replace_dict = {"luxury suv": "luxury vehicle" , "luxury car" : "luxury vehicle"}
full_dataset["vehicle_class"] = full_dataset["vehicle_class"].replace(replace_dict)

In [708]:
#create index of all columns with numerical data
numeric_cols = full_dataset.select_dtypes(include=[np.number]).columns
#delete the ones that dont make any sense to remove the outliers from
numeric_cols = numeric_cols.drop(["number_of_open_complaints"])

In [722]:
full_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8521 entries, 0 to 8881
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_lifetime_value    8521 non-null   int64  
 1   education                  8520 non-null   object 
 2   gender                     8521 non-null   object 
 3   income                     8521 non-null   float64
 4   monthly_premium_auto       8521 non-null   float64
 5   number_of_open_complaints  8520 non-null   float64
 6   policy_type                8520 non-null   object 
 7   state                      7486 non-null   object 
 8   total_claim_amount         8521 non-null   float64
 9   vehicle_class              8520 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 732.3+ KB


In [728]:
#create a dataframe only containing numerical data, based on the index numeric_cols and then apply the zscore
numeric_data = full_dataset[numeric_cols]
z = full_dataset[numeric_cols].apply(zscore)
threshold = 3

In [747]:
# filter the dataframe to remove the outliers
full_dataset = full_dataset[(z < threshold).all(axis=1)]

In [737]:
numeric_data

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
1,697953,50704.717802,94.0,1131.464935
3,764586,50704.717802,106.0,529.881344
4,536307,36357.000000,68.0,17.269323
5,825629,62902.000000,69.0,159.383042
6,538089,55350.000000,67.0,321.600000
...,...,...,...,...
8877,23405,71941.000000,73.0,198.234764
8878,3096,21604.000000,79.0,379.200000
8879,8163,50704.717802,85.0,790.784983
8880,7524,21941.000000,96.0,691.200000


In [744]:
len(full_dataset)

8521

In [750]:
numeric_data.index

Int64Index([   1,    3,    4,    5,    6,    7,    9,   10,   11,   12,
            ...
            8871, 8872, 8874, 8875, 8876, 8877, 8878, 8879, 8880, 8881],
           dtype='int64', length=8093)

In [758]:
full_dataset = full_dataset.loc[numeric_data.index].reset_index()

In [762]:
full_dataset.shape

(8093, 11)