In [764]:
import numpy as np
import pandas as pd
import math
import statistics as st
from scipy.stats import zscore

In [765]:
def convert_cols_to_lower(df):
    df.columns = [ i.lower() for i in df.columns ]
    return df

In [766]:
column_lst = sorted(['Customer', 'State', 'Gender', 'Education', 'Customer Lifetime Value',
       'Income', 'Monthly Premium Auto', 'Number of Open Complaints',
       'Policy Type', 'Vehicle Class', 'Total Claim Amount'])

def load_original_data():
    file1 = pd.read_csv('Data/file1.csv')
    file2 = pd.read_csv('Data/file2.csv')
    file3 = pd.read_csv('Data/file3.csv')
    
    convert_cols_to_lower(file1)
    convert_cols_to_lower(file2)
    convert_cols_to_lower(file3)
    
    file1 = file1.sort_index(axis=1)
    file2 = file2.sort_index(axis=1)
    file3 = file3.sort_index(axis=1)
    
    file1.columns = column_lst
    file2.columns = column_lst
    file3.columns = file3.columns.str.replace("state", "st")
    file3.columns = column_lst
    
    df = pd.concat([file1, file2, file3], axis=0)
    
    return df

In [767]:
#save this only for late, might be useful, here it doesn't work
def replace_str_in_columns(df, str_to_replace, str_to_replace_with):
    df.columns = sorted(df.columns.str.replace(str_to_replace, str_to_replace_with))
    return df

In [768]:
#load the full dataset, correctly concatenated and in the desired order
full_dataset = load_original_data()

In [769]:
full_dataset.columns = [ i for i in full_dataset.columns.str.replace(" ","_").str.lower()]

In [770]:
full_dataset["state"].unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',
       'AZ', 'WA', nan], dtype=object)

In [771]:
#drop Customer column
full_dataset = full_dataset.drop("customer", axis=1)

In [772]:
#remove duplicates
full_dataset = full_dataset.drop_duplicates()

In [773]:
#remove rows with NaN elements in the column Customer Lifetime Value
#full_dataset = full_dataset.dropna(subset="Customer Lifetime Value")

In [774]:
#reset the index to match the amout of rows
full_dataset = full_dataset.reset_index()

In [775]:
#remove "%" signs from Customer Lifetime Value where present
full_dataset["customer_lifetime_value"] = (
    full_dataset["customer_lifetime_value"].apply(lambda x: float(x.replace("%", " "))/100 if type(x) == str else x))

In [776]:
#convert Customer Lifetime Value into integer by first converting it into float
full_dataset["customer_lifetime_value"] = pd.to_numeric(full_dataset["customer_lifetime_value"], errors = "coerce")
full_dataset["customer_lifetime_value"] = full_dataset["customer_lifetime_value"].apply(lambda x: math.trunc(x) if np.isnan(x) != True else 0)

In [777]:
#full_dataset

In [778]:
#function that checks if input is a string and splits it at "/"
#returning the 2nd element of the resulting list or returns x if it's not a string
def split(x):
    if isinstance(x, str):
        return x.split("/")[1]
    return x

In [779]:
full_dataset["number_of_open_complaints"] = full_dataset["number_of_open_complaints"].apply(lambda x: split(x))
full_dataset["number_of_open_complaints"] = pd.to_numeric(full_dataset["number_of_open_complaints"], errors = "coerce")

In [780]:
full_dataset.loc[full_dataset["number_of_open_complaints"].isna()]

Unnamed: 0,index,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
1070,1071,0,,,,,,,,,


In [781]:
full_dataset = full_dataset.drop([1071])

In [782]:
def clean_gender(string):
    m_lst = ['M', 'Male']
    f_lst = ['F', 'Femal', 'female']
    if isinstance(string, str) and string in m_lst:
        return "male"
    elif isinstance(string, str) and string in f_lst:
        return "female"
    else:
        return "error"

In [783]:
full_dataset['gender'] = list(map(clean_gender, full_dataset['gender']))

In [784]:
full_dataset.tail()
#full_dataset.info()

Unnamed: 0,index,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
8877,7065,23405,Bachelor,male,71941.0,73.0,0.0,Personal Auto,California,198.234764,Four-Door Car
8878,7066,3096,College,female,21604.0,79.0,0.0,Corporate Auto,California,379.2,Four-Door Car
8879,7067,8163,Bachelor,male,0.0,85.0,3.0,Corporate Auto,California,790.784983,Four-Door Car
8880,7068,7524,College,male,21941.0,96.0,0.0,Personal Auto,California,691.2,Four-Door Car
8881,7069,2611,College,male,0.0,77.0,0.0,Corporate Auto,California,369.6,Two-Door Car


## Day 2

In [785]:
#replace 0s with np.nan
full_dataset["income"] = full_dataset["income"].replace(to_replace=0, value=np.nan)

In [786]:
#fill mean of column to elements with NaN
full_dataset["income"] = full_dataset["income"].replace(to_replace=np.nan, value=full_dataset["income"].mean())

In [787]:
#replace 0s with np.nan
full_dataset["monthly_premium_auto"] = full_dataset["monthly_premium_auto"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["monthly_premium_auto"] = (
    full_dataset["monthly_premium_auto"].replace(to_replace=np.nan, value=full_dataset["monthly_premium_auto"].mean()))

In [788]:
#replace 0s with np.nan
full_dataset["total_claim_amount"] = full_dataset["total_claim_amount"].replace(to_replace=0, value=np.nan)
#fill mean of column to elements with NaN
full_dataset["total_claim_amount"] = (
    full_dataset["total_claim_amount"].replace(to_replace=np.nan, value=full_dataset["total_claim_amount"].mean()))

Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central

In [789]:
region_lst = ['California', 'Cali', 'Oregon', "Washington", 'WA', "Arizona", 'AZ', "Nevada"]
def bucket(string, lst):
    if string == (lst[0] or lst[1]):
        return "west_region"
    elif string == lst[2]:
        return "north_west"
    elif string == (lst[3] or lst[4]):
        return "east"
    elif string == (lst[5] or lst[6] or lst[7]):
        return "central"
    else:
        return np.nan

In [790]:
full_dataset["state"] = full_dataset["state"].apply(lambda x: bucket(x,region_lst))

In [791]:
full_dataset = full_dataset.drop("index", axis=1)

In [792]:
full_dataset.tail()

Unnamed: 0,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
8877,23405,Bachelor,male,71941.0,73.0,0.0,Personal Auto,west_region,198.234764,Four-Door Car
8878,3096,College,female,21604.0,79.0,0.0,Corporate Auto,west_region,379.2,Four-Door Car
8879,8163,Bachelor,male,50704.717802,85.0,3.0,Corporate Auto,west_region,790.784983,Four-Door Car
8880,7524,College,male,21941.0,96.0,0.0,Personal Auto,west_region,691.2,Four-Door Car
8881,2611,College,male,50704.717802,77.0,0.0,Corporate Auto,west_region,369.6,Two-Door Car


In [793]:
full_dataset["education"] = full_dataset["education"].str.lower()
full_dataset["policy_type"] = full_dataset["policy_type"].str.lower()
full_dataset["vehicle_class"] = full_dataset["vehicle_class"].str.lower()

In [794]:
full_dataset["vehicle_class"].isna().sum()

1

In [795]:
full_dataset["state"].unique()

array(['east', 'central', nan, 'west_region', 'north_west'], dtype=object)

In [796]:
replace_dict = {"luxury suv": "luxury vehicle" , "luxury car" : "luxury vehicle"}
full_dataset["vehicle_class"] = full_dataset["vehicle_class"].replace(replace_dict)

In [797]:
#create index of all columns with numerical data
numeric_cols = full_dataset.select_dtypes(include=[np.number]).columns
#delete the ones that dont make any sense to remove the outliers from
numeric_cols = numeric_cols.drop(["number_of_open_complaints"])

In [806]:
full_dataset

Unnamed: 0,customer_lifetime_value,education,gender,income,monthly_premium_auto,number_of_open_complaints,policy_type,state,total_claim_amount,vehicle_class
0,0,master,error,50704.717802,1000.0,0.0,personal auto,east,2.704934,four-door car
1,6979,bachelor,female,50704.717802,94.0,0.0,personal auto,central,1131.464935,four-door car
2,12887,bachelor,female,48767.000000,108.0,0.0,personal auto,,566.472247,two-door car
3,7645,bachelor,male,50704.717802,106.0,0.0,corporate auto,west_region,529.881344,suv
4,5363,high school or below,male,36357.000000,68.0,0.0,personal auto,east,17.269323,four-door car
...,...,...,...,...,...,...,...,...,...,...
8877,23405,bachelor,male,71941.000000,73.0,0.0,personal auto,west_region,198.234764,four-door car
8878,3096,college,female,21604.000000,79.0,0.0,corporate auto,west_region,379.200000,four-door car
8879,8163,bachelor,male,50704.717802,85.0,3.0,corporate auto,west_region,790.784983,four-door car
8880,7524,college,male,21941.000000,96.0,0.0,personal auto,west_region,691.200000,four-door car


In [799]:
#create a dataframe only containing numerical data, based on the index numeric_cols and then apply the zscore
numeric_data = full_dataset[numeric_cols]
z = full_dataset[numeric_cols].apply(zscore)
threshold = 3

In [800]:
# filter the dataframe to remove the outliers
full_dataset = full_dataset[(z < threshold).all(axis=1)]

In [801]:
numeric_data

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,total_claim_amount
0,0,50704.717802,1000.0,2.704934
1,6979,50704.717802,94.0,1131.464935
2,12887,48767.000000,108.0,566.472247
3,7645,50704.717802,106.0,529.881344
4,5363,36357.000000,68.0,17.269323
...,...,...,...,...
8877,23405,71941.000000,73.0,198.234764
8878,3096,21604.000000,79.0,379.200000
8879,8163,50704.717802,85.0,790.784983
8880,7524,21941.000000,96.0,691.200000


In [802]:
len(full_dataset)

8521

In [803]:
numeric_data.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            8872, 8873, 8874, 8875, 8876, 8877, 8878, 8879, 8880, 8881],
           dtype='int64', length=8881)

In [804]:
full_dataset = full_dataset.loc[numeric_data.index].reset_index()

KeyError: '[44, 50, 51, 79, 114, 125, 196, 232, 248, 266, 303, 315, 354, 373, 375, 385, 404, 407, 408, 418, 435, 496, 512, 522, 541, 581, 626, 681, 741, 747, 810, 817, 820, 832, 836, 856, 869, 873, 888, 944, 1001, 1007, 1045, 1078, 1086, 1120, 1126, 1184, 1201, 1233, 1291, 1310, 1323, 1328, 1333, 1379, 1381, 1389, 1463, 1500, 1512, 1517, 1530, 1537, 1548, 1554, 1562, 1571, 1621, 1652, 1671, 1681, 1791, 1807, 1818, 1853, 1856, 1875, 1879, 1974, 2095, 2107, 2127, 2181, 2182, 2199, 2216, 2285, 2294, 2301, 2479, 2495, 2531, 2541, 2591, 2642, 2699, 2708, 2765, 2776, 2883, 2896, 2903, 3014, 3060, 3130, 3146, 3186, 3193, 3200, 3308, 3310, 3403, 3434, 3438, 3442, 3445, 3446, 3455, 3534, 3543, 3558, 3572, 3584, 3604, 3610, 3661, 3666, 3668, 3671, 3712, 3716, 3734, 3737, 3792, 3822, 3828, 3908, 3958, 4016, 4026, 4044, 4048, 4055, 4095, 4097, 4161, 4262, 4293, 4303, 4304, 4312, 4331, 4349, 4386, 4401, 4402, 4437, 4442, 4457, 4524, 4540, 4586, 4632, 4671, 4697, 4730, 4752, 4760, 4785, 4811, 4835, 4845, 4862, 4876, 4889, 4907, 4914, 4977, 4998, 5002, 5013, 5033, 5048, 5071, 5131, 5132, 5147, 5150, 5204, 5211, 5272, 5275, 5328, 5407, 5418, 5473, 5511, 5519, 5541, 5547, 5562, 5564, 5574, 5576, 5616, 5632, 5638, 5641, 5644, 5679, 5681, 5682, 5687, 5688, 5715, 5734, 5743, 5757, 5793, 5814, 5840, 5850, 5893, 5912, 5919, 6005, 6008, 6009, 6059, 6076, 6078, 6129, 6134, 6151, 6152, 6154, 6162, 6181, 6190, 6215, 6227, 6345, 6351, 6365, 6405, 6442, 6455, 6469, 6481, 6484, 6490, 6491, 6538, 6573, 6670, 6673, 6693, 6706, 6734, 6736, 6747, 6759, 6778, 6794, 6847, 6852, 6872, 6965, 6980, 6990, 6998, 7002, 7033, 7079, 7105, 7136, 7142, 7154, 7155, 7166, 7177, 7206, 7213, 7230, 7302, 7303, 7400, 7425, 7455, 7505, 7551, 7556, 7559, 7606, 7641, 7664, 7698, 7767, 7800, 7860, 7920, 7937, 7938, 7973, 8002, 8016, 8021, 8032, 8062, 8075, 8080, 8129, 8136, 8149, 8155, 8162, 8198, 8207, 8240, 8242, 8259, 8268, 8287, 8299, 8313, 8338, 8366, 8378, 8419, 8432, 8438, 8458, 8461, 8468, 8475, 8476, 8503, 8574, 8585, 8588, 8612, 8613, 8614, 8617, 8658, 8662, 8670, 8671, 8682, 8701, 8716, 8724, 8787, 8801, 8814, 8847, 8852, 8859, 8873] not in index'

In [None]:
full_dataset.shape