In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import minmax_scale
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import PowerTransformer
# from sklearn.linear_model import LinearRegression
# import statsmodels.api as sm
# from scipy.stats import boxcox
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score
# from sklearn.metrics import mean_squared_error as mse
# from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
import openpyxl
import xlrd


In [36]:
def st_to_state(this_df):
    """checks, if there is a column named ST and changes it to state"""
    if "ST" in list(this_df.columns) or "State" in list(this_df.columns):
        this_df.rename(columns={"ST":"state", "State":"state"}, inplace=True)
    return this_df

In [3]:
def lowercase_underscore_column_names(this_df):
    """changes all columnnames to lower case and replace space with underscore"""
    for col in this_df.columns:
        this_df.rename(columns={col:col.lower().replace(" ", "_")}, inplace=True)
    return this_df

In [4]:
def gender_generalize(gender:[str], replace_nan:[str] = "---"):
    """changes the genders to female if f in str, else male. if gender = nan returns ---"""
    if type(gender) != str:
        return replace_nan
    if "f" in gender.lower():
        return "female"
    else:
        return "male"

In [5]:
def correct_open_complaints(value):
    """the number of open complaints has in some cases the structure 1/0/00, where only the middle number is interesting.
       if value has this format, only the middle value is returned as int"""
    if "/" in str(value):
        return int(value.split("/")[1])
    else:
        return int(value)

In [6]:
def clean_cust_lifetime_value(value):
    """takes a value and checks, whether a % sign is in it, in that case returns value divided by 100 with % sign removed """
    if "%" in str(value):
        return float(value.replace("%",""))/100
    else:
        return float(value)

In [7]:
#source: https://www.50states.com/abbreviations.htm
def states_to_zones_flex(state:[str]):
    """takes state in str format and returns zone according to zones_dict"""
    # add / change as required
    zones_dict = {
        "east": ["wa", "washington"],
        "central": ["nevada", "arizona", "nv", "az"],
        "west": ["california", "cali", "ca"],
        "northwest": ["oregon", "oreg", "or"],
    }
    state = state.lower()
    for zone in zones_dict.items():
        if state in zone[1]:
            return zone[0]
    return None

In [8]:
def display_cat_num_columns(this_df):
    print("Numerical columns in DataFrame: ")
    for _ in list((combined_df.select_dtypes(include=['int','float']).dtypes).index):
        print(" +",_)
    print("\nCategorical columns in DataFrame:")
    for _ in list((combined_df.select_dtypes(exclude=['int','float']).dtypes).index):
        print(" +",_)


In [37]:
def split_df_in_numeric_categ_dfs(this_df):
    """takes in a dataframe, finds numerical (float, int) and categorical and returns them as tuple (num_df, cat_df)"""
    numer_lst = [_ for _ in this_df.select_dtypes(include=['int','float']).dtypes.index]
    cat_lst = [_ for _ in this_df.select_dtypes(exclude=['int','float']).dtypes.index]
    return (this_df[numer_lst], this_df[cat_lst])

In [9]:
def replace_nan_with_mean(col):
    col.fillna(np.mean(col))
    

In [10]:
# helper function, no real purpose
def all_columns_in_file1_to_3():
    """ONLY FOR THIS PROJECT: sort and print all columns next to each other to see difference in the names.
       Print True or False if set(all_columns) == set(file1_df.columns)"""
    all_columns = sorted(file1_df.columns) + sorted(file2_df.columns) + sorted(file3_df.columns)
    for _ in range(0,11):
        print(all_columns[_], all_columns[_+11], all_columns[_+22])
    print("\nThe length of the set(all_columns) is equal to length of columns in one df:",len(set(all_columns))==len(file1_df.columns))

In [11]:
file1_df = pd.read_csv("C:/Users/roman/Documents/GitHub/Customer_Data_Analysis/Data/file1.csv")
# file1_df

In [12]:
file2_df = pd.read_csv("C:/Users/roman/Documents/GitHub/Customer_Data_Analysis/Data/file2.csv")
# file2_df

In [13]:
file3_df = pd.read_csv("C:/Users/roman/Documents/GitHub/Customer_Data_Analysis/Data/file3.csv")
# file3_df

In [14]:
# gathering info on columns
all_columns_in_file1_to_3()

Customer Customer Customer
Customer Lifetime Value Customer Lifetime Value Customer Lifetime Value
Education Education Education
GENDER GENDER Gender
Income Income Income
Monthly Premium Auto Monthly Premium Auto Monthly Premium Auto
Number of Open Complaints Number of Open Complaints Number of Open Complaints
Policy Type Policy Type Policy Type
ST ST State
Total Claim Amount Total Claim Amount Total Claim Amount
Vehicle Class Vehicle Class Vehicle Class

The length of the set(all_columns) is equal to length of columns in one df: False


In [15]:
# change st to state if applicable
file1_df = st_to_state(file1_df)
file2_df = st_to_state(file2_df)
file3_df = st_to_state(file3_df)


In [16]:
# replace space with underscore and use only lower case
file1_df = lowercase_underscore_column_names(file1_df)
file2_df = lowercase_underscore_column_names(file2_df)
file3_df = lowercase_underscore_column_names(file3_df)

In [17]:
# checking for inacurracies
all_columns_in_file1_to_3()

customer customer customer
customer_lifetime_value customer_lifetime_value customer_lifetime_value
education education education
gender gender gender
income income income
monthly_premium_auto monthly_premium_auto monthly_premium_auto
number_of_open_complaints number_of_open_complaints number_of_open_complaints
policy_type policy_type policy_type
state state state
total_claim_amount total_claim_amount total_claim_amount
vehicle_class vehicle_class vehicle_class

The length of the set(all_columns) is equal to length of columns in one df: True


In [18]:
# combine all file_df's, delete empty rows and drop duplicates
combined_df = pd.concat([file1_df,file2_df,file3_df], axis=0)
combined_df.dropna(axis = 0, how = 'all', inplace = True)      # delete empty rows
combined_df.drop_duplicates(inplace=True)     
# combined_df.drop(columns=["customer"], inplace=True)

In [19]:
# re-arrange column order ("drop" customer)
combined_df = combined_df[["customer_lifetime_value", "monthly_premium_auto", "total_claim_amount", "number_of_open_complaints", "income", "policy_type", "vehicle_class", "gender", "education", "state"]]

In [20]:
# change gender to female / male / ---
combined_df["gender"] = list(map(gender_generalize,combined_df["gender"]))


In [21]:
# clean cls from %
combined_df["customer_lifetime_value"] = list(map(clean_cust_lifetime_value,(combined_df["customer_lifetime_value"])))

In [22]:
# check NaN is
print("CLV:", combined_df["customer_lifetime_value"].isna().sum())
print("Income:", combined_df["income"].isna().sum())
print("Monthly Premium Auto", combined_df["monthly_premium_auto"].isna().sum())
print("total_claim_amount:", combined_df["total_claim_amount"].isna().sum())


CLV: 7
Income: 0
Monthly Premium Auto 0
total_claim_amount: 0


In [23]:
# replace NaN in respective columns
combined_df["customer_lifetime_value"].fillna(np.mean(combined_df["customer_lifetime_value"]), inplace=True)
# combined_df["income"].fillna(np.mean(combined_df["income"]), inplace=True)
# combined_df["monthly_premium_auto"].fillna(np.mean(combined_df["monthly_premium_auto"]), inplace=True)
# combined_df["total_claim_amount"].fillna(np.mean(combined_df["total_claim_amount"]), inplace=True)


In [24]:
# correct number of open complaints and make column categorical
combined_df["number_of_open_complaints"] = list(map(correct_open_complaints,(combined_df["number_of_open_complaints"])))
combined_df["number_of_open_complaints"] = combined_df["number_of_open_complaints"].astype("category")

In [25]:
# create new column zones according to state
combined_df["zones"] = list(map(states_to_zones_flex,(combined_df["state"])))

In [26]:
# change all numeric values to int
combined_df["customer_lifetime_value"] = combined_df["customer_lifetime_value"].astype(np.int64)
combined_df["income"] = combined_df["income"].astype(np.int64)
combined_df["monthly_premium_auto"] = combined_df["monthly_premium_auto"].astype(np.int64)
combined_df["total_claim_amount"] = combined_df["total_claim_amount"].astype(np.int64)

In [27]:
# all lower strings in df
combined_df = combined_df.apply(lambda x: x if type(x)==str else x)

In [28]:
combined_df


Unnamed: 0,customer_lifetime_value,monthly_premium_auto,total_claim_amount,number_of_open_complaints,income,policy_type,vehicle_class,gender,education,state,zones
0,7977,1000,2,0,0,Personal Auto,Four-Door Car,---,Master,Washington,east
1,6979,94,1131,0,0,Personal Auto,Four-Door Car,female,Bachelor,Arizona,central
2,12887,108,566,0,48767,Personal Auto,Two-Door Car,female,Bachelor,Nevada,central
3,7645,106,529,0,0,Corporate Auto,SUV,male,Bachelor,California,west
4,5363,68,17,0,36357,Personal Auto,Four-Door Car,male,High School or Below,Washington,east
...,...,...,...,...,...,...,...,...,...,...,...
7065,23405,73,198,0,71941,Personal Auto,Four-Door Car,male,Bachelor,California,west
7066,3096,79,379,0,21604,Corporate Auto,Four-Door Car,female,College,California,west
7067,8163,85,790,3,0,Corporate Auto,Four-Door Car,male,Bachelor,California,west
7068,7524,96,691,0,21941,Personal Auto,Four-Door Car,male,College,California,west


In [29]:
# print numerical and categorical columns
display_cat_num_columns(combined_df)

Numerical columns in DataFrame: 
 + customer_lifetime_value
 + monthly_premium_auto
 + total_claim_amount
 + income

Categorical columns in DataFrame:
 + number_of_open_complaints
 + policy_type
 + vehicle_class
 + gender
 + education
 + state
 + zones


In [30]:
combined_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_lifetime_value,9134.0,7977.328005,6807.580371,1898.0,3981.25,5768.0,8928.5,83325.0
monthly_premium_auto,9134.0,110.393146,581.471461,61.0,68.0,83.0,109.0,35354.0
total_claim_amount,9134.0,430.013795,289.633762,0.0,266.0,377.0,546.0,2893.0
income,9134.0,37824.847055,30359.232933,0.0,0.0,34240.0,62446.5,99981.0


In [45]:
pd.DataFrame.to_csv(combined_df, "C:/users/roman/documents/tester.csv", index=False )

In [31]:
### old code kept for later evaluation (probably trashing ;) ###

In [32]:
def columns_float_to_int(this_df):
    """checks dtype of all columns in df and if numeric turns it into int"""
    for col in this_df.columns:
        if np.issubdtype(this_df[col].dtype, np.number):
            this_df[col] = list(map(float_to_int,(this_df[col])))
    return this_df

In [33]:
# Bucketing the data - Write a function to replace column "State" to different zones. California as West Region, Oregon as North West, and Washington as East, and Arizona and Nevada as Central
# California    3032 west
# Oregon        2601 northwest
# Arizona       1630 central
# Nevada         882 central
# Washington     768 east
# Cali           120 west
# AZ              74 central
# WA              30 east

def states_to_zones(state:[str]):
    if state[:4].lower() == "cali":
        return "west"
    elif state[:4].lower() == "oreg":
        return "north west"
    elif state[:2].lower() == "wa":
        return "east"
    else:
        return "central"

In [34]:
def float_to_int(x):
    """returns int of x or nan if x == nan"""
#     if str(x).lower() != "nan":
#         return int(float(x))
#     else:
#         return "nan"
    return int(x)