# Market segmentation insurance

## 1. Introduction

## 2. Data information

## 3. ETL process

In [0]:
import os
import pandas as pd

In [0]:
# Resolve this project's data folder from the PATH_DATA_PROJECTS environment
# variable. Fail fast with an explicit message when the variable is unset;
# otherwise os.path.join(None, ...) raises an opaque TypeError.
_projects_root = os.getenv("PATH_DATA_PROJECTS")
if _projects_root is None:
    raise RuntimeError(
        "Environment variable PATH_DATA_PROJECTS is not set; it is expected "
        "to hold the base directory that contains "
        "Tabular/market_segmentation_insurance."
    )
PATH_DATA = os.path.join(_projects_root, "Tabular", "market_segmentation_insurance")

In [0]:
# Columns that are cast to FloatType in the cleaning step.
NUMERIC_FEATURES = [
    "balance",
    "balance_frequency",
    "purchases",
    "oneoff_purchases",
    "installments_purchases",
    "cash_advance",
    "purchases_frequency",
    "oneoff_purchases_frequency",
    "purchases_installments_frequency",
    "cash_advance_frequency",
    "cash_advance_trx",
    "purchases_trx",
    "credit_limit",
    "payments",
    "minimum_payments",
    "prc_full_payment",
]

In [0]:
# Columns treated as categorical; they are not part of the numeric cast.
CATEGORICAL_FEATURES = [
    "tenure",
]

### 3.1. Extract

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [0]:
# Create (or reuse) a Spark session running on the local master "local[*]".
spark = (
    SparkSession.builder
    .appName("Read CSV Market segmentation insurance")
    .master("local[*]")
    .getOrCreate()
)

In [0]:
# Every column is declared as a nullable string; the numeric columns are
# cast later in the cleaning step.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "cust_id",
        "balance",
        "balance_frequency",
        "purchases",
        "oneoff_purchases",
        "installments_purchases",
        "cash_advance",
        "purchases_frequency",
        "oneoff_purchases_frequency",
        "purchases_installments_frequency",
        "cash_advance_frequency",
        "cash_advance_trx",
        "purchases_trx",
        "credit_limit",
        "payments",
        "minimum_payments",
        "prc_full_payment",
        "tenure",
    )
])

In [0]:
# Load the customer CSV with the explicit schema; the file's first line is
# treated as a header row.
df = spark.read.csv(
    path=os.path.join(PATH_DATA, "CustomerData.csv"),
    schema=schema,
    header=True,
)

In [0]:
# Print the column names and types of the freshly loaded DataFrame.
df.printSchema()

In [0]:
# Preview the first five rows.
df.show(n=5)

### 3.2. Transform

#### 3.2.1. Data profiling

In [0]:
from pyspark.sql.functions import col, count, when, lit, trim

##### 3.2.1.1. Identify missing values

In [0]:
# Count, per column, the cells that are NULL or blank after trimming.
missing_values_count = df.select(*(
    count(when(col(name).isNull() | (trim(col(name)) == ""), name)).alias(name)
    for name in df.columns
))
print("Total missing values by columns:")
# vertical=True prints one line per column value instead of one wide row.
missing_values_count.show(vertical=True)

##### 3.2.1.2. Identify duplicate values

In [0]:
# Group on every column so identical rows collapse together, then display
# the rows that occur more than once along with their occurrence count.
(
    df.groupBy(*df.columns)
    .count()
    .where(col("count") > 1)
    .show()
)

##### 3.2.1.3. Validate data consistency 

In [0]:
from pyspark.sql.functions import udf

In [0]:
def validate_number(number_string) -> bool:
    """Return whether *number_string* can be parsed by Python's ``float``.

    Args:
        number_string: Raw cell value, normally a string. ``None`` (what a
            Spark UDF receives for a NULL cell) is accepted and reported as
            not numeric instead of raising.

    Returns:
        ``True`` if ``float(number_string)`` succeeds, ``False`` otherwise.
        Anything ``float`` accepts counts as numeric, including surrounding
        whitespace and the special values ``"nan"`` and ``"inf"``.
    """
    try:
        float(number_string)
    except (TypeError, ValueError):
        # ValueError: text that is not a number.
        # TypeError: None or another object float() cannot convert.
        return False
    return True

In [0]:
# Declare the return type explicitly: udf() defaults to a string return type,
# which would expose the True/False flag as a string column rather than a
# boolean one.
is_numeric = udf(validate_number, "boolean")

In [0]:
# Flag each row by whether its raw "balance" text parses as a number, then
# display the rows where it does not.
df_with_validity = df.withColumn("balance_is_number", is_numeric("balance"))
df_with_validity.where(col("balance_is_number") == lit(False)).show()

#### 3.2.2. Cleaning

##### 3.2.2.1. Update data types

In [0]:
from pyspark.sql.types import FloatType

In [0]:
# Cast every numeric feature from string to float in a single projection
# instead of chaining one withColumn() call per column, which grows the
# query plan with each iteration.
df = df.withColumns({
    feature: col(feature).cast(FloatType())
    for feature in NUMERIC_FEATURES
})

In [0]:
# Print the schema again to check the types after the numeric cast.
df.printSchema()

##### 3.2.2.2. Fill missing values

In [0]:
import pyspark.sql.functions as f

In [0]:
# Display the medians of the two columns that get their NULLs filled below.
df.select(
    f.median("minimum_payments"),
    f.median("credit_limit"),
).show()

In [0]:
# Compute both medians in one aggregation so only one Spark job runs,
# instead of one job per column. A global aggregation always yields exactly
# one row, which unpacks into the two values.
minimum_payments_median, credit_limit_median = df.agg(
    f.median("minimum_payments"),
    f.median("credit_limit"),
).collect()[0]

In [0]:
# Keep existing minimum_payments values; use the median where the value is NULL.
df = df.withColumn(
    "minimum_payments",
    when(col("minimum_payments").isNotNull(), col("minimum_payments"))
    .otherwise(minimum_payments_median),
)

In [0]:
# Keep existing credit_limit values; use the median where the value is NULL.
df = df.withColumn(
    "credit_limit",
    when(col("credit_limit").isNotNull(), col("credit_limit"))
    .otherwise(credit_limit_median),
)

In [0]:
# Re-count, per column, the cells that are NULL or blank after trimming,
# to check the result of the fill step.
missing_values_count = df.select(*(
    count(when(col(name).isNull() | (trim(col(name)) == ""), name)).alias(name)
    for name in df.columns
))
print("Total missing values by columns:")
# vertical=True prints one line per column value instead of one wide row.
missing_values_count.show(vertical=True)

##### 3.2.2.3. Remove white spaces

In [0]:
# Strip leading and trailing spaces from the tenure values.
df = df.withColumn("tenure", trim("tenure"))

### 3.3. Load