# Customer Lifetime Value (Пожизненная ценность клиента)


# EDA

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import warnings
# Install a blanket "ignore" filter: every warning category from every module is
# silenced from here on, including deprecation warnings from pandas/seaborn.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV once into ``dataset`` and do all exploration on an independent
# copy (``ds``), so the loaded frame stays unmodified.
data_file = "../csv/clv/customer_lifetime_value.csv"
dataset = pd.read_csv(filepath_or_buffer=data_file)
ds = dataset.copy(deep=True)
# Summary statistics of the working copy.
ds.describe()

In [None]:
# Show the first five rows of the working frame.
ds.head(n=5)

### Колонку Customer можно сразу удалить

In [None]:
# Running list of column names to exclude from the final feature set;
# later cells append to it.
col_to_drop = []
col_to_drop.append('Customer')

## Посмотрим на нашу целевую переменную

In [None]:
# Box plot of the target column.
# The series is passed explicitly as ``x``: the first positional parameter of
# sns.boxplot is ``x`` in seaborn <= 0.11 but ``data`` in seaborn >= 0.12, so a
# positional call behaves differently depending on the installed version.
sns.boxplot(x=ds['Customer Lifetime Value'])

In [None]:
# Histogram of the target column.
ds.loc[:, 'Customer Lifetime Value'].hist()

In [None]:
# Per-column non-null counts for the rows whose target exceeds 40000.
ds.loc[ds['Customer Lifetime Value'].gt(40000)].count()

In [None]:
# Summary statistics of the target, with the 95th and 99th percentiles requested.
ds.loc[:, 'Customer Lifetime Value'].describe(percentiles=[0.99, 0.95])

In [None]:
# Keep only the rows whose target is strictly below 30000 (drops the largest values).
# .copy() makes the filtered frame independent, so the new columns assigned in
# later cells are written to a real frame rather than to a flagged slice
# (which would raise SettingWithCopyWarning).
ds = ds[ds['Customer Lifetime Value'] < 30000].copy()

In [None]:
# Histogram of the target column after the row filter.
ds.loc[:, 'Customer Lifetime Value'].hist()

## Доход

In [None]:
# Histogram of 'Income' through the pandas plotting accessor.
ds['Income'].plot(kind='hist')

In [None]:
# Distinct employment statuses among rows with zero income
# (single .loc lookup instead of chained indexing).
ds.loc[ds['Income'] == 0, 'EmploymentStatus'].unique()

### Похоже, что нули в колонке Income не являются ошибками данных

In [None]:
# True if at least one 'Income' value is missing.
ds.loc[:, 'Income'].isna().any()

### Эту колонку можно оставить без изменений

# Monthly Premium Auto

In [None]:
# Histogram of 'Monthly Premium Auto' through the pandas plotting accessor.
ds['Monthly Premium Auto'].plot(kind='hist')

In [None]:
# Add the natural log of 'Monthly Premium Auto' as a new column.
ds = ds.assign(mon_prem_auto_log=np.log(ds['Monthly Premium Auto']))

In [None]:
# Histogram of the log-transformed premium.
ds.loc[:, 'mon_prem_auto_log'].hist()

In [None]:
# The raw premium column is queued for removal; its log version was added above.
col_to_drop.extend(['Monthly Premium Auto'])

# Months Since Last Claim

In [None]:
# Histogram of 'Months Since Last Claim'.
ds.loc[:, 'Months Since Last Claim'].hist()

### Тут все хорошо, оставляем как есть

# Months Since Policy Inception

In [None]:
# Histogram of 'Months Since Policy Inception'.
ds.loc[:, 'Months Since Policy Inception'].hist()

### Тоже оставляем

# Number of Open Complaints

In [None]:
# Histogram of 'Number of Open Complaints'.
ds.loc[:, 'Number of Open Complaints'].hist()

### Тут лучше сделать бинарный признак: 0 / не 0

In [None]:
# Binary flag: 1 where 'Number of Open Complaints' is non-zero, 0 otherwise.
# Computed with a vectorized comparison instead of a per-row Python lambda;
# the result is cast to int64 to keep an integer 0/1 column.
# NOTE(review): the first letter of the new column name looks like a Cyrillic
# 'с' rather than a Latin 'c' — confirm; it is left unchanged here so that any
# other cell referencing this column keeps working.
ds['сomplaints'] = (ds['Number of Open Complaints'] != 0).astype('int64')

In [None]:
# Histogram of the binary complaints flag.
ds.loc[:, 'сomplaints'].hist()

In [None]:
# The raw complaint count is queued for removal; the binary flag was added above.
col_to_drop.extend(['Number of Open Complaints'])

# State

In [None]:
# Distribution of values in 'State'.
ds.loc[:, "State"].hist()

### Тут хорошо подойдет one-hot encoding

In [None]:
# One-hot encode 'State': the column is replaced by one indicator column per value.
# The prefix already ends in '_' and pandas adds its default '_' separator, so the
# new names have the form 'state__<value>'.
ds = pd.get_dummies(data=ds, columns=['State'], prefix='state_')

In [None]:
# Show the first five rows to inspect the new indicator columns.
ds.head(n=5)

# Response

In [None]:
# Distribution of values in 'Response'.
ds.loc[:, 'Response'].hist()

In [None]:
# One-hot encode 'Response' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Response'], prefix='response_')

# Coverage

In [None]:
# Distribution of values in 'Coverage'.
ds.loc[:, 'Coverage'].hist()

In [None]:
# One-hot encode 'Coverage' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Coverage'], prefix='coverage_')

In [None]:
# Show the first five rows of the frame after encoding.
ds.head(n=5)

# Education

In [None]:
# Distribution of values in 'Education'.
ds.loc[:, 'Education'].hist()

In [None]:
# One-hot encode 'Education' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Education'], prefix='edu_')

# EmploymentStatus

In [None]:
# Distribution of values in 'EmploymentStatus'.
ds.loc[:, 'EmploymentStatus'].hist()

In [None]:
# One-hot encode 'EmploymentStatus' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['EmploymentStatus'], prefix='emp_stat_')

# Effective To Date

In [None]:
# Parse 'Effective To Date' from month/day/two-digit-year strings into datetimes.
# errors='coerce' converts entries that do not match the format to NaT, so the
# column always ends up with a datetime dtype. errors='ignore' is avoided: it
# silently hands back the unparsed input when any value fails, and it is
# deprecated in pandas >= 2.2.
ds['Effective To Date'] = pd.to_datetime(
    ds['Effective To Date'],
    format='%m/%d/%y',
    errors='coerce',
)

In [None]:
# Open a 10x10 inch figure, then draw the histogram of 'Effective To Date' into it.
plt.figure(figsize=(10, 10))
ds.loc[:, 'Effective To Date'].hist()

### Не похоже на какой-то информативный признак

In [None]:
# Queue the date column for removal.
col_to_drop.extend(['Effective To Date'])

# Gender

In [None]:
# One-hot encode 'Gender' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Gender'], prefix='gender_')

# Location Code

In [None]:
# Distribution of values in 'Location Code'.
ds.loc[:, 'Location Code'].hist()

In [None]:
# One-hot encode 'Location Code' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Location Code'], prefix='location_')

In [None]:
# Preview the frame as it would look without the queued columns.
# drop() returns a new frame; ``ds`` itself is not modified here.
ds.drop(col_to_drop, axis='columns').head(n=5)

# Marital Status

In [None]:
# Distribution of values in 'Marital Status'.
ds.loc[:, 'Marital Status'].hist()

In [None]:
# One-hot encode 'Marital Status' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Marital Status'], prefix='marital_')

# Number of Policies

In [None]:
# Histogram of 'Number of Policies'.
ds.loc[:, 'Number of Policies'].hist()

### Так и оставим

In [None]:
# List the column labels that remain once the queued columns are excluded.
# drop() returns a new frame; ``ds`` itself is not modified here.
ds.drop(col_to_drop, axis='columns').columns

# Policy Type

In [None]:
# Distribution of values in 'Policy Type'.
ds.loc[:, 'Policy Type'].hist()

In [None]:
# One-hot encode 'Policy Type' (replaces the column with indicator columns).
ds = pd.get_dummies(data=ds, columns=['Policy Type'], prefix='policy_type_')

# Policy

In [None]:
# Open a 10x10 inch figure, then draw the distribution of 'Policy' values into it.
plt.figure(figsize=(10, 10))
ds.loc[:, 'Policy'].hist()

### Кажется, колонка Policy Type уже не нужна: можно разбить полис на Corporate/Personal/Special и на L1/L2/L3