# Pre-processing

In [None]:
import altair as alt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Directory that holds the input CSV (and, later, the exported cleaned file).
PATH_TO_DATA = '../data/'

data = pd.read_csv(PATH_TO_DATA + 'churn.csv')
# Keep an independent snapshot of the raw data for the charts that read
# `data0`. A plain `data0 = data` only binds a second name to the same object,
# so any in-place change to `data` would also show up in `data0`.
data0 = data.copy()

# Quick look at the first rows and the non-null count of every column.
data.head()
data.count()

## Filtering

In [None]:
# Dropping every row that has a missing value is currently disabled:
# data.dropna(inplace=True)
# Show the non-null count of every column.
data.count()

In [None]:
# Discard rows whose days_since_last_login is negative.
data = data.loc[data['days_since_last_login'].ge(0)]
# Show the remaining values in ascending order to check the lower bound.
data['days_since_last_login'].sort_values()


In [None]:
# Keep only rows whose security_no is shorter than 8 characters.
data = data.loc[data['security_no'].str.len().lt(8)]
# Show the remaining lengths in ascending order.
data['security_no'].str.len().sort_values()

In [None]:
# Discard rows whose avg_time_spent is negative.
data = data.loc[data['avg_time_spent'].ge(0)]
# Non-null count per column after the filter.
data.count()

In [None]:
# Discard rows whose avg_frequency_login_days holds the placeholder 'Error'.
data = data.loc[data['avg_frequency_login_days'].ne('Error')]
# Non-null count per column after the filter.
data.count()

In [None]:
# Discard rows whose points_in_wallet is negative.
data = data.loc[data['points_in_wallet'].ge(0)]
# Non-null count per column after the filter.
data.count()

In [None]:
# Discard rows whose joined_through_referral holds the placeholder '?'.
data = data.loc[data['joined_through_referral'].ne('?')]
# Non-null count per column after the filter.
data.count()

## Normalization

### OneHotEncoder

In [None]:

def apply_one_hot_encoding(df, column_name):
    """Replace ``column_name`` in ``df`` with one-hot indicator columns.

    Args:
        df: DataFrame that contains ``column_name``. It is not modified.
        column_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame with the same rows and index as ``df`` in which
        ``column_name`` is replaced by one indicator column per category,
        named by ``OneHotEncoder.get_feature_names_out``.
    """
    encoder = OneHotEncoder()

    # fit_transform is called on a one-column frame; its result is converted
    # to a dense array so it can back a regular DataFrame.
    indicators = encoder.fit_transform(df[[column_name]]).toarray()

    # Reuse df's index. pd.concat(axis=1) aligns rows on index labels, so a
    # default RangeIndex would not line up with a df whose rows were filtered
    # beforehand, producing extra rows filled with NaN.
    encoded_df = pd.DataFrame(
        indicators,
        columns=encoder.get_feature_names_out([column_name]),
        index=df.index,
    )

    return pd.concat([df.drop(columns=column_name), encoded_df], axis=1)

# One-hot encoding is currently disabled for all of the columns below; several
# of them are integer-coded in the Mapping section instead.
# data = apply_one_hot_encoding(data, 'joined_through_referral')
# data = apply_one_hot_encoding(data, 'used_special_discount')
# data = apply_one_hot_encoding(data, 'past_complaint')
# data = apply_one_hot_encoding(data, 'complaint_status')
# data = apply_one_hot_encoding(data, 'feedback')
# data = apply_one_hot_encoding(data, 'internet_option')
# data = apply_one_hot_encoding(data, 'membership_category')
# data = apply_one_hot_encoding(data, 'preferred_offer_types')
# Preview the first rows.
data.head()


### Mapping

In [None]:
# Category labels, listed so that a label's list position is its integer code.
memberships = ['No Membership', 'Basic Membership', 'Silver Membership', 'Gold Membership', 'Platinum Membership', 'Premium Membership']
joinded_through_ref = ['No', 'Yes']
used_special_discount = ['No', 'Yes']
past_complaint = ['No', 'Yes']
complaint_stat = ['No Information Available', 'Not Applicable', 'Solved', 'Solved in Follow-up', 'Unsolved']
good_feedbacks = ['Products always in Stock', 'Quality Customer Care', 'Reasonable Price', 'User Friendly Website']
bad_feedbacks = ['Poor Customer Service', 'Poor Product Quality', 'Poor Website', 'Too many ads']
neutral_feedbacks = ['No reason specified']

# `data` is the result of boolean-mask selections; take an explicit copy so the
# column assignments below act on an independent frame and do not raise
# pandas' SettingWithCopyWarning.
data = data.copy()

# Replace each label with its position in the matching list. list.index raises
# ValueError for a label that is not listed, so unexpected values fail loudly.
data['membership_category'] = data['membership_category'].apply(lambda label: memberships.index(label))
data['joined_through_referral'] = data['joined_through_referral'].apply(lambda label: joinded_through_ref.index(label))
data['used_special_discount'] = data['used_special_discount'].apply(lambda label: used_special_discount.index(label))
data['past_complaint'] = data['past_complaint'].apply(lambda label: past_complaint.index(label))
data['complaint_status'] = data['complaint_status'].apply(lambda label: complaint_stat.index(label))
# Collapse feedback to 2 (listed in good_feedbacks), 0 (listed in
# bad_feedbacks) or 1 (anything else).
data['feedback'] = data['feedback'].apply(
    lambda label: 2 if label in good_feedbacks else 0 if label in bad_feedbacks else 1
)

# Preview the encoded frame.
data.head()

### Export dataset

In [None]:
# Write the cleaned frame to the data directory, omitting the index column.
data.to_csv(f'{PATH_TO_DATA}dataCleaned.csv', index=False)
# Preview what was exported.
data.head()

# Dataviz

In [None]:
# DataFrame.dropna() returns a new frame, so a bare `data.dropna()` has no
# effect. Build a chart-local frame without rows that miss the fields plotted
# here, and leave `data` itself untouched for the following cells.
region_data = data.dropna(subset=['region_category', 'churn_risk_score'])
# Lift Altair's row limit for embedded data.
alt.data_transformers.disable_max_rows()
# Pie chart: mean churn risk score per region category.
alt.Chart(region_data).mark_arc().encode(
    color='region_category',
    theta='mean(churn_risk_score)',
    tooltip=['region_category', 'mean(churn_risk_score)']
).properties(
    title='Churn Risk Score by Region Category'
)

In [None]:
# Line chart: mean churn risk score per joining year.
alt.Chart(data).mark_line().encode(
    x=alt.X('year(joining_date)'),
    y=alt.Y('mean(churn_risk_score)'),
).properties(
    title='Evolution of Churn Risk Score by Year',
    width=300,
    height=200,
)

In [None]:

# Bar chart: mean churn risk score per preferred offer type.
alt.Chart(data).mark_bar().encode(
    x=alt.X('preferred_offer_types'),
    y=alt.Y('mean(churn_risk_score)'),
    color='preferred_offer_types',
    tooltip=[
        alt.Tooltip('mean(churn_risk_score)'),
        alt.Tooltip('preferred_offer_types'),
    ],
).properties(
    title='Average Churn Risk Score by Preferred Offer Types',
    width=300,
    height=400,
)

In [None]:
# Bar chart on the raw snapshot (data0): mean churn risk score per membership
# category, bars ordered by descending y value.
alt.Chart(data0).mark_bar().encode(
    x=alt.X('membership_category', sort='-y'),
    y=alt.Y('mean(churn_risk_score)'),
    color=alt.Color('membership_category', sort='-y'),
    tooltip=[
        alt.Tooltip('mean(churn_risk_score)'),
        alt.Tooltip('membership_category'),
    ],
).properties(
    title='Average Churn Risk Score by Membership Category',
    width=300,
    height=400,
)

In [None]:
# Bar chart: mean churn risk score per preferred offer type.
alt.Chart(data).mark_bar().encode(
    x='preferred_offer_types',
    y=alt.Y('mean(churn_risk_score)'),
    color='preferred_offer_types',
    tooltip=[
        alt.Tooltip('mean(churn_risk_score)'),
        alt.Tooltip('preferred_offer_types'),
    ],
).properties(
    title='Average Churn Risk Score by Preferred Offer Types',
    width=300,
    height=400,
)

In [None]:
# Bar chart: mean churn risk score per medium of operation.
alt.Chart(data).mark_bar().encode(
    x='medium_of_operation',
    y=alt.Y('mean(churn_risk_score)'),
    color='medium_of_operation',
    tooltip=[
        alt.Tooltip('mean(churn_risk_score)'),
        alt.Tooltip('medium_of_operation'),
    ],
).properties(
    title='Average Churn Risk Score by Medium of Operation',
    width=300,
    height=400,
)

In [None]:
# Bar chart on the raw snapshot (data0): number of rows per age value.
alt.Chart(data0).mark_bar().encode(
    x=alt.X('age'),
    y=alt.Y('count()'),
    tooltip=[alt.Tooltip('age'), alt.Tooltip('count()')],
).properties(
    title='Age Distribution',
    width=700,
    height=300,
)

In [None]:
# Bar chart: mean churn risk score per internet option.
alt.Chart(data).mark_bar().encode(
    x=alt.X('internet_option'),
    y=alt.Y('mean(churn_risk_score)'),
    color='internet_option',
    tooltip=[
        alt.Tooltip('mean(churn_risk_score)'),
        alt.Tooltip('internet_option'),
    ],
).properties(
    title='Average Churn Risk Score by Internet Option',
    width=400,
    height=300,
)

In [None]:
# Bar chart: mean churn risk score per days_since_last_login value.
alt.Chart(data).mark_bar().encode(
    x=alt.X('days_since_last_login'),
    y=alt.Y('mean(churn_risk_score)'),
    tooltip=[
        alt.Tooltip('mean(churn_risk_score)'),
        alt.Tooltip('days_since_last_login'),
    ],
).properties(
    title='Average Churn Risk Score by Days Since Last Login',
    width=400,
    height=300,
)

In [None]:
# Preview the first rows of the working frame.
data.head()