In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import datetime
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

In [None]:
import pandas as pd

# Read the three case-study CSV files. The history file is decoded as cp949,
# the other two as utf-8.
train_data, train_hist, train_out = (
    pd.read_csv(csv_path, encoding=csv_encoding)
    for csv_path, csv_encoding in (
        ("ml_case_training_data.csv", 'utf-8'),
        ("ml_case_training_hist_data.csv", 'cp949'),
        ("ml_case_training_output.csv", 'utf-8'),
    )
)

In [None]:
# Percentage of missing values per column, sorted from highest to lowest.
nan_count = (
    train_data.isna().sum()
    .div(train_data.shape[0])
    .mul(100)
    .sort_values(ascending=False)
)
# Keep only the columns that actually contain missing values.
nan_count = nan_count.loc[nan_count.gt(0)]
# Remove the seven columns with the largest share of missing values.
train_data = train_data.drop(columns=nan_count.index[:7])

In [None]:
# Treat a missing sales channel as a category of its own.
train_data['channel_sales'] = train_data['channel_sales'].fillna(value='new category')
# Inspect the columns from position 8 onward of the missing-value ranking:
# distinct-value counts first (not displayed, since it is not the last
# expression of the cell), then the remaining missing-value counts.
train_data.loc[:, nan_count.index[8:]].nunique()
train_data.loc[:, nan_count.index[8:]].isna().sum()

In [None]:
# Back-fill missing dates with the next available value in row order.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated in
# pandas 2.x. Back-filling cannot fill trailing rows that have no later
# value, so those (if any) stay missing.
date_cols = ['date_modif_prod', 'date_renewal', 'date_end']
train_data[date_cols] = train_data[date_cols].bfill()

In [None]:
# Columns of the missing-value ranking that are NOT cast to integers here:
# the date columns, origin_up and the two forecast energy prices.
skip_int_cast = ('date_modif_prod', 'date_renewal', 'date_end', 'origin_up',
                 'forecast_price_energy_p1', 'forecast_price_energy_p2')
for col in nan_count.index[8:]:
    if col in skip_int_cast:
        continue
    # Unparseable entries become missing, values are rounded, and the
    # nullable Int64 dtype keeps the missing entries.
    as_number = pd.to_numeric(train_data[col], errors='coerce')
    train_data[col] = np.round(as_number).astype('Int64')

In [None]:
# Replace missing origin_up entries with the column's most frequent value.
origin_imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
train_data[['origin_up']] = origin_imp.fit_transform(train_data.loc[:, ['origin_up']])

In [None]:
# Most-frequent-value imputation for the remaining columns of the
# missing-value ranking (everything from position 8 on except the date
# columns and origin_up, which are handled separately).
imp_freq = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
freq_cols = [
    name for name in nan_count.index[8:]
    if name not in ('date_modif_prod', 'date_renewal', 'origin_up', 'date_end')
]
train_data[freq_cols] = imp_freq.fit_transform(train_data[freq_cols])

In [None]:
# Attach the rows of train_out to the cleaned customer table, matching on id
# (inner join, the pandas default).
train = train_data.merge(train_out, how='inner', on=['id'])

In [None]:
# Count customers per (channel_sales, churn) pair, spread the churn values
# into columns, and keep the five channels with the largest churn==1 count.
sales = (
    train.loc[:, ["channel_sales", "churn", "id"]]
    .groupby(["channel_sales", "churn"])["id"]
    .count()
    .unstack(level=1)
    .sort_values(by=[1], ascending=False)
    .head(5)
)
# Row-wise share of each churn value, in percent.
sales_percentage = sales.div(sales.sum(axis=1), axis=0) * 100

ax = sales.plot.bar(
    stacked=True,
    rot=0,
    figsize=(18, 10),
    title="Top 5 sales channel with highest Churn Rate",
)
# Rename the legend entries and label the axes.
ax.legend(["Retention", "Churn"], loc="upper right")
ax.set_ylabel("No. of Consumers")
ax.set_xlabel("Channel_sales")
plt.show()

In [None]:
# Count customers per (has_gas, churn) pair, spread the churn values into
# columns, and order the rows by the churn==1 count, largest first.
cons_12m = (
    train.loc[:, ["has_gas", "churn", "id"]]
    .groupby(["has_gas", "churn"])["id"]
    .count()
    .unstack(level=1)
    .sort_values(by=[1], ascending=False)
)
# Row-wise share of each churn value, in percent.
cons_12m_percentage = cons_12m.div(cons_12m.sum(axis=1), axis=0) * 100

ax = cons_12m.plot.bar(
    stacked=True,
    rot=0,
    figsize=(10, 10),
    title="Churn Rate - Has Gas Connection VS No Gas Connection",
)
# Rename the legend entries and label the axes.
ax.legend(["Retention", "Churn"], loc="upper right")
ax.set_ylabel("No. of Consumers")
ax.set_xlabel("has_gas")
plt.show()

In [None]:
# Count customers per (nb_prod_act, churn) pair, spread the churn values into
# columns, and keep the five rows with the largest churn==1 count.
cons_12m = (
    train.loc[:, ["nb_prod_act", "churn", "id"]]
    .groupby(["nb_prod_act", "churn"])["id"]
    .count()
    .unstack(level=1)
    .sort_values(by=[1], ascending=False)
    .head(5)
)
# Row-wise share of each churn value, in percent.
cons_12m_percentage = cons_12m.div(cons_12m.sum(axis=1), axis=0) * 100

ax = cons_12m.plot.bar(
    stacked=True,
    rot=0,
    figsize=(18, 10),
    title="Churn Rate - No.of active connections",
)
# Rename the legend entries and label the axes.
ax.legend(["Retention", "Churn"], loc="upper right")
ax.set_ylabel("No. of Consumers")
ax.set_xlabel("active connections")
plt.show()

In [None]:
# Count customers per (num_years_antig, churn) pair, spread the churn values
# into columns, and keep the ten rows with the largest churn==1 count.
cons_12m = (
    train.loc[:, ["num_years_antig", "churn", "id"]]
    .groupby(["num_years_antig", "churn"])["id"]
    .count()
    .unstack(level=1)
    .sort_values(by=[1], ascending=False)
    .head(10)
)
# Row-wise share of each churn value, in percent.
cons_12m_percentage = cons_12m.div(cons_12m.sum(axis=1), axis=0) * 100

ax = cons_12m.plot.bar(
    stacked=True,
    rot=0,
    figsize=(18, 10),
    title="Churn Rate - Antiquity of consumer",
)
# Rename the legend entries and label the axes.
ax.legend(["Retention", "Churn"], loc="upper right")
ax.set_ylabel("No. of Consumers")
ax.set_xlabel("Antiquity")
plt.show()

In [None]:
# Count customers per (origin_up, churn) pair, spread the churn values into
# columns, and order the rows by the churn==1 count, largest first.
cons_12m = (
    train.loc[:, ["origin_up", "churn", "id"]]
    .groupby(["origin_up", "churn"])["id"]
    .count()
    .unstack(level=1)
    .sort_values(by=[1], ascending=False)
)
# Row-wise share of each churn value, in percent.
cons_12m_percentage = cons_12m.div(cons_12m.sum(axis=1), axis=0) * 100

ax = cons_12m.plot.bar(
    stacked=True,
    rot=0,
    figsize=(18, 10),
    title="Churn Rate - code of the electricity campaign the customer first subscribed to",
)
# Rename the legend entries and label the axes.
ax.legend(["Retention", "Churn"], loc="upper right")
ax.set_ylabel("No. of Consumers")
ax.set_xlabel("origin_up")
plt.show()

In [None]:
# Histograms (50 bins each) of the customer table, as drawn by DataFrame.hist.
train_data.hist(figsize=(20, 15), bins=50)
# Tighten the spacing between the subplots before rendering.
plt.tight_layout(pad=0.4)
plt.show()

In [None]:
# Histograms (50 bins each) of the history table, as drawn by DataFrame.hist.
train_hist.hist(figsize=(20, 15), bins=50)
# Tighten the spacing between the subplots before rendering.
plt.tight_layout(pad=0.4)
plt.show()

In [None]:
# Convert the date columns of the merged table to datetime.
# The values are read from `train` itself rather than from `train_data`:
# assigning a Series taken from another frame aligns on the row index, which
# only matches if the merge kept every row of train_data in its original
# order. Converting in place is correct regardless of what the merge did.
for col in ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']:
    train[col] = pd.to_datetime(train[col])

In [None]:
# Contract length in whole days: end date minus activation date.
train['contract_tenure'] = (train['date_end'] - train['date_activ']).dt.days

In [None]:
# Contract length expressed in years (days / 365), rounded to an integer.
train['contract_tenure_year'] = train['contract_tenure'].div(365).round().astype('int64')

In [None]:
# Count customers per (contract_tenure_year, churn) pair and spread the churn
# values into columns.
tenure = (
    train.loc[:, ["contract_tenure_year", "churn", "id"]]
    .groupby(["contract_tenure_year", "churn"])["id"]
    .count()
    .unstack(level=1)
)
# Row-wise share of each churn value, in percent.
tenure_percentage = tenure.div(tenure.sum(axis=1), axis=0) * 100

ax = tenure.plot.bar(
    stacked=True,
    rot=0,
    figsize=(18, 10),
    title="Tenure VS Churn Rate",
)
# Rename the legend entries and label the axes.
ax.legend(["Retention", "Churn"], loc="upper right")
ax.set_ylabel("No. of Consumers")
ax.set_xlabel("No. of years")
plt.show()

In [None]:
# Difference between one twelfth of cons_12m and cons_last_month.
train['bill_dev'] = train['cons_12m'].div(12).sub(train['cons_last_month'])

In [None]:
def handleInf(x):
    """Replace an infinite value with 0 and pass any other value through.

    Parameters
    ----------
    x : scalar
        Value to check, e.g. the result of a division that may have had a
        zero denominator.

    Returns
    -------
    scalar
        0 if ``x`` is positive or negative infinity, otherwise ``x`` itself.
    """
    if x==float('-inf') or x==float('inf'):
        return 0
    # Finite (and NaN) values must be returned unchanged; without this the
    # function implicitly returned None and wiped out every finite value.
    return x

In [None]:
# Ratio of forecast_cons_12m to cons_12m; each resulting value is passed
# through handleInf.
train['cons_pattern'] = (
    train['forecast_cons_12m']
    .div(train['cons_12m'])
    .apply(handleInf)
)

In [None]:
# Fill missing values in the price columns with each column's mean.
# The mean imputer created here is now the one that is applied; previously
# the most-frequent imputer `imp_freq` was called instead, leaving `imp_mean`
# unused and re-fitting `imp_freq` on unrelated columns.
price_cols = ['price_p1_var', 'price_p2_var', 'price_p3_var',
              'price_p1_fix', 'price_p2_fix', 'price_p3_fix']
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
train_hist[price_cols] = imp_mean.fit_transform(train_hist[price_cols])

In [None]:
# Combined price per period: variable component plus fixed component.
for total_col, var_col, fix_col in (
    ('price_p1', 'price_p1_var', 'price_p1_fix'),
    ('price_p2', 'price_p2_var', 'price_p2_fix'),
    ('price_p3', 'price_p3_var', 'price_p3_fix'),
):
    train_hist[total_col] = train_hist[var_col] + train_hist[fix_col]

# Differences between the combined prices of two periods (later minus earlier).
for diff_col, later_col, earlier_col in (
    ('pp12', 'price_p2', 'price_p1'),
    ('pp23', 'price_p3', 'price_p2'),
    ('pp13', 'price_p3', 'price_p1'),
):
    train_hist[diff_col] = train_hist[later_col] - train_hist[earlier_col]

In [None]:
# Discard the price date and the raw price components now that the combined
# price columns have been derived from them.
train_hist = train_hist.drop(columns=[
    'price_date',
    'price_p1_var', 'price_p2_var', 'price_p3_var',
    'price_p1_fix', 'price_p2_fix', 'price_p3_fix',
])

In [None]:
# Join the history table (left) with the customer table on id (inner join,
# the pandas default); the history columns come first in the result.
train = train_hist.merge(train, how='inner', on=['id'])

In [None]:
# Replace every remaining missing value with 0, in place.
train.fillna(value=0, inplace=True)
# Feature frame: everything except the id, the date columns and churn.
non_feature_cols = ["id", 'date_activ', 'date_end', 'date_modif_prod', 'date_renewal', "churn"]
X = train.drop(columns=non_feature_cols)

In [None]:
# Copy of the merged table without the id and the date columns; churn is kept.
train_2 = train.drop(
    columns=["id", 'date_activ', 'date_end', 'date_modif_prod', 'date_renewal'],
)

In [None]:
# Encode has_gas as an integer flag: 'f' becomes 0, any other value becomes 1.
train_2['has_gas'] = train_2['has_gas'].ne('f').astype('int64')

In [None]:
# One-hot encode the two categorical columns and pass every other column
# through unchanged. The columns are selected by name instead of by the
# positions [6, 24], so the encoding no longer depends on the column order
# produced by the earlier merges and drops.
# NOTE(review): positions 6 and 24 are presumed to have been channel_sales and
# origin_up (the same two columns are dummy-encoded for train_2 further down)
# - confirm against X.columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['channel_sales', 'origin_up'])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Preview the first rows of train_2.
train_2.head()

In [None]:
# (number of rows, number of columns) of train_2.
train_2.shape

In [None]:
# Columns to expand into dummy/indicator columns (change to the desired
# column names).
columns_to_encode = ['channel_sales', 'origin_up']
# Keep every category level (no first-level drop).
train_2 = pd.get_dummies(data=train_2, columns=columns_to_encode, drop_first=False)
# Write the encoded table to disk without the row index.
train_2.to_csv(path_or_buf='eda_v1.csv', index=False)