In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def eda(df:pd.DataFrame):
    print(f'Head(3):\n{"="*80}')
    print(df.head(3))
    print('=' * 80)
    print(f'\nShape:\n{"="*80}')
    print(df.shape)
    print('=' * 80)
    print(f'\nInfo:\n{"="*80}')
    print(df.info())
    print('=' * 80)
    print(f'\nDescribe Number columns:\n{"="*80}')
    print(df.describe())
    print('=' * 80)
    print(f'\nDescribe Object columns:\n{"="*80}')
    print(df.describe(include='object'))
    print('=' * 80)
    print(f'\n결측치:\n{"="*80}')
    print(df.isnull().sum())
    print('=' * 80)
    print(f'\n이상치:\n{"="*80}')
    df.select_dtypes(exclude='object').boxplot(figsize=(20,10))
    plt.show()
    print('=' * 80)

In [3]:
df = pd.read_csv('../data/cell2celltrain.csv', encoding='utf-8')

In [4]:
# eda(df)

In [5]:
mappings = {
    'NewCellphoneUser' : {'No' : 0, 'Yes' :1},
    'NotNewCellphoneUser' : {'No' : 0, 'Yes' : 1},
    'MadeCallToRetentionTeam' : {'No' : 0, 'Yes' : 1},
    'CreditRating' : {'1-Highest' : 1, '4-Medium': 4,  '3-Good': 3, '2-High' : 2, '5-Low' : 5, '6-VeryLow' : 6, '7-Lowest' : 7},
    'Churn' : {'No':0, 'Yes': 1},
    'ChildrenInHH': {'No':0, 'Yes': 1},
    'HandsetRefurbished': {'No':0, 'Yes': 1},
    'MaritalStatus': {'No':0, 'Yes': 1},
    'HandsetWebCapable': {'No':0, 'Yes': 1},
    'TruckOwner': {'No':0, 'Yes': 1},
    'RVOwner': {'No':0, 'Yes': 1},
    'Homeownership': {'Unknown':0, 'Known': 1},
    'BuysViaMailOrder': {'No':0, 'Yes': 1},
    'RespondsToMailOffers': {'No':0, 'Yes': 1},
    'OwnsComputer': {'No':0, 'Yes': 1},
    'HasCreditCard': {'No':0, 'Yes': 1},
    'OwnsMotorcycle': {'No':0, 'Yes': 1},
    'NonUSTravel': {'No':0, 'Yes': 1},
    'OptOutMailings': {'No':0, 'Yes': 1}
}
for col, mapping in mappings.items():
    try:
        df[col] = df[col].map(mapping)
    except Exception as e:
        pass

In [6]:
df.HandsetPrice = df.HandsetPrice.apply(lambda x : int(x) if x != 'Unknown' else 0)
price_mean = df.HandsetPrice.mean()
df.HandsetPrice = df.HandsetPrice.apply(lambda x : x if x > 0 else price_mean)

In [7]:
from sklearn.preprocessing import LabelEncoder
encoders = {}
for col in ['PrizmCode','Occupation','ServiceArea']:
    le = LabelEncoder()
    encoders[col] = le
    df[col] = le.fit_transform(df[col])
encoders

{'PrizmCode': LabelEncoder(),
 'Occupation': LabelEncoder(),
 'ServiceArea': LabelEncoder()}

In [8]:
# df.drop(columns=['CustomerID'], axis=1, inplace=True)

In [9]:
df.dropna(inplace=True)

In [10]:
df.Churn.value_counts()

Churn
0    22484
1     8639
Name: count, dtype: int64

In [11]:
# df = pd.concat([df[df.Churn == 0][:8639], df[df.Churn == 1]] ) 

In [12]:
X_data = df[['MonthlyRevenue', 'ServiceArea',
             'RoamingCalls','PercChangeRevenues','MonthsInService','RetentionCalls',
             'RetentionOffersAccepted','ReferralsMadeBySubscriber','AdjustmentsToCreditRating',
             'MadeCallToRetentionTeam','PeakCallsInOut',
             'ReceivedCalls','UnansweredCalls','OutboundCalls',
             'DroppedCalls','InboundCalls','BlockedCalls',
             'DirectorAssistedCalls','CustomerCareCalls', 'CurrentEquipmentDays'
             ]].copy()

y_data = df['Churn'].copy()

data_ = df[['CustomerID','MonthlyRevenue','MonthlyMinutes','TotalRecurringCharge','OverageMinutes',
             'RoamingCalls','PercChangeMinutes','PercChangeRevenues','MonthsInService','RetentionCalls',
             'RetentionOffersAccepted','NewCellphoneUser','NotNewCellphoneUser','ReferralsMadeBySubscriber',
             'AdjustmentsToCreditRating','MadeCallToRetentionTeam','CreditRating','PeakCallsInOut',
             'OffPeakCallsInOut','ReceivedCalls','UnansweredCalls','OutboundCalls','DroppedBlockedCalls',
             'DroppedCalls','InboundCalls','BlockedCalls','DirectorAssistedCalls','CustomerCareCalls',
             'CallWaitingCalls','CurrentEquipmentDays','HandsetRefurbished','IncomeGroup','PrizmCode',
             'Occupation','MaritalStatus','HandsetModels','AgeHH1','ChildrenInHH', 'Churn']].copy()

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, 
                                                    test_size=0.2, random_state=42, shuffle=True)

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13822, 20), (3456, 20), (13822,), (3456,))

In [17]:
import torch
from torch.utils.data import DataLoader, TensorDataset

In [18]:
train_dataset = torch.utils.data.TensorDataset(torch.tensor( X_train, dtype=torch.float32).to('cuda:0'), 
                                               torch.tensor( y_train.to_numpy(), dtype=torch.float32).to('cuda:0'))
test_dataset  = torch.utils.data.TensorDataset(torch.tensor( X_test,  dtype=torch.float32).to('cuda:0'), 
                                               torch.tensor( y_test.to_numpy(), dtype=torch.float32).to('cuda:0'))

In [19]:
BATCH_SIZE = 32
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [20]:
class ChurnModel(nn.Module):
    def __init__(self, input_size):
        super(ChurnModel,self).__init__()
        self.fc1   = nn.Linear(input_size, 1024)
        self.fc2   = nn.Linear(1024, 1024)
        self.fc3   = nn.Linear(1024, 512)
        self.fc4   = nn.Linear(512, 512)
        self.fc5   = nn.Linear(512, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.leaky_relu(x)
        x = self.fc2(x)
        x = F.leaky_relu(x)
        x = self.fc3(x)
        x = F.leaky_relu(x)
        x = self.fc4(x)
        x = F.leaky_relu(x)
        x = self.fc5(x)
        x = torch.sigmoid(x)

        return x

In [21]:
class ChurnModel2(nn.Module):
    def __init__(self, input_size):
        super(ChurnModel2,self).__init__()
        self.input_size = input_size

        self.fc1_1   = nn.Linear(input_size//3, 512)
        self.fc1_2   = nn.Linear(input_size//3, 512)
        self.fc1_3   = nn.Linear(input_size//3 + input_size%3, 512)

        self.fc2_1   = nn.Linear(512, 512)
        self.fc2_2   = nn.Linear(512, 512)
        self.fc2_3   = nn.Linear(512, 512)

        self.fc3   = nn.Linear(512 * 3, 256)
        # self.fc4   = nn.Linear(256, 64)
     
        self.fc5   = nn.Linear(256, 1)

    def forward(self, x):
        d_ = self.input_size//3
        x1 = x[:, :d_]
        x2 = x[:, d_:d_ *2]
        x3 = x[:, d_*2:]

        x1 = self.fc1_1(x1)
        x1 = F.leaky_relu(x1)
        x2 = self.fc1_2(x2)
        x2 = F.leaky_relu(x2)
        x3 = self.fc1_3(x3)
        x3 = F.leaky_relu(x3)

        x1 = self.fc2_1(x1)
        x1 = F.leaky_relu(x1)
        x2 = self.fc2_2(x2)
        x2 = F.leaky_relu(x2)
        x3 = self.fc2_3(x3)
        x3 = F.leaky_relu(x3)

        x4 = torch.cat((x1, x2, x3), dim=1)
        # print(x1.shape, x2.shape, x3.shape, x4.shape)
        
        x4 = self.fc3(x4)
        x4 = F.leaky_relu(x4)

        # x4 = self.fc4(x4)
        # x4 = F.leaky_relu(x4)
  
        x4 = self.fc5(x4)
        x4 = torch.sigmoid(x4)

        return x4

In [22]:
def train(model, train_loader, optimizer, loss_fn):
    model.train()
    total_acc, total_loss = 0, 0
    for X, y in train_loader:
        optimizer.zero_grad()
        preds = model(X)
        # display(preds, y)
        loss = loss_fn(preds, y.reshape(-1,1))
        loss.backward()
        optimizer.step()

        total_acc += ((preds>=0.5).float() ==  y.reshape(-1,1)).float().sum().item()
        total_loss += loss.item()*y.size(0)
        # print(f'total_loss = {total_loss}, total_acc={total_acc}')
    return total_acc/len(train_loader.dataset), total_loss/len(train_loader.dataset)

In [23]:
def evaluate(model, test_loader, loss_fn):
    model.eval()
    total_acc, total_loss = 0, 0

    with torch.no_grad():   # torch를 변경하지 마라. 테스트 동안
        for X, y in test_loader:
            preds = model(X)
            loss = loss_fn(preds, y.reshape(-1,1))

            total_acc += ((preds>=0.5).float() ==  y.reshape(-1,1)).float().sum().item()
            total_loss += loss.item()*y.size(0)

    return total_acc/len(test_loader.dataset), total_loss/len(test_loader.dataset)

In [24]:
num_epochs = 10

torch.manual_seed(1)

<torch._C.Generator at 0x7f8b2e4b0c70>

In [25]:
def learning(num_epochs,model, train_loader, test_loader, optimizer, loss_fn):
    for epoch in range(num_epochs):
        acc_train, loss_train = train(model, train_loader, optimizer, loss_fn)
        acc_valid, loss_valid = evaluate(model, test_loader, loss_fn)
        print(f'에포크 {epoch} 정확도: {acc_train:.4f} 검증 정확도: {acc_valid:.4f} \
            훈련 Loss: {loss_train:.4f} 검증 Loss: {loss_valid:.4f}')
    #     break

In [26]:
train_dataset[0][0].__len__()

20

In [27]:
loss_fns = {'BCELoss': nn.BCELoss(), 
            'HingeEmbeddingLoss': nn.HingeEmbeddingLoss(), 
            'BCEWithLogitsLoss': nn.BCEWithLogitsLoss() }

models = {'ChurnModel' : ChurnModel(train_dataset[0][0].__len__()), 
          'ChurnModel2': ChurnModel2(train_dataset[0][0].__len__())}
for model_name, model in models.items():
    model.to('cuda:0')
    print(f'{"*"*20} {model_name} {"*"*20}')
    for fn_name, loss_fn in loss_fns.items():
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        print(f'{"-"*15} {fn_name} {"-"*15}')
        learning(num_epochs, model, train_loader, test_loader, optimizer, loss_fn)

******************** ChurnModel ********************
--------------- BCELoss ---------------
에포크 0 정확도: 0.7385 검증 정확도: 0.7459             훈련 Loss: 0.5174 검증 Loss: 0.4879
에포크 1 정확도: 0.7536 검증 정확도: 0.7405             훈련 Loss: 0.4816 검증 Loss: 0.4855
에포크 2 정확도: 0.7558 검증 정확도: 0.7439             훈련 Loss: 0.4773 검증 Loss: 0.4818
에포크 3 정확도: 0.7565 검증 정확도: 0.7494             훈련 Loss: 0.4817 검증 Loss: 0.4804
에포크 4 정확도: 0.7584 검증 정확도: 0.7477             훈련 Loss: 0.4711 검증 Loss: 0.4881
에포크 5 정확도: 0.7580 검증 정확도: 0.7462             훈련 Loss: 0.4751 검증 Loss: 0.4813
에포크 6 정확도: 0.7636 검증 정확도: 0.7509             훈련 Loss: 0.4706 검증 Loss: 0.5040
에포크 7 정확도: 0.7608 검증 정확도: 0.7457             훈련 Loss: 0.4645 검증 Loss: 0.4777
에포크 8 정확도: 0.7602 검증 정확도: 0.7468             훈련 Loss: 0.4714 검증 Loss: 0.4840
에포크 9 정확도: 0.7645 검증 정확도: 0.7237             훈련 Loss: 0.4650 검증 Loss: 0.4962
--------------- HingeEmbeddingLoss ---------------
에포크 0 정확도: 0.5120 검증 정확도: 0.4965             훈련 Loss: 0.5128 검증 Loss: 0.4965
에포크 1 정확도