In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Folder on the mounted drive that the notebook's file paths are built from.
data_folder = '/content/drive/MyDrive/data-analytics/uc5'

In [None]:
# Load the raw attrition data; the first row of the file holds the column names.
df = pd.read_csv(os.path.join(data_folder, 'Attrition - Data.csv'), header=0, sep=',')
# NOTE: a bare `df.set_index("New_ID")` call used to follow. Its result was never assigned,
# so it had no effect, and the rest of the notebook reads `New_ID` as a regular column.
print(df.columns)

Index(['New_ID', 'Month', 'Txn', 'Status', 'FUA', 'Services', 'Has_Payroll',
       'Has_Investment', 'Has_VISA', 'Age', 'Beacon score'],
      dtype='object')


In [None]:
# Rows per account status.
df_1 = df.loc[df["Status"].eq("Inactive")]
df_2 = df.loc[df["Status"].eq("Closed")]
df_3 = df.loc[df["Status"].eq("Active")]

# Distinct customers that show up at least once with each status.
inActives = df_1["New_ID"].unique()
closed = df_2["New_ID"].unique()
active = df_3["New_ID"].unique()

print("Inactive: ", len(inActives))
print("Closed:", len(closed))
print("Active", len(active))

Inactive:  1575
Closed: 51
Active 10402


In [None]:
# Customers seen as Active that never appear as Closed, and those that never appear
# as Inactive.
# NOTE(review): the variable names hint at inactive/closed transitions, but both sets are
# derived from the Active customers - confirm this is the intended comparison.
inActives_close = set(active) - set(closed)
close_inActive = set(active) - set(inActives)
print(len(inActives_close))
print(len(close_inActive))

# Customers that appear with both the Inactive and the Closed status.
ins = set(inActives) & set(closed)
print("Inactive turns to closed:", len(ins))

# Number of distinct values in the Month column (missing values would count as one).
print("Total Months", df["Month"].nunique(dropna=False))

10351
8827
Inactive turns to closed: 0
Total Months 48


In [None]:
def refill_1_and_drop_other(customer_df):
    """Patch short gaps in one customer's records and drop the rows that stay incomplete.

    Nothing is touched unless the ``FUA`` column has at least one missing value. When it
    does, every column is forward-filled by at most one row per gap, and any row that still
    has a missing value in any column is dropped.

    Args:
        customer_df: DataFrame with the rows of a single customer; must have an ``FUA``
            column. Rows are presumably in chronological order - verify against the caller.

    Returns:
        ``customer_df`` itself when ``FUA`` has no missing values, otherwise a new
        DataFrame with the filled and filtered rows.
    """
    if not customer_df['FUA'].isna().any():
        return customer_df
    # `DataFrame.ffill` replaces `fillna(method='ffill')`, which pandas has deprecated.
    return customer_df.ffill(limit=1).dropna()

In [None]:
# NOTE: the bare `df.set_index("New_ID")` call that used to open this cell was removed.
# Its result was discarded, so it never changed `df`, and `New_ID` is needed as a column
# for the groupby below.

# Row count and status distribution before cleaning.
print(len(df.index))
print(df['Status'].value_counts())

# Clean each customer's rows separately so values are never forward-filled from one
# customer into the next.
grouped = df.groupby('New_ID')

cleaned_df = pd.concat([refill_1_and_drop_other(group) for _, group in grouped])

# Persist the cleaned rows (without the DataFrame index).
cleaned_df.to_csv(os.path.join(data_folder, 'clean_up.csv'), index=False)

499296
Active      469893
Inactive     28350
Closed        1053
Name: Status, dtype: int64


In [None]:
# Work on a copy: `cleaned_df` stays untouched, so re-running this cell starts from the
# same input instead of rescaling columns that were already rescaled.
df = cleaned_df.copy()

# Encode the payroll flag as 1/0; both the 'Yes' and 'yes' spellings map to 1.
df['Has_Payroll'] = df['Has_Payroll'].replace({'Yes': 1, 'yes': 1, 'No': 0})

# Scale the numeric features by fixed divisors.
df['Age'] = df['Age']/100
df['Services'] = df['Services']/10
df['Txn'] = df['Txn']/100

# FUA: log10 with a 1e-9 offset (keeps log10(0) finite); NaN results are replaced by -1,
# then the column is min-max scaled to [0, 1].
df['FUA'] = np.nan_to_num(np.log10(df['FUA']+1e-9), nan=-1)
df['FUA'] = (df['FUA'] - df['FUA'].min())/(df['FUA'].max() - df['FUA'].min())

# Beacon score: min-max scaled to [0, 1].
df['Beacon score'] = (
    (df['Beacon score'] - df['Beacon score'].min())
    / (df['Beacon score'].max() - df['Beacon score'].min())
)

# Active rows without any transaction are relabelled as 'Churning' (Txn was only divided
# by 100 above, so a zero is still a zero).
df.loc[(df['Txn'] == 0) & (df['Status'] == 'Active'), 'Status'] = 'Churning'

# Integer class id per status; values outside the mapping are left as they are.
df['target'] = df['Status'].replace({'Active': 0, 'Churning': 1, 'Inactive': 2, 'Closed': 3})

df.to_csv(os.path.join(data_folder, 'preprocessed.csv'), index=False)

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [None]:
class Predictor(nn.Module):
    """Linear classification head on top of the ``Pre_trained`` feature extractor.

    Args:
        in_features: Size of the flattened tagger output for one sample. Defaults to 48,
            the value that was previously hard-coded.
        num_classes: Number of output logits. Defaults to 4, the value that was
            previously hard-coded.
    """

    def __init__(self, in_features=48, num_classes=4):
        super().__init__()
        # NOTE(review): `Pre_trained` is not defined in the visible part of the notebook;
        # it has to be defined or imported before this class is instantiated.
        self.tagger = Pre_trained()
        self.linear = nn.Linear(in_features=in_features, out_features=num_classes)

    def forward(self, input):
        """Return the class logits, shape ``(batch, num_classes)``, for a batch of inputs."""
        output = self.tagger(input)
        # Collapse everything except the batch axis; `reshape` also copes with
        # non-contiguous tagger outputs.
        output = output.reshape(output.shape[0], -1)
        return self.linear(output)

In [None]:
class MyDataset(Dataset):
    """Dataset over ``[features, label]`` pairs, served as tensors."""

    def __init__(self, data):
        # Indexable collection; each item holds the feature array first, the label second.
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return sample ``index`` as ``(float32 feature tensor, label tensor)``."""
        sample = self.data[index]
        features = sample[0].astype(np.float32)
        target = sample[1]
        return torch.tensor(features), torch.tensor(target)

In [None]:
class MyDataset_pred(Dataset):
    """Dataset over ``[features, customer id]`` pairs used at prediction time."""

    def __init__(self, data):
        # Indexable collection; each item holds the feature array first, the id second.
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return sample ``index`` as ``(float32 feature tensor, id)``; the id is passed through as is."""
        sample = self.data[index]
        features = sample[0].astype(np.float32)
        customer_id = sample[1]
        return torch.tensor(features), customer_id

In [None]:
def train_model(model, trainloader, optimizer, criterion, epoch):
    """Run one training epoch.

    Args:
        model: Module to train; must expose a ``tagger`` sub-module.
        trainloader: Iterable yielding ``(input, gt_label)`` batches.
        optimizer: Optimizer holding the model parameters.
        criterion: Loss called as ``criterion(logits, gt_label)``. The accumulated loss
            assumes it returns the batch mean - TODO confirm if another reduction is used.
        epoch: Zero-based epoch index; the tagger is frozen while ``epoch < 2``.

    Returns:
        ``(train_loss, acc)``: the loss summed over all seen samples (divide by the sample
        count for a mean) and the fraction of correctly classified samples (0.0 when the
        loader yields nothing).
    """
    # Run on whatever device the model already lives on, rather than a hard-coded one.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    # The tagger is frozen for the first two epochs, so only the rest of the model is updated.
    tagger_trainable = epoch >= 2
    for param in model.tagger.parameters():
        param.requires_grad = tagger_trainable

    train_loss = 0.0
    correct, seen = 0, 0
    for input, gt_label in trainloader:
        # `Tensor.to` returns a new tensor; the result has to be kept.
        input = input.to(device)
        gt_label = gt_label.to(device)

        optimizer.zero_grad()
        predict = model(input)
        loss = criterion(predict, gt_label)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the number of samples in the batch.
        batch_size = gt_label.size(0)
        train_loss += loss.item() * batch_size
        correct += (torch.argmax(predict, dim=1) == gt_label).sum().item()
        seen += batch_size

    acc = correct / seen if seen else 0.0
    return train_loss, acc

In [None]:
def eval_model(model, val_dataloader, criterion):
    """Evaluate the model on a data loader without updating any weights.

    Args:
        model: Module to evaluate.
        val_dataloader: Iterable yielding ``(input, gt_label)`` batches.
        criterion: Loss called as ``criterion(logits, gt_label)``. The accumulated loss
            assumes it returns the batch mean - TODO confirm if another reduction is used.

    Returns:
        ``(val_loss, acc)``: the loss summed over all seen samples (divide by the sample
        count for a mean) and the fraction of correctly classified samples (0.0 when the
        loader yields nothing).
    """
    # Run on whatever device the model already lives on, rather than a hard-coded one.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    val_loss = 0.0
    correct, seen = 0, 0
    with torch.no_grad():
        for input, gt_label in val_dataloader:
            # `Tensor.to` returns a new tensor; the result has to be kept.
            input = input.to(device)
            gt_label = gt_label.to(device)

            predict = model(input)
            loss = criterion(predict, gt_label)

            # Weight the batch loss by the number of samples in the batch.
            batch_size = gt_label.size(0)
            val_loss += loss.item() * batch_size
            correct += (torch.argmax(predict, dim=1) == gt_label).sum().item()
            seen += batch_size

    acc = correct / seen if seen else 0.0
    return val_loss, acc

In [None]:
def predict(model, predict_dataloader):
    """Predict a status and the four class probabilities for every sample of a loader.

    Args:
        model: Module returning one logit per class (four classes are read).
        predict_dataloader: Iterable yielding ``(input, ids)`` batches of any batch size;
            ``ids`` is either a tensor or a sequence holding one id per sample.

    Returns:
        DataFrame with one row per sample and the columns ``New_ID``, ``Pred_Status``,
        ``Pred_P_Active``, ``Pred_P_Churning``, ``Pred_P_Inactive``, ``Pred_P_Closed``.
    """
    columns = ['New_ID', 'Pred_Status', 'Pred_P_Active', 'Pred_P_Churning',
               'Pred_P_Inactive', 'Pred_P_Closed']

    # Run on whatever device the model lives on instead of relying on a global `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_ids, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for input, ids in predict_dataloader:
            # `Tensor.to` returns a new tensor; the result has to be kept.
            logits = model(input.to(target_device))
            all_labels.append(torch.argmax(logits, dim=1).cpu())
            all_probs.append(F.softmax(logits, dim=1).cpu())
            # Batching wraps the ids in a container (tensor or sequence); unwrap it so each
            # row stores the plain id rather than the container.
            all_ids.extend(ids.tolist() if torch.is_tensor(ids) else list(ids))

    if not all_ids:
        return pd.DataFrame(columns=columns)

    labels = torch.cat(all_labels).numpy()
    probs = torch.cat(all_probs).numpy()
    # Build the frame once instead of appending one row at a time.
    result = pd.DataFrame({
        'New_ID': all_ids,
        'Pred_Status': labels,
        'Pred_P_Active': probs[:, 0],
        'Pred_P_Churning': probs[:, 1],
        'Pred_P_Inactive': probs[:, 2],
        'Pred_P_Closed': probs[:, 3],
    })
    return result[columns]

In [None]:
# Run configuration.
device = "cpu"    # device the model is moved to
BATCH_SIZE = 32   # batch size of the training and validation loaders
EPOCH = 10        # number of training epochs
TRAIN = True      # False: skip training and load previously saved weights instead

# Feature columns fed to the model, in this order.
selected_columns = [
    'Txn', 'FUA', 'Services', 'Has_Payroll',
    'Has_Investment', 'Has_VISA', 'Age', 'Beacon score',
]

# Reload the preprocessed table from disk.
preprocessed_path = os.path.join(data_folder, 'preprocessed.csv')
df = pd.read_csv(preprocessed_path, header=0, sep=',')

# Build 12-row windows over the 8 selected feature columns, per customer.
#   train_data / train_label : windows paired with the target of a later row (training samples)
#   all_pred                 : last window of every customer that was never 'Closed'
#   pred_by_last_status      : the same last windows, bucketed by the target of the last row
all_pred = []  # [window, customer id] pairs to predict
pred_by_last_status = [[], [], []]  # bucket index = last target: 0 active, 1 churn, 2 inactive (original note: 202112)

train_data = []
train_label = []
for customer, customer_df in df.groupby('New_ID'):
    if (customer_df['Status'] == 'Closed').sum() > 0:
        # Customer has at least one 'Closed' row: the number of rows is not assumed, and
        # customers with 12 rows or fewer contribute nothing.
        if len(customer_df.index) > 12:
            data_df = customer_df[selected_columns]
            # Number of full 12-row windows and number of leading rows left over.
            slices = int(np.floor(len(customer_df.index) / 12))
            remove = len(customer_df.index) % 12
            if remove == 0:
                # Exact multiple of 12: give up one window so that every window still has
                # a following row to take its label from.
                remove = 12
                slices = slices - 1
            # Features: rows remove-1 .. n-2 (the last row is dropped), i.e. slices*12 rows.
            data = data_df.values[remove-1:]
            data = data[:-1].reshape(slices, 12, 8)
            label_df = customer_df[['target']]
            # Targets: rows remove .. n-1, i.e. shifted one row later than the features.
            label = label_df.values[remove:].ravel().reshape(slices, 12)
            for i in range(slices):
                train_data.append(data[i])
                # Target of the row that directly follows the window's last feature row.
                train_label.append(label[i][11])
    else:
        # Never-closed customer: the reshape below requires exactly 48 rows (4 windows of
        # 12) and raises otherwise.
        data_df = customer_df[selected_columns]
        data = data_df.values.reshape(4, 12, 8)
        label_df = customer_df[['target']]
        label = label_df.values.ravel().reshape(4, 12)
        for i in range(3):
            train_data.append(data[i])
            # Label of window i = target of the first row of window i+1.
            train_label.append(label[i+1][0])
        # The fourth window has no following row to supply a label, so it is the one to predict.
        all_pred.append([data[3], customer])
        # label[3][11] is the target of the customer's last row; it selects the bucket.
        pred_by_last_status[label[3][11]].append([data[3], customer])

# Seed torch so weight initialisation and batch shuffling are repeatable between runs
# (the train/val/test split below is already seeded through `random_state`).
torch.manual_seed(42)

# Pair every window with its label so both are split together.
prepare_dataset = [[window, target] for window, target in zip(train_data, train_label)]

# 80/20 split into train+val / test, then 90/10 split of the first part into train / val.
train_val_set, test_set = train_test_split(prepare_dataset, test_size=0.2, random_state=42, shuffle=True)
train_set, val_set = train_test_split(train_val_set, test_size=0.1, random_state=42, shuffle=True)

train_dataset = MyDataset(train_set)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# Validation keeps the last partial batch: the reported loss is averaged over
# len(val_set), so no validation sample should be silently skipped.
val_dataset = MyDataset(val_set)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

model = Predictor()
model.to(device)

criterion = nn.CrossEntropyLoss()
# Alternative kept for reference:
# optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, weight_decay=1e-6)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.8)

if TRAIN:
    # train
    # Optional warm start of the tagger from saved weights (currently disabled):
    # pretrained_model = torch.load(os.path.join(data_folder, "learnModel.pth"))
    # model.tagger.load_state_dict(pretrained_model)
    for epoch in range(EPOCH):
        # `train_model` freezes the tagger for epochs 0-1 and unfreezes it afterwards, so
        # the identical loop that used to be repeated here was redundant.
        train_loss, train_acc = train_model(model, train_loader, optimizer, criterion, epoch)
        val_loss, val_acc = eval_model(model, val_loader, criterion)
        print("Epoch: ", epoch+1)
        print("Train: loss", train_loss / len(train_set), " acc: ", train_acc)
        print("Validate: loss", val_loss / len(val_set), " acc: ", val_acc)

    torch.save(model.state_dict(), os.path.join(data_folder, "2_churnModel.pth"))

    # test
    test_dataset = MyDataset(test_set)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, drop_last=False)

    test_loss, test_acc = eval_model(model, test_loader, criterion)
    # Average over the test set; this used to divide by len(val_set).
    print("\nTest: loss", test_loss / len(test_set), " acc: ", test_acc)
else:
    # Reuse the weights saved by an earlier training run; `map_location` makes the load
    # independent of the device the checkpoint was written from.
    weights = torch.load(os.path.join(data_folder, "2_churnModel.pth"), map_location=device)
    model.load_state_dict(weights)

# Predict the windows of all_pred, one sample per batch, and save the result.
pred_dataset = MyDataset_pred(all_pred)
pred_loader = DataLoader(pred_dataset, batch_size=1, shuffle=False, drop_last=False)
pred_df = predict(model, pred_loader)
all_predictions_path = os.path.join(data_folder, "churn2_predict.csv")
pred_df.to_csv(all_predictions_path, index=False)

# Same prediction for each of the three last-status buckets; the files are numbered from 1.
for i in range(3):
    bucket = pred_by_last_status[i]
    pred_dataset = MyDataset_pred(bucket)
    pred_loader = DataLoader(pred_dataset, batch_size=1, shuffle=False, drop_last=False)
    pred_df = predict(model, pred_loader)

    file_name = f"churn2_predict_{i+1}.csv"
    pred_df.to_csv(os.path.join(data_folder, file_name), index=False)