## Load packages

In [22]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

import torch
import torchvision
import torchvision.transforms as transforms

## Function to load data

In [13]:
def parse_telecom_data(filename_train,filename_test):
    '''
    Takes filename and returns X and Y after applying label encoding and OneHotEncoding

    Input:
        filename: name of CSV file to read
    Output:
        X: nparray of X data
        Y: nparray of labels
    '''
    X_train_in = pd.read_csv(filename_train)
    X_test_in = pd.read_csv(filename_test)
    ID_test = X_test_in[['customerID']].to_numpy()

    # get X with categorical data
    X_train_cat = X_train_in.drop(columns=['customerID','tenure','MonthlyCharges','TotalCharges','Discontinued'])
    X_test_cat = X_test_in.drop(columns=['customerID','tenure','MonthlyCharges','TotalCharges'])

    # get X with numeric data
    X_train_num = X_train_in[['tenure','MonthlyCharges']].to_numpy()
    X_test_num = X_test_in[['tenure','MonthlyCharges']].to_numpy()

    # get Y
    Y_train_cat = X_train_in.filter(['Discontinued'])

    # convert labels to numeric using LabelEncoder
    le = preprocessing.LabelEncoder()
    colList = X_train_cat.columns
    ncol_le = len(colList)
    m_train_rows = X_train_cat.shape[0]
    m_test_rows = X_test_cat.shape[0]
    X_train_le = np.zeros((m_train_rows,ncol_le))
    X_test_le = np.zeros((m_test_rows,ncol_le))
    for jdx in range(0,ncol_le):
        le.fit(X_train_cat[colList[jdx]])
        X_train_le[:,jdx] = le.transform(X_train_cat[colList[jdx]])
        X_test_le[:,jdx] = le.transform(X_test_cat[colList[jdx]])
    
    # get labels for training data
    Y_train = Y_train_cat.apply(le.fit_transform)
    Y_train = Y_train.to_numpy()

    # use OneHotEncoder (applied to encoded labels)
    enc = preprocessing.OneHotEncoder()
    enc.fit(X_train_le)
    X_train_ohl = enc.transform(X_train_le).toarray()
    X_test_ohl = enc.transform(X_test_le).toarray()

    # combine data
    n_ohl = X_train_ohl.shape[1]
    n_num = X_train_num.shape[1]
    X_train = np.zeros((m_train_rows,n_ohl+n_num))
    X_test = np.zeros((m_test_rows,n_ohl+n_num))
    X_train[:,0:n_ohl] = X_train_ohl
    X_test[:,0:n_ohl] = X_test_ohl
    X_train[:,n_ohl:] = X_train_num
    X_test[:,n_ohl:] = X_test_num

    return X_train,Y_train,X_test,ID_test

## Load Data

In [14]:
# get data
[X_train,Y_train,X_test,ID_test] = parse_telecom_data(filename_train='train.csv',filename_test='test.csv');

# check for NaN
if np.isnan(X_train).any():
    print('NaN in training data');
if np.isnan(X_test).any():
    print('NaN in test data');

## EDA

In [18]:
X_train_in = pd.read_csv('train.csv')
X_test_in = pd.read_csv('test.csv')

In [30]:
Y_train

array([[1],
       [0],
       [1],
       ...,
       [1],
       [0],
       [1]])

In [29]:
X_train_in.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Discontinued
0,1915-IOFGU,Female,0,No,No,1,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,No,Electronic check,70.5,70.5,Yes
1,6728-CZFEI,Female,0,No,No,15,Yes,No,DSL,No,...,No,No,Yes,No,One year,No,Mailed check,56.15,931.9,No
2,3863-IUBJR,Male,0,Yes,Yes,12,Yes,No,DSL,No,...,No,No,No,Yes,One year,No,Credit card (automatic),53.65,696.35,Yes
3,5572-ZDXHY,Female,0,No,No,22,Yes,No,Fiber optic,No,...,No,No,No,Yes,Month-to-month,Yes,Mailed check,84.3,1855.65,Yes
4,8348-HFYIV,Male,0,No,No,2,No,No phone service,DSL,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,49.25,90.35,Yes


In [21]:
X_train.shape

(5343, 45)

## Train Decision Tree Classifier

In [10]:
clf = RandomForestClassifier(n_estimators = 1000,criterion='gini');
#clf = DecisionTreeClassifier(criterion='gini');
clf.min_samples_leaf = 50;
clf.fit(X_train,Y_train.flatten());

## Compute metric for training data

In [11]:
y_train_prob = clf.predict_proba(X_train);
thisMetric = metrics.roc_auc_score(Y_train, y_train_prob[:,1]);
print("Training metric: ",thisMetric)

Training metric:  0.8563127960027641


## Save test prediction to CSV

In [12]:
y_test_prob = clf.predict_proba(X_test);
thisData = np.concatenate((ID_test, np.reshape(y_test_prob[:,1],(-1,1))), axis=1);
thisLabel = ['ID','TARGET'];
y_test_prob_pd = pd.DataFrame(data=thisData,columns=thisLabel)
fname_submit = 'test_submission.csv';
y_test_prob_pd.to_csv(fname_submit,index=False);

## Neural Network

### Init Neural Network Structure

In [26]:
print(X_train.shape)
print(Y_train.shape)

(5343, 45)
(5343, 1)


In [101]:
# set batch size
batch_size = 32

# Prepare dataset using DataLoader
class telecomDataset(torch.utils.data.Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
    
# create dataset
# split training data into training and validation
N_train = int(X_train.shape[0] * 4 / 5)
print('Number of training data is ',N_train)
X_train_train = X_train[0:N_train,:]
X_train_val = X_train[N_train:,:]
Y_train_train = Y_train[0:N_train]
Y_train_val = Y_train[N_train:]

# create training dataset
train_dataset = telecomDataset(torch.tensor(X_train_train).float(), torch.tensor(Y_train_train).float())
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# create testing dataset
test_dataset = telecomDataset(torch.tensor(X_train_val).float(), torch.tensor(Y_train_val).float())
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Number of training data is  4274


In [102]:
print(f'{len(train_loader)} training batches')
print(f'{len(train_loader) * batch_size} training samples')
print(f'{len(test_loader)} validation batches')

134 training batches
4288 training samples
34 validation batches


In [103]:
import torch.nn as nn

model = nn.Sequential(
    
    # nn.Flatten(),
    nn.Linear(45, 64),
    nn.ReLU(),

    nn.Linear(64, 128),
    nn.ReLU(),

    nn.Dropout(0.5),

    nn.Linear(128, 256),
    nn.ReLU(),

    nn.Linear(256, 256),
    nn.ReLU(),

    nn.Dropout(0.5),

    nn.Linear(256, 64),
    nn.ReLU(),

    nn.Linear(64, 32),
    nn.ReLU(),

    nn.Linear(32, 1),

    nn.Sigmoid()
    # PyTorch implementation of cross-entropy loss includes softmax layer
)

In [104]:
for p in model.parameters():
    print(p.data.shape)

torch.Size([64, 45])
torch.Size([64])
torch.Size([128, 64])
torch.Size([128])
torch.Size([256, 128])
torch.Size([256])
torch.Size([256, 256])
torch.Size([256])
torch.Size([64, 256])
torch.Size([64])
torch.Size([32, 64])
torch.Size([32])
torch.Size([1, 32])
torch.Size([1])


In [105]:
import torch.optim as optim
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [106]:
n_epochs = 100

# store metrics
training_accuracy_history = np.zeros([n_epochs, 1])
training_loss_history = np.zeros([n_epochs, 1])
validation_accuracy_history = np.zeros([n_epochs, 1])
validation_loss_history = np.zeros([n_epochs, 1])

for epoch in range(n_epochs):
    print(f'Epoch {epoch+1}/10:', end='')
    train_total = 0
    train_correct = 0
    # train
    model.train()
    for i, data in enumerate(train_loader):
        images, labels = data
        optimizer.zero_grad()
        # forward pass
        output = model(images)
        # calculate categorical cross entropy loss
        loss = criterion(output, labels)
        # backward pass
        loss.backward()
        optimizer.step()
        
        # track training accuracy
        output = (output > 0.5).float()
        train_total += labels.size(0)
        train_correct += (output == labels).sum().item()
        # track training loss
        training_loss_history[epoch] += loss.item()
        # progress update after 180 batches (~1/10 epoch for batch size 32)
        if i % 180 == 0: print('.',end='')
    training_loss_history[epoch] /= len(train_loader)
    training_accuracy_history[epoch] = train_correct / train_total
    print(f'\n\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')
        
    # validate
    test_total = 0
    test_correct = 0
    with torch.no_grad():
        model.eval()
        for i, data in enumerate(test_loader):
            images, labels = data
            # forward pass
            output = model(images)
            # find accuracy
            output = (output > 0.5).float()
            test_total += labels.size(0)
            test_correct += (output == labels).sum().item()
            # find loss
            loss = criterion(output, labels)
            validation_loss_history[epoch] += loss.item()
        validation_loss_history[epoch] /= len(test_loader)
        validation_accuracy_history[epoch] = test_correct / test_total
    print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')

Epoch 1/10:.
	loss: 0.1643, acc: 0.7606, val loss: 0.2072, val acc: 0.7905
Epoch 2/10:.
	loss: 0.1510, acc: 0.7803, val loss: 0.2026, val acc: 0.7951
Epoch 3/10:.
	loss: 0.1445, acc: 0.7929, val loss: 0.1999, val acc: 0.7979
Epoch 4/10:.
	loss: 0.1433, acc: 0.7948, val loss: 0.1944, val acc: 0.8036
Epoch 5/10:.
	loss: 0.1402, acc: 0.7990, val loss: 0.2026, val acc: 0.7951
Epoch 6/10:.
	loss: 0.1389, acc: 0.7978, val loss: 0.1962, val acc: 0.8017
Epoch 7/10:.
	loss: 0.1387, acc: 0.7974, val loss: 0.1916, val acc: 0.8064
Epoch 8/10:.
	loss: 0.1387, acc: 0.7976, val loss: 0.1971, val acc: 0.8007
Epoch 9/10:.
	loss: 0.1396, acc: 0.7993, val loss: 0.2008, val acc: 0.7970
Epoch 10/10:.
	loss: 0.1374, acc: 0.8042, val loss: 0.1907, val acc: 0.8073
Epoch 11/10:.
	loss: 0.1382, acc: 0.8002, val loss: 0.1815, val acc: 0.8167
Epoch 12/10:.
	loss: 0.1354, acc: 0.8007, val loss: 0.1999, val acc: 0.7979
Epoch 13/10:.
	loss: 0.1371, acc: 0.8063, val loss: 0.1953, val acc: 0.8026
Epoch 14/10:.
	loss: 

In [111]:
model.eval()
y_train_prob = model(torch.from_numpy(X_train).float()).detach().numpy()
thisMetric = metrics.roc_auc_score(Y_train, y_train_prob);
print("Training metric: ",thisMetric)

Training metric:  0.870708706144556


In [112]:
model.eval()
y_test_prob = model(torch.from_numpy(X_test).float()).detach().numpy()
thisData = np.concatenate((ID_test, np.reshape(y_test_prob[:],(-1,1))), axis=1)
thisLabel = ['ID','TARGET']
y_test_prob_pd = pd.DataFrame(data=thisData,columns=thisLabel)
fname_submit = 'test_submission_Wei.csv'
y_test_prob_pd.to_csv(fname_submit,index=False)