In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('customer_churn.csv')

In [3]:
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
data.drop(columns = ['customerID'], inplace=True)

In [5]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [6]:
data.dropna(subset=['TotalCharges'], inplace=True)

In [7]:
data.rename(columns = {"Churn": "Class"}, inplace=True)

In [8]:
clss = data["Class"].unique()
print(clss)

['No' 'Yes']


In [9]:
nonNumericColumns = data.select_dtypes(exclude = ['int64', 'float64'])
nonNumericColumns.columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'Class'],
      dtype='object')

In [10]:
nonNumericColumns['SeniorCitizen'] = data['SeniorCitizen']

In [11]:
nonNumericColumns.columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'Class', 'SeniorCitizen'],
      dtype='object')

In [12]:
numericColumns = data.drop(columns = nonNumericColumns.columns)

In [13]:
cols = {}
for c in nonNumericColumns.columns:
    cols[c] = {val : index for index, val in enumerate(data[c].unique())}
    data[c] = [cols[c][val] for val in data[c]]
cols

{'gender': {'Female': 0, 'Male': 1},
 'Partner': {'Yes': 0, 'No': 1},
 'Dependents': {'No': 0, 'Yes': 1},
 'PhoneService': {'No': 0, 'Yes': 1},
 'MultipleLines': {'No phone service': 0, 'No': 1, 'Yes': 2},
 'InternetService': {'DSL': 0, 'Fiber optic': 1, 'No': 2},
 'OnlineSecurity': {'No': 0, 'Yes': 1, 'No internet service': 2},
 'OnlineBackup': {'Yes': 0, 'No': 1, 'No internet service': 2},
 'DeviceProtection': {'No': 0, 'Yes': 1, 'No internet service': 2},
 'TechSupport': {'No': 0, 'Yes': 1, 'No internet service': 2},
 'StreamingTV': {'No': 0, 'Yes': 1, 'No internet service': 2},
 'StreamingMovies': {'No': 0, 'Yes': 1, 'No internet service': 2},
 'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
 'PaperlessBilling': {'Yes': 0, 'No': 1},
 'PaymentMethod': {'Electronic check': 0,
  'Mailed check': 1,
  'Bank transfer (automatic)': 2,
  'Credit card (automatic)': 3},
 'Class': {'No': 0, 'Yes': 1},
 'SeniorCitizen': {0: 0, 1: 1}}

In [14]:
def featureScaling():
    for i in data.keys():
        if i != 'Class':
            m = max(data[i])
            data[i] /= m
    return
featureScaling()

In [15]:
trainData = data.sample(frac=0.8, random_state=0)

In [16]:
testData = data.drop(trainData.index)

### Performance Parameters

In [17]:
def accuracy(mat):
    return (mat[0][0] + mat[1][1])/(mat[0][0] + mat[0][1] + mat[1][0] + mat[1][1])
def precision(mat):
    return (mat[0][0])/(mat[0][0] + mat[1][0])
def recall(mat):
    return (mat[0][0])/(mat[0][0] + mat[0][1])
def F1Score(p,r):
    return (2*(p*r))/(p + r)

### Logistic Regression

In [18]:
def costFunction(x,y,w,b):
    m = len(x)
    mat = np.matmul(x,w) + b
    probs = 1 / (1 + np.exp((-1)*np.clip(mat, -500, 500))).astype(np.longdouble)
    epsilon = 1e-15
    probs = np.clip(probs, epsilon, 1 - epsilon)
    J = -1/m * (y * np.log(probs) + (1 - y) * np.log(1 - probs))
    return sum(J)

In [19]:
def gradient(x,y,w,b,learning_rate,iterations=10000,stoping_threshold=0.000000000001):
    x = x.to_numpy()
    y = y.to_numpy()
    y = y.reshape(len(y), 1)
    prev_cost = None
    bol = True
    for j in range(iterations):
        crnt_cost = costFunction(x,y,w,b)
        if prev_cost is not None and abs(prev_cost - crnt_cost) <= stoping_threshold:
            break
        m = len(x)
        prev_cost = crnt_cost
        mat = np.matmul(x,w) + b
        u = 1/(1 + np.exp((-1)*np.clip(mat, -500, 500))).astype(np.longdouble)
        derivative = np.subtract(u, y)
        n = len(w) 
        for i in range(n):
            derv = np.multiply(derivative,x[:,i].reshape(m,1))
            w[i,0] = w[i,0] - learning_rate*sum(derv)
        b = b - learning_rate*sum(derivative)
    print(j)
    return w, b

In [20]:
y = trainData['Class']

In [21]:
x = trainData.drop(columns = ['Class'])

In [22]:
w = np.ones(len(x.columns))
w = w.reshape(len(x.columns),1)

In [23]:
b, b0 = gradient(x,y,w,1,0.0001)

9999


In [24]:
print(b, b0)

[[-0.06590894]
 [ 0.20443265]
 [-0.04433875]
 [-0.20940883]
 [-3.89671074]
 [-0.9962877 ]
 [ 0.35360214]
 [ 0.97656503]
 [-0.94026788]
 [ 0.55875032]
 [-0.20444364]
 [-0.98595642]
 [ 0.19989975]
 [ 0.14602301]
 [-1.31840159]
 [-0.37433053]
 [-0.39793635]
 [ 1.50950269]
 [ 2.16895611]] [0.12152641]


In [25]:
yTest = testData['Class']

In [26]:
xTest = testData.drop(columns = ['Class'])

In [27]:
b = b.ravel()

In [28]:
yOutput = []
b = b.ravel()
for d in xTest.iterrows():
    out = sum(np.multiply(d[1].to_numpy(), b)) + b0[0]
    out = 1/(1 + np.exp((-1)*np.clip(out, -500, 500)))
    if out >= 0.5:
        out = 1
    else:
        out = 0
    yOutput.append((out, yTest[d[0]]))

In [29]:
def confusion():
    confmat = np.array([0,0,0,0])
    confmat = confmat.reshape(2,2)
    for _, d in testData.iterrows():
        y = d['Class']
        x = d.drop('Class')
        out = sum(np.multiply(x.to_numpy(), b)) + b0[0]
        predy = 1/(1 + np.exp((-1)*np.clip(out, -500, 500)))
        if predy >= 0.5:
            predy = 1
        else:
            predy = 0
        if y == predy:
            if y == 0:
                confmat[0][0] += 1
            else:
                confmat[1][1] += 1
        else:
            if y == 0:
                confmat[1][0] += 1
            else:
                confmat[0][1] += 1
    return confmat

In [30]:
mat = confusion()
mat

array([[947, 184],
       [ 86, 189]])

In [31]:
acc = accuracy(mat)
pre = precision(mat)
rec = recall(mat)
fscore = F1Score(pre, rec)
print(acc, pre, rec, fscore)

0.8079658605974396 0.9167473378509197 0.8373121131741822 0.8752310536044362


### Navie Bayes Classifier

In [32]:
def probabilitiesClass(cls):
    n = len(trainData[trainData['Class'] == cls])
    d = {}
    for key in nonNumericColumns.keys():
        if key != 'Class':
            dict = {}
            for i in set(trainData[key]):
                dict[i] = len(trainData[(trainData['Class'] == cls) & (trainData[key] == i)])/n
            d[key] = dict
    return d

In [33]:
numericColumns.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [34]:
def probabilitiesNormalDistribution(cls):
    n = len(trainData[trainData['Class'] == cls])
    d = {}
    for key in numericColumns.keys():
        dict = {}
        dict['mean'] = trainData[key].mean()
        dict['std'] = trainData[key].std()
        d[key] = dict
    return d

In [35]:
probabilitiesNormalDistribution(1)

{'tenure': {'mean': 0.44896709720741, 'std': 0.3408913437310986},
 'MonthlyCharges': {'mean': 0.5438186614777255, 'std': 0.25379919961158853},
 'TotalCharges': {'mean': 0.26039639542669696, 'std': 0.259726672030239}}

In [36]:
statics = {}
for cls in set(data['Class']):
    statics[cls] = probabilitiesNormalDistribution(cls)
statics

{0: {'tenure': {'mean': 0.44896709720741, 'std': 0.3408913437310986},
  'MonthlyCharges': {'mean': 0.5438186614777255, 'std': 0.25379919961158853},
  'TotalCharges': {'mean': 0.26039639542669696, 'std': 0.259726672030239}},
 1: {'tenure': {'mean': 0.44896709720741, 'std': 0.3408913437310986},
  'MonthlyCharges': {'mean': 0.5438186614777255, 'std': 0.25379919961158853},
  'TotalCharges': {'mean': 0.26039639542669696, 'std': 0.259726672030239}}}

In [37]:
prob = {}
for cls in set(data['Class']):
    prob[cls] = probabilitiesClass(cls)
prob

{0: {'gender': {0.0: 0.49322033898305084, 1.0: 0.5067796610169492},
  'Partner': {0.0: 0.524455205811138, 1.0: 0.475544794188862},
  'Dependents': {0.0: 0.6537530266343826, 1.0: 0.34624697336561744},
  'PhoneService': {0.0: 0.09927360774818401, 1.0: 0.9007263922518159},
  'MultipleLines': {0.5: 0.49007263922518157,
   0.0: 0.09927360774818401,
   1.0: 0.41065375302663437},
  'InternetService': {0.0: 0.375544794188862,
   1.0: 0.2765133171912833,
   0.5: 0.3479418886198547},
  'OnlineSecurity': {0.0: 0.39322033898305087,
   1.0: 0.2765133171912833,
   0.5: 0.3302663438256659},
  'OnlineBackup': {0.5: 0.35980629539951575,
   1.0: 0.2765133171912833,
   0.0: 0.36368038740920094},
  'DeviceProtection': {0.0: 0.362227602905569,
   1.0: 0.2765133171912833,
   0.5: 0.3612590799031477},
  'TechSupport': {0.0: 0.3891041162227603,
   1.0: 0.2765133171912833,
   0.5: 0.33438256658595644},
  'StreamingTV': {0.0: 0.3593220338983051,
   1.0: 0.2765133171912833,
   0.5: 0.3641646489104116},
  'Stream

In [38]:
def NaiveBayesClassifier(X):
    predcls = None
    maxval = 0
    for cls in prob.keys():
        val = 1
        for k in X.keys():
            if k in numericColumns.keys():
                exponent = -((X[k] - statics[cls][k]['mean']) ** 2) / (2 * statics[cls][k]['std'] ** 2)
                val *= (1 / (np.sqrt(2 * np.pi) * statics[cls][k]['std'])) * np.exp(exponent)
            else:
                temp = prob[cls].get(k,{})
                val *= temp.get(X[k],0)
        if val > maxval:
            maxval = val
            predcls = cls
    return predcls  

In [39]:
def confusion():
    classes = {}
    clss = trainData["Class"].unique()
    i = 0
    for clas in clss:
        classes[clas] = i
        i += 1
    n = len(classes)
    confmat = np.zeros(n*n)
    confmat = confmat.reshape(n,n)
    for _, d in testData.iterrows():
        y = d['Class']
        x = d.drop('Class')
        predy = NaiveBayesClassifier(x)
        confmat[classes[predy]][classes[y]] += 1
    return confmat

In [40]:
mat = confusion() 
mat

array([[632.,  65.],
       [401., 308.]])

In [41]:
acc = accuracy(mat)
pre = precision(mat)
rec = recall(mat)
fscore = F1Score(pre, rec)
print(acc, pre, rec, fscore)

0.6685633001422475 0.611810261374637 0.9067431850789096 0.730635838150289


### ANN

In [42]:
y = trainData['Class']

In [43]:
x = trainData.drop(columns = ['Class'])

In [44]:
def backpropagation(x,y,learning_rate,n1,n2,n3,iterations=1000):
    w1 = np.ones(n1*n2)
    w1 = w1.reshape(n1,n2)
    w2 = np.ones(n2*n3)
    w2 = w2.reshape(n2,n3)
    for i in range(len(w1)):
        for j in range(len(w1[0])):
            w1[i][j] = 0.5
    x = x.to_numpy()
    y = y.to_numpy()
    n = len(x)
    for i in range(iterations):
        for d in range(n):
            #Forward Pass
            hnet = np.matmul(x[d],w1)
            hout = 1/(1 + np.exp((-1)*hnet)).astype(np.longdouble)
            onet = np.matmul(hout,w2)
            oout = 1/(1 + np.exp((-1)*onet)).astype(np.longdouble)
            #Backward pass
            delta_o = np.multiply(oout,np.multiply(np.subtract(1,oout),np.subtract(y[d],oout)))
            inter = np.matmul(w2,delta_o)
            delta_h = np.multiply(hout,np.multiply(np.subtract(1,hout),inter))
            #Update weights
            for i in range(len(w1)):
                for j in range(len(w1[0])):
                    delta = learning_rate*delta_h[j]*x[d][i]
                    w1[i][j] += delta
            for i in range(len(w2)):
                for j in range(len(w2[0])):
                    delta = learning_rate*delta_o[j]*hout[j]
                    w2[i][j] += delta
    return w1, w2

In [45]:
w1,w2 = backpropagation(x,y,0.1,19,7,1)

In [46]:
def confusion():
    classes = {}
    clss = trainData["Class"].unique()
    i = 0
    for clas in clss:
        classes[clas] = i
        i += 1
    n = len(classes)
    confmat = np.zeros(n*n)
    confmat = confmat.reshape(n,n)
    for _, d in testData.iterrows():
        y = d['Class']
        x = d.drop('Class')
        hnet = np.matmul(x,w1)
        hout = 1/(1 + np.exp((-1)*hnet))
        onet = np.matmul(hnet,w2)
        oout = 1/(1 + np.exp((-1)*onet))
        if oout > 0.5:
            predy = 1
        else:
            predy = 0
        confmat[classes[predy]][classes[y]] += 1
    return confmat 

In [47]:
mat = confusion()
mat

array([[609.,  49.],
       [424., 324.]])

In [48]:
acc = accuracy(mat)
pre = precision(mat)
rec = recall(mat)
fscore = F1Score(pre, rec)
print(acc, pre, rec, fscore)

0.6635846372688478 0.5895450145208132 0.925531914893617 0.7202838557066824


### Conclusion

Given the results from applying linear regression, the Naive Bayes Classifier, and an Artificial Neural Network (ANN), it appears that linear regression achieves the highest accuracy compared to the other models. As a result, it may be advisable to utilize the linear regression model for customer churn classification.