# Importing the data

In [1]:
%load_ext watermark
%watermark -v -m -p numpy,pandas,statsmodels,scikit-learn,tensorflow -g

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.13.0

numpy       : 1.22.3
pandas      : 1.2.4
statsmodels : 0.12.2
scikit-learn: 0.24.2
tensorflow  : 2.8.0

Compiler    : GCC 9.4.0
OS          : Linux
Release     : 5.4.0-109-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 32
Architecture: 64bit

Git hash: 



In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from statsmodels.stats.outliers_influence import variance_inflation_factor  
import statsmodels.regression.linear_model as sm

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.client import device_lib

import _pickle as cPickle
import json

In [3]:
# Дирректории рабочих файлов
PATH_TO_DATA = r'../../data' 

# Исходные данные
RAW = r'raw'

# Директории моделей
PATH_TO_MODELS = r'../../models' 

# Understanding data and manipulation

In [4]:
# Загрузка данных в формате CSV
path_to_csv_file = os.path.join(PATH_TO_DATA, RAW, 'WA_Fn_UseC__Telco_Customer_Churn.csv')

full_data = pd.read_csv(
    path_to_csv_file, 
    na_values=[' ','','#NA','NA','NULL','NaN', 'nan', 'n/a'], 
    dtype={'TotalCharges':np.float32, 'MonthlyCharges': np.float32},
    engine='c',
    sep=',',
    encoding='utf-8',
)

# Dropping column not having any significance in predicting the customer decision so we will drop it
full_data = full_data.drop(
    columns=[
        'customerID', 
        'PaperlessBilling', 
        'PaymentMethod'
    ], 
    axis=1, 
    inplace=False
)

# Remove na_values
full_data.loc[:,'TotalCharges'] = full_data.loc[:,'TotalCharges'].fillna(
    full_data.loc[:,'TotalCharges'].mean(), 
    inplace=False,
)


target_column = 'Churn'

data = full_data.drop(
    columns=target_column, 
    axis=1, 
    inplace=False
)

target = full_data.loc[:,[target_column]]

## Encoding the values and feature scaling

In [5]:
# columns having unique values lower than 5
# columns_for_encoder = [column for column in data.columns if data.loc[:,column].unique().shape[0] < 5]

columns_for_encoder = [
    'gender', 
    'SeniorCitizen', 
    'Partner', 
    'Dependents', 'PhoneService', 
    'MultipleLines', 
    'InternetService', 
    'OnlineSecurity', 
    'OnlineBackup', 
    'DeviceProtection', 
    'TechSupport', 
    'StreamingTV', 
    'StreamingMovies', 
    'Contract',
]


for column in columns_for_encoder:
    
    # creating label encoders
    label_encoder = LabelEncoder().fit(data.loc[:,column]) 
    
    # aplication label encoders
    data.loc[:,column] = label_encoder.transform(data.loc[:,column])
    
    # saving label encoders
    label_encoder_name = f'label_encoder_{column}'
    label_encoder_path = os.path.join(PATH_TO_MODELS, label_encoder_name + '.pkl')
    with open(label_encoder_path, 'wb') as fid:
        cPickle.dump(label_encoder, fid)

In [6]:
column = target_column


# creating label encoders
label_encoder = LabelEncoder()
label_encoder.classes_ = np.array(['No', 'Yes'])

# aplication label encoders
target.loc[:,column] = label_encoder.transform(target.loc[:,column])
    
# saving label encoders
label_encoder_name = f'label_encoder_{column}'
label_encoder_path = os.path.join(PATH_TO_MODELS, label_encoder_name + '.pkl')
with open(label_encoder_path, 'wb') as fid:
    cPickle.dump(label_encoder, fid)

In [7]:
columns_for_scaler = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'MonthlyCharges',
    'TotalCharges'
]


for column in columns_for_scaler:
    
    # creating Feature Scaling
    feature_scaler = StandardScaler().fit(data.loc[:,column].values.reshape(-1,1))
    
    # aplication Feature Scaling
    data.loc[:,column] = feature_scaler.transform(data.loc[:,column].values.reshape(-1,1))  
    
    # saving Feature Scaling
    feature_scaler_name = f'feature_scaler_{column}'
    feature_scaler_path = os.path.join(PATH_TO_MODELS, feature_scaler_name + '.pkl')
    with open(feature_scaler_path, 'wb') as fid:
        cPickle.dump(feature_scaler, fid)  

## Dealing multi-collinearity

The variance inflation factor is a measure for the increase of the
variance of the parameter estimates if an additional variable, given by
exog_idx is added to the linear regression. It is a measure for
multicollinearity of the design matrix, exog.

In [8]:
def calculate_vif_(data, thresh=5.0):
    variables = list(data.columns)
    dropped = True
    while dropped:
        dropped = False
        vif = [variance_inflation_factor(data.loc[:, variables].values, i) for i, _ in enumerate(variables)]
        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            variables.remove(variables[maxloc])
            dropped = True
    return variables


variables_list = calculate_vif_(data, 5)


# variables_list = [
#     'gender',
#     'SeniorCitizen',
#     'Partner',
#     'Dependents',
#     'tenure',
#     'PhoneService',
#     'MultipleLines',
#     'InternetService',
#     'OnlineSecurity',
#     'OnlineBackup',
#     'DeviceProtection',
#     'TechSupport',
#     'StreamingTV',
#     'StreamingMovies',
#     'Contract',
#     'MonthlyCharges',
# ]

data = data.loc[:,variables_list]

## Building optimal model using backward elimination

The adjusted R-squared is a modified version of R-squared that has been adjusted for the number of predictors in the model. The adjusted R-squared increases only if the new term improves the model more than would be expected by chance. It decreases when a predictor improves the model by less than expected by chance.<br>
 The p-value is a number between 0 and 1 and interpreted in the following way: A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis.<br>
A null hypothesis is a type of hypothesis used in statistics that proposes that no statistical significance exists in a set of given observations. The null hypothesis attempts to show that no variation exists between variables or that a single variable is no different than its mean.

In [9]:
def backwardElimination(data, target, SL:int=0.05):
    
    wdata = data.copy()
    
    wdata.insert(loc=0, column='A', value=1)

    temp = pd.DataFrame(0, index=np.arange(wdata.shape[0]), columns=wdata.columns)
    
    numVars = wdata.shape[1]
    
    for i in range(numVars):
        
        regressor_OLS = sm.OLS(target.values, wdata.values).fit()
        
        maxVar = max(regressor_OLS.pvalues).astype(float)
        
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        
        if maxVar > SL:
            
            for j in range(numVars-i):
                
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    
                    temp.iloc[:,j] = wdata.iloc[:, j]
                    wdata = wdata.drop(
                        columns=[wdata.columns[j]], 
                        inplace=False
                    )                    
                    
                    tmp_regressor = sm.OLS(target.values, wdata.values).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    
                    if (adjR_before >= adjR_after):
                        
                        x_rollback = pd.concat([wdata, temp.iloc[:,[0,j]]], axis=1, join="inner")
                        x_rollback = x_rollback.drop(
                            columns=[x_rollback.columns[j]], 
                            inplace=False
                        )
                        
                        x_rollback = x_rollback.drop(
                            columns=['A'], 
                            inplace=False
                        )
                        
                        print (regressor_OLS.summary())
                        
                        return x_rollback
                    
                    else:
                        continue
                        
    regressor_OLS.summary()
    
    wdata = wdata.drop(
        columns=['A'], 
        inplace=False
    )            
    
    return wdata.columns


modeled_columns = backwardElimination(data=data, target=target, SL=0.05)



# modeled_columns = [
#     'SeniorCitizen',
#     'Dependents',
#     'tenure',
#     'PhoneService',
#     'MultipleLines',
#     'InternetService',
#     'OnlineSecurity',
#     'OnlineBackup',
#     'DeviceProtection',
#     'TechSupport',
#     'Contract',
#     'MonthlyCharges'
# ]

data = data.loc[:,modeled_columns]

## Now Splitting Dataset into training and test set

In [10]:
train_data, test_data, train_target, test_target = train_test_split(
    data, target, 
    test_size=0.4, 
    random_state=32,
    shuffle=True
)

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The best value is 1 and the worst value is 0.<br><br>
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The best value is 1 and the worst value is 0.<br>

Here we can easily observe cluster of red dots i.e. Churn 'Yes' and blue dot i.e. Churn 'No'.

# Implementing Machine Learning Models

Устройства, доступные для вычислений

In [11]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14981125403561195719
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2257256448
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12846258848837192699
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:09:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [12]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:09:00.0, compute capability: 7.5



In [13]:
# Создание последовательной модели
model = Sequential([
    Dense(
        units=64, 
        activation='relu', 
        input_shape=(train_data.shape[1],), 
        kernel_initializer = 'glorot_uniform',
    ),
    Dense(
        units=128, 
        activation='relu', 
        input_shape=(train_data.shape[1],), 
        kernel_initializer="glorot_uniform",
    ),
    Dense(
        units=64, 
        activation='relu', 
        input_shape=(train_data.shape[1],),
        kernel_initializer="glorot_uniform",
    ),
    Dense(
        units=32, 
        activation='relu', 
        input_shape=(train_data.shape[1],), 
        kernel_initializer="glorot_uniform",
    ),
    Dense(
        units=1, 
        activation = 'sigmoid',
    )
])

# Компиляция модели
model.compile(
    optimizer=Adam(), 
    loss='binary_crossentropy', 
    metrics=['accuracy']
)

In [14]:
# Параметры скомпилированной сети
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 128)               8320      
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 dense_4 (Dense)             (None, 1)                 33        
                                                                 
Total params: 19,521
Trainable params: 19,521
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
%%time

# Обучение сети
np.random.seed(42)
with tf.device("/gpu:0"):
    history = model.fit(
        # Данные для обучения
        train_data.values, train_target.values,
        # Размер мини-выборки
        batch_size=32, 
        # Количество эпох для обучения
        epochs=500,
        # Валидационная выборка
        validation_data=(test_data.values, test_target.values), 
        # Необходимость перемешивания данных
        shuffle=True,
        # Уровень вывода данных о процессе обучения
        verbose=1,
        initial_epoch=0,
        # Создание EarlyStopping Callback для остановки обучения сети в случае переобучения
        callbacks=[
            EarlyStopping(
                # Отслеживаемая на каждой эпохе метрика
                monitor='accuracy', 
                # Терпение - количество эпох, на которых может не быть улучшения выбранной метрики 
                # до ранней остановки
                patience=5
            ), 
            EarlyStopping(monitor='val_accuracy', patience=5)
        ]
    )

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
CPU times: user 3.54 s, sys: 261 ms, total: 3.8 s
Wall time: 2.72 s


In [16]:
print("***Training***")
train_target_pred = (model.predict(train_data.values) >= 0.5).astype(np.int32).reshape(-1,)
print("accuracy: ", accuracy_score(train_target.values,train_target_pred))
print("precision: ", precision_score(train_target.values,train_target_pred))
print("recall: ", recall_score(train_target.values,train_target_pred))

print("***Testing***")
test_target_pred = (model.predict(test_data.values) >= 0.5).astype(np.int32).reshape(-1,)
print("accuracy: ", accuracy_score(test_target.values,test_target_pred))
print("precision: ", precision_score(test_target.values,test_target_pred))
print("recall: ", recall_score(test_target.values,test_target_pred))

***Training***
accuracy:  0.8210650887573965
precision:  0.6955974842767295
recall:  0.5182755388940956
***Testing***
accuracy:  0.783179559971611
precision:  0.6666666666666666
recall:  0.4763092269326683


In [20]:
model_name = f'model_Sequential'
model_path = os.path.join(PATH_TO_MODELS, model_name + '.h5')

model.save(filepath=model_path, overwrite=True)