<a href="https://colab.research.google.com/github/SolemnShark871/Churn/blob/main/Churn_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML Experiments for Churn

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV   #####Revisar esta librería.
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import pickle
import datetime
import numpy as np

import tensorflow as tf
from tensorflow.keras.optimizers import SGD, Adam
from keras import layers, models
from keras.layers import Dense, Dropout
from keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from google.colab import drive

drive.mount('/content/drive', force_remount= True)
os.chdir('/content/drive/MyDrive/Churn/Churn_Alejo') #Thomas & Favio & Juan
os.listdir("./")

Mounted at /content/drive


['Datasets_Churn-Update.ipynb',
 '.~lock.predictions_2022-01-19 08_30_05.545007.csv#',
 'predictions_2022-01-19 08_30_05.545007.csv',
 'configuration.py',
 '__pycache__',
 '.idea',
 'Results_2022-01-18 17_12_10.040628',
 'retired_people2022-01-18 10:30:52.715565.csv',
 'non_retired_people2022-01-18 10:34:45.437345.csv',
 'Results_2022-01-19 17:39:25.232474',
 'Results_2022-01-19 20:22:09.864107',
 'Results_2022-01-20 14:42:48.649244',
 'Results_2022-01-20 14:53:57.580864',
 'data_to_predict_2022-01-18 16:12:55.449522.csv',
 'Results_2022-01-20 19:48:40.398198',
 'Results_2022-01-20 20:16:52.366066',
 'Data_for_prediction.ipynb',
 'Results_2022-01-21 13:51:21.197030',
 'Results_2022-01-21 13:51:30.639300',
 'Results_2022-01-24 13:47:16.391509',
 'Results_2022-01-24 14:25:46.575473',
 'Results_2022-01-24 14:56:24.455407',
 'Results_2022-01-24 15:54:38.628937',
 'Results_2022-01-24 16:07:34.582229',
 'Results_2022-01-24 16:07:46.698835',
 'Results_2022-01-24 16:26:13.730285',
 'Results_20

## 1. Preprocessing
Importing the datasets for retired and non retired people


In [2]:
df_retired = pd.read_csv("retired_people2022-01-18 10:30:52.715565.csv", index_col=False)
df_non_retired = pd.read_csv("non_retired_people2022-01-18 10:34:45.437345.csv", index_col=False)

#### Preprocessing function

In [13]:
def preprocessing(df_retired, df_non_retired):
    #Analysis empty columns
    list_retired = []
    for column in df_retired.columns:
        if df_retired[column].any() == False:
            list_retired.append(column)

    list_non_retired = []
    for column in df_non_retired.columns:
        if df_non_retired[column].any() == False:
            list_non_retired.append(column)

    common_lists = []
    for i in list_retired:
        if i in list_non_retired:
            common_lists.append(i)

    #Drop empty columns
    df_retired.drop(columns = common_lists, inplace = True)
    df_non_retired.drop(columns = common_lists, inplace = True)

    #Delete weird data from EDAD and TIEMPO_EMP columns
    df_retired = df_retired.where((df_retired['EDAD'] >= 16) & (df_retired['TIEMPO_EMP'] >= 0)).dropna()
    df_non_retired = df_non_retired.where((df_non_retired['EDAD'] >= 16) & (df_non_retired['TIEMPO_EMP'] >= 0)).dropna()

    #label each set
    df_retired['LABEL'] = 1
    df_non_retired['LABEL'] = 0

    # Merging Data
    df_merged = pd.concat([df_retired, df_non_retired], join = 'inner', ignore_index = True)
    df_merged.drop(['PERSONA'],axis = 1, inplace=True)

    return df_merged, common_lists

In [4]:
def normalize_data(df_merged):

    labels = df_merged['LABEL']
    df_merged_scaled = df_merged.loc[:, df_merged.columns != 'LABEL']

    cols_ = df_merged_scaled.columns

    scaler = StandardScaler()
    scaler.fit(df_merged_scaled)
    df_merged_scaled = scaler.transform(df_merged_scaled)

    df_merged_scaled = pd.DataFrame(df_merged_scaled, columns=cols_)
    df_merged_scaled['LABEL'] = labels

    return df_merged_scaled, scaler

In [5]:
def pca(df, components):

    labels = df['LABEL']
    df_1 = df.loc[:, df.columns != 'LABEL']

    pca_ = PCA(n_components = components)
    pca_.fit(df_1)
    df_1_transformed = pca_.transform(df_1)

    df_pca = pd.DataFrame(df_1_transformed)
    df_pca['LABEL'] = labels

    return df_pca, pca_

## 2. Training

#### SVM

In [6]:
def svm_churn(df, param_grid):

    ### Train test split FOR NUMERICAL ALGORITHMS: 20% test
    X = df.drop(['LABEL'],axis = 1)
    y = df['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

    svm = SVC()

    grid_search = GridSearchCV(svm, param_grid=param_grid,cv=5,verbose=2,scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train,y_train)

    best_model_params = grid_search.best_params_

    best_model =   SVC(kernel = best_model_params['kernel'], C = best_model_params['C'],
                    class_weight = best_model_params['class_weight'], gamma = best_model_params['gamma'])

    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    print('----------Model report on all classes ----------')
    print(classification_report(y_test,y_pred, output_dict=True))

    return best_model, classification_report

In [7]:
def svm_churn_tts(df, param_grid, test_train_split):

    ### Train test split FOR NUMERICAL ALGORITHMS: 20% test
    X = df.drop(['LABEL'],axis = 1)
    y = df['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_train_split, stratify = y)

    svm = SVC()

    grid_search = GridSearchCV(svm, param_grid=param_grid,cv=5,verbose=2,scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train,y_train)

    best_model_params = grid_search.best_params_

    best_model =   SVC(kernel = best_model_params['kernel'], C = best_model_params['C'],
                    class_weight = best_model_params['class_weight'], gamma = best_model_params['gamma'])

    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    print('----------Model report on all classes ----------')
    print(classification_report(y_test,y_pred, output_dict=False))

    return best_model, classification_report

#### Logistic Regression

In [8]:
def log_reg_churn(df, grid_param):

    ### Train test split FOR NUMERICAL ALGORITHMS: 20% test
    X = df.drop(['LABEL'],axis = 1)
    y = df['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

    log_reg = LogisticRegression()

    grid_search = GridSearchCV(estimator = log_reg, param_grid = grid_param, n_jobs = -1, cv = 5,
                               verbose = 2, return_train_score = True, scoring = "accuracy")
    grid_search.fit(X_train, y_train)

    best_model_params = grid_search.best_params_
    print(grid_search.best_params_)

    best_model = LogisticRegression(C = best_model_params['C'], penalty = best_model_params['penalty'], solver=best_model_params['solver'],
                                    max_iter=best_model_params['max_iter'], n_jobs = -1)


    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)


    print('----------Model report on all classes ----------')
    print(classification_report(y_test,y_pred, output_dict=True))

    return best_model, classification_report

#### Neural networks

In [9]:
def nn_churn(df, grid_param):
    X = df.drop(['LABEL'], axis = 1)
    y = df['LABEL']

    # Nested function to create model, required for KerasClassifier
    def create_model(
                    # Default values
                    activation: 'relu',
                    dropout_rate : 0,
                    init_mode: 'uniform',
                    #weight_constraint: 1,
                    optimizer: 'adam',
                    hiden_layers: 2,
                    units: [2, 2]) -> tf.keras.Sequential:
    
        # Create the model
        model = Sequential()
        model.add(Dense(X.shape[1], kernel_initializer =  init_mode, activation = activation))
    
        for i in range(hiden_layers):
            model.add(Dense(units = units[i], activation = activation))
        
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, kernel_initializer = init_mode, activation = 'sigmoid'))
        model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ['accuracy'])
    
        return model

    #Model creation
    model_nn = KerasClassifier(build_fn = create_model)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
    
    grid_search = GridSearchCV(estimator = model_nn, param_grid = grid_param, n_jobs = -1, 
                               cv = 5, verbose = 2, return_train_score = True, scoring = 'accuracy')
    
    grid_search.fit(X_train, y_train)
    
    best_model_params = grid_search.best_params_
    print(best_model_params)

    best_model = create_model(activation = best_model_params['activation'], dropout_rate = best_model_params['dropout_rate'],
                              init_mode = best_model_params['init_mode'], optimizer = best_model_params['optimizer'], 
                             hiden_layers = best_model_params['hiden_layers'], units = best_model_params['units'])
    

    best_model.fit(X_train, y_train, batch_size = best_model_params['batch_size'], epochs = best_model_params['epochs'], verbose = 2)

    y_pred = np.round(best_model.predict(X_test))


    print('----------Model report on all classes ----------')
    print(classification_report(y_test, y_pred, output_dict=True))

    return best_model, classification_report


## 3. ML Pipeline

In [14]:
## Creating the folder
dir_str = 'Results_' + str(datetime.datetime.now())
os.mkdir(dir_str)

## Preprocessing
preprocessed_data, common_empty_columns = preprocessing(df_retired, df_non_retired)
## Save
preprocessed_data.to_csv(os.path.join(dir_str, 'preprocessed_data.csv'))
pickle.dump(common_empty_columns, open(os.path.join(dir_str,'common_empty_columns.pkl'), 'wb'))


## Normalization
scaled_data, scaler = normalize_data(preprocessed_data)
## Save
scaled_data.to_csv(os.path.join(dir_str,'scaled_data.csv'))
pickle.dump(scaler, open(os.path.join(dir_str,'scaler.pkl'), 'wb'))


## PCA
pca_data, pca_ = pca(scaled_data, 250)
## Save
pca_data.to_csv(os.path.join(dir_str,'pca_data.csv'))
pickle.dump(pca_, open(os.path.join(dir_str,'pca_model.pkl'), 'wb'))

##---------------------------------------------------------------------------------------------------------
## ML SVM
grid_param_svm = {'C':  [0.0001, 0.001,0.01, 0.1, 1, 10, 100],
                  'gamma' : ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 10, 100],
                  'class_weight': ['balanced', None],
                  'kernel' : ['sigmoid','poly','rbf']}
best_svm, report_svm = svm_churn(pca_data, grid_param_svm)
## Save
pickle.dump(best_svm, open(os.path.join(dir_str,'svm.sav'), 'wb'))
pickle.dump(report_svm, open(os.path.join(dir_str,'svm_metrics.pkl'), 'wb'))


##----------------------------------------------------------------------------------------------------------
## ML Logistic Regression
grid_param_lr = {"penalty": ["l1", "l2", "elasticnet", "none"],
              "C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              "solver": ["newton-cg", "lbfgs", "liblinear", 'saga'],
              "max_iter": [500]
              }
best_lr, report_lr = log_reg_churn(pca_data, grid_param_lr)
## Save
pickle.dump(best_lr, open(os.path.join(dir_str,'lr.sav'), 'wb'))
pickle.dump(report_lr, open(os.path.join(dir_str,'lr_metrics.pkl'), 'wb'))

##----------------------------------------------------------------------------------------------------------
## ML Neural Network
grid_param_nn = {
    "activation": ['selu','softplus','softmax'],
    "init_mode": ['he_normal', 'glorot_normal'],
    "dropout_rate": [0.8],
    "units": [[8, 4]],
    "optimizer": ['RMSprop', 'Adam', 'SGD'],
    "hiden_layers": [2],
    "epochs": [15],
    "batch_size":  [128]
}
best_nn, report_nn = nn_churn(pca_data, grid_param_nn)
## Save
pickle.dump(best_nn, open(os.path.join(dir_str,'nn.sav'), 'wb'))
pickle.dump(report_nn, open(os.path.join(dir_str,'nn_metrics.pkl'), 'wb'))

Fitting 5 folds for each of 336 candidates, totalling 1680 fits
----------Model report on all classes ----------
{'0': {'precision': 0.775, 'recall': 0.7209302325581395, 'f1-score': 0.7469879518072289, 'support': 86}, '1': {'precision': 0.8811881188118812, 'recall': 0.9081632653061225, 'f1-score': 0.8944723618090452, 'support': 196}, 'accuracy': 0.851063829787234, 'macro avg': {'precision': 0.8280940594059406, 'recall': 0.814546748932131, 'f1-score': 0.8207301568081371, 'support': 282}, 'weighted avg': {'precision': 0.8488045081103855, 'recall': 0.851063829787234, 'f1-score': 0.8494948467021083, 'support': 282}}
Fitting 5 folds for each of 112 candidates, totalling 560 fits


245 fits failed out of a total of 560.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------

{'C': 1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}




----------Model report on all classes ----------
{'0': {'precision': 0.7386363636363636, 'recall': 0.7558139534883721, 'f1-score': 0.7471264367816092, 'support': 86}, '1': {'precision': 0.8917525773195877, 'recall': 0.8826530612244898, 'f1-score': 0.8871794871794872, 'support': 196}, 'accuracy': 0.8439716312056738, 'macro avg': {'precision': 0.8151944704779757, 'recall': 0.819233507356431, 'f1-score': 0.8171529619805482, 'support': 282}, 'weighted avg': {'precision': 0.8450575617991719, 'recall': 0.8439716312056738, 'f1-score': 0.8444682732283614, 'support': 282}}
Fitting 5 folds for each of 18 candidates, totalling 90 fits




Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
{'activation': 'softplus', 'batch_size': 128, 'dropout_rate': 0.8, 'epochs': 15, 'hiden_layers': 2, 'init_mode': 'glorot_normal', 'optimizer': 'RMSprop', 'units': [8, 4]}
Epoch 1/15
9/9 - 1s - loss: 0.7868 - accuracy: 0.6282 - 804ms/epoch - 89ms/step
Epoch 2/15
9/9 - 0s - loss: 0.6692 - accuracy: 0.6664 - 43ms/epoch - 5ms/step
Epoch 3/15
9/9 - 0s - loss: 0.6570 - accuracy: 0.6593 - 37ms/epoch - 4ms/step
Epoch 4/15
9/9 - 0s - loss: 0.6568 - accuracy: 0.6610 - 39ms/epoch - 4ms/step
Epoch 5/15
9/9 - 0s - loss: 0.6277 - accuracy: 0.6619 - 42ms/epoch - 5ms/step
Epoch 6/15
9/9 - 0s - loss: 0.6217 - accuracy: 0.6646 - 42ms/epoch - 5ms/step
Epoch 7/15
9/9 - 0s - loss: 0.5942 - accuracy: 0.6841 - 46ms/epoch - 5ms/step
Epoch 8/15
9/9 - 0s - loss: 0.5748 - accuracy: 0.7019 - 45ms/epoch - 5ms/step
Epoch 9/15
9/9 - 0s - loss: 0.58

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


INFO:tensorflow:Assets written to: ram://0c92f825-0c06-4c0a-9db7-61279d6352d8/assets


In [None]:
#TTS for SVM

## Creating the folder
dir_str = 'Results_' + str(datetime.datetime.now())
os.mkdir(dir_str)

## Preprocessing
preprocessed_data, common_empty_columns = preprocessing(df_retired, df_non_retired)
## Save
preprocessed_data.to_csv(os.path.join(dir_str, 'preprocessed_data.csv'))
pickle.dump(common_empty_columns, open(os.path.join(dir_str,'common_empty_columns.pkl'), 'wb'))


## Normalization
scaled_data, scaler = normalize_data(preprocessed_data)
## Save
scaled_data.to_csv(os.path.join(dir_str,'scaled_data.csv'))
pickle.dump(scaler, open(os.path.join(dir_str,'scaler.pkl'), 'wb'))


## PCA
pca_data, pca_ = pca(scaled_data, 250)
## Save
pca_data.to_csv(os.path.join(dir_str,'pca_data.csv'))
pickle.dump(pca_, open(os.path.join(dir_str,'pca_model.pkl'), 'wb'))

##---------------------------------------------------------------------------------------------------------
## ML SVM
grid_param_svm = {'C':  [0.0001, 0.001,0.01, 0.1, 1, 10, 100],
                  'gamma' : ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 10, 100],
                  'class_weight': ['balanced', None],
                  'kernel' : ['sigmoid','poly','rbf']}

## Perform 3 test-train splits

TTS = [0.2,0.1,0.05] #Try reversing this to see if that does anything 


#TODO try renaming these best_svmTTS (without for loop)
print("20% Test")
best_svm20, report_svm20 = svm_churn_tts(pca_data, grid_param_svm, 0.2)
print()

print("10% Test")
best_svm10, report_svm10 = svm_churn_tts(pca_data, grid_param_svm, 0.1)
print()

print("5% Test")
best_svm5, report_svm5 = svm_churn_tts(pca_data, grid_param_svm, 0.05)

20% Test
Fitting 5 folds for each of 336 candidates, totalling 1680 fits
----------Model report on all classes ----------
              precision    recall  f1-score   support

           0       0.80      0.73      0.76        86
           1       0.89      0.92      0.90       199

    accuracy                           0.86       285
   macro avg       0.84      0.83      0.83       285
weighted avg       0.86      0.86      0.86       285


10% Test
Fitting 5 folds for each of 336 candidates, totalling 1680 fits
----------Model report on all classes ----------
              precision    recall  f1-score   support

           0       0.86      0.74      0.80        43
           1       0.90      0.95      0.92       100

    accuracy                           0.89       143
   macro avg       0.88      0.85      0.86       143
weighted avg       0.89      0.89      0.89       143


5% Test
Fitting 5 folds for each of 336 candidates, totalling 1680 fits
----------Model report on al

In [None]:
## Save
pickle.dump(best_svm, open(os.path.join(dir_str,'svm.sav'), 'wb'))
pickle.dump(report_svm, open(os.path.join(dir_str,'svm_metrics.pkl'), 'wb'))

## 4. Inference

This just works after you execute the queries to predict

In [None]:
df_from_queries = pd.read_csv('data_to_predict_2022-01-18 16:12:55.449522.csv', index_col=False)

def inference(df, model, common_lists, scaler, pca_model):

    ids = df['PERSONA']
    df.drop(['PERSONA'],axis = 1, inplace=True)

    df.drop(columns = common_lists, inplace = True)
    df_scaled = scaler.transform(df)
    data_pca = pca_model.transform(df_scaled)

    predictions = model.predict(data_pca)

    df_predictions = pd.DataFrame(predictions)
    df_predictions['PERSONAS'] = ids

    df_predictions.to_csv('predictions_' + str(datetime.datetime.now()) + '.csv')

In [None]:
## EXAMPLE

folder = 'Results_2022-01-18 17:12:10.040628'
common_empty_columns = pickle.load(open(os.path.join(folder, 'common_empty_columns.pkl'),'rb'))
scaler = pickle.load(open(os.path.join(folder, 'scaler.pkl'),'rb'))
pca_model = pickle.load(open(os.path.join(folder, 'pca_model.pkl'),'rb'))
model = pickle.load(open(os.path.join(folder, 'lr.sav'),'rb'))

inference(df_from_queries, model, common_empty_columns, scaler, pca_model)

FileNotFoundError: ignored