# ML Experiments for Churn

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV   #####Revisar esta librería.
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import pickle
import datetime

from google.colab import drive

drive.mount('/content/drive', force_remount= True)
os.chdir('/content/drive/MyDrive/Churn/Churn_Alejo') #Thomas & Favio & Juan
os.listdir("./")

Mounted at /content/drive


['Datasets_Churn-Update.ipynb',
 'Data_for_prediction.ipynb',
 '.~lock.predictions_2022-01-19 08_30_05.545007.csv#',
 'predictions_2022-01-19 08_30_05.545007.csv',
 'configuration.py',
 'data_to_predict_2022-01-18 16_12_55.449522.csv',
 '__pycache__',
 '.idea',
 'Results_2022-01-18 17_12_10.040628',
 'retired_people2022-01-18 10:30:52.715565.csv',
 'non_retired_people2022-01-18 10:34:45.437345.csv',
 'Results_2022-01-19 17:39:25.232474',
 'Results_2022-01-19 20:22:09.864107',
 'Churn_ML.ipynb']

## 1. Preprocessing
Importing the datasets for retired and non retired people


In [2]:
df_retired = pd.read_csv("retired_people2022-01-18 10:30:52.715565.csv", index_col=False)
df_non_retired = pd.read_csv("non_retired_people2022-01-18 10:34:45.437345.csv", index_col=False)

#### Preprocessing function

In [3]:
def preprocessing(df_retired, df_non_retired):
    #Analysis empty columns
    list_retired = []
    for column in df_retired.columns:
        if df_retired[column].any() == False:
            list_retired.append(column)

    list_non_retired = []
    for column in df_non_retired.columns:
        if df_non_retired[column].any() == False:
            list_non_retired.append(column)

    common_lists = []
    for i in list_retired:
        if i in list_non_retired:
            common_lists.append(i)

    #Drop empty columns
    df_retired.drop(columns = common_lists, inplace = True)
    df_non_retired.drop(columns = common_lists, inplace = True)

    #label each set
    df_retired['LABEL'] = 1
    df_non_retired['LABEL'] = 0

    # Merging Data
    df_merged = pd.concat([df_retired, df_non_retired], join = 'inner', ignore_index = True)
    df_merged.drop(['PERSONA'],axis = 1, inplace=True)

    return df_merged, common_lists

In [4]:
def normalize_data(df_merged):

    labels = df_merged['LABEL']
    df_merged_scaled = df_merged.loc[:, df_merged.columns != 'LABEL']

    cols_ = df_merged_scaled.columns

    scaler = StandardScaler()
    scaler.fit(df_merged_scaled)
    df_merged_scaled = scaler.transform(df_merged_scaled)

    df_merged_scaled = pd.DataFrame(df_merged_scaled, columns=cols_)
    df_merged_scaled['LABEL'] = labels

    return df_merged_scaled, scaler

In [5]:
def pca(df, components):

    labels = df['LABEL']
    df_1 = df.loc[:, df.columns != 'LABEL']

    pca_ = PCA(n_components = components)
    pca_.fit(df_1)
    df_1_transformed = pca_.transform(df_1)

    df_pca = pd.DataFrame(df_1_transformed)
    df_pca['LABEL'] = labels

    return df_pca, pca_

## 2. Training

#### SVM

In [6]:
def svm_churn(df, param_grid):

    ### Train test split FOR NUMERICAL ALGORITHMS: 20% test
    X = df.drop(['LABEL'],axis = 1)
    y = df['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

    svm = SVC()

    grid_search = GridSearchCV(svm, param_grid=param_grid,cv=5,verbose=2,scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train,y_train)

    best_model_params = grid_search.best_params_

    best_model =   SVC(kernel = best_model_params['kernel'], C = best_model_params['C'],
                    class_weight = best_model_params['class_weight'], gamma = best_model_params['gamma'])

    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    print('----------Model report on all classes ----------')
    print(classification_report(y_test,y_pred, output_dict=True))

    return best_model, classification_report

#### Logistic Regression

In [7]:
def log_reg_churn(df, grid_param):

    ### Train test split FOR NUMERICAL ALGORITHMS: 20% test
    X = df.drop(['LABEL'],axis = 1)
    y = df['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y)

    log_reg = LogisticRegression()

    grid_search = GridSearchCV(estimator = log_reg, param_grid = grid_param, n_jobs = -1, cv = 5,
                               verbose = 2, return_train_score = True, scoring = "accuracy")
    grid_search.fit(X_train, y_train)

    best_model_params = grid_search.best_params_
    print(grid_search.best_params_)

    best_model = LogisticRegression(C = best_model_params['C'], penalty = best_model_params['penalty'], solver=best_model_params['solver'],
                                    max_iter=best_model_params['max_iter'], n_jobs = -1)


    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)


    print('----------Model report on all classes ----------')
    print(classification_report(y_test,y_pred, output_dict=True))

    return best_model, classification_report

#### Neural networks

In [8]:
# Function to create model, required for KerasClassifier
def create_model(
    # Default values
    activation: 'relu',
    dropout_rate : 0,
    init_mode: 'uniform',
    #weight_constraint: 1,
    optimizer: 'adam',
    hiden_layers: 2,
    units: [2, 2],
    X) -> tf.keras.Sequential:
    
    # Create the model
    model = Sequential()
    model.add(Dense(X.shape[1], kernel_initializer =  init_mode, activation = activation))
    
    for i in range(hiden_layers):
        model.add(Dense(units = units[i], activation = activation))
        
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, kernel_initializer = init_mode, activation = 'sigmoid'))
    model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ['accuracy'])
    
    return model

NameError: ignored

In [None]:
def nn_churn(df, grid_param):
    #Model creation
    model_nn = KerasClassifier(build_fn = create_model)

    X = df.drop(['LABEL'], axis = 1)
    y = df['LABEL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)
    
    grid_search = GridSearchCV(estimator = model_nn, param_grid = grid_param, n_jobs = -1, 
                               cv = 5, verbose = 2, return_train_score = True, scoring = 'accuracy')
    grid_search.fit(X_train, y_train)

    best_model_params = grid_search.best_params_
    print(best_model_params)

    best_model = create_model(activation = best_model_params['activation'], dropout_rate = best_model_params['dropout_rate'],
                              init_mode = best_model_params['init_mode'], optimizer = best_model_params['optimizer'], 
                              hiden_layers = best_model_params['hiden_layers'], units = best_model_params['units'])

    return best_model, classification_report


## 3. ML Pipeline

In [None]:
## Creating the folder
dir_str = 'Results_' + str(datetime.datetime.now())
os.mkdir(dir_str)

## Preprocessing
preprocessed_data, common_empty_columns = preprocessing(df_retired, df_non_retired)
## Save
preprocessed_data.to_csv(os.path.join(dir_str, 'preprocessed_data.csv'))
pickle.dump(common_empty_columns, open(os.path.join(dir_str,'common_empty_columns.pkl'), 'wb'))


## Normalization
scaled_data, scaler = normalize_data(preprocessed_data)
## Save
scaled_data.to_csv(os.path.join(dir_str,'scaled_data.csv'))
pickle.dump(scaler, open(os.path.join(dir_str,'scaler.pkl'), 'wb'))


## PCA
pca_data, pca_ = pca(scaled_data, 250)
## Save
pca_data.to_csv(os.path.join(dir_str,'pca_data.csv'))
pickle.dump(pca_, open(os.path.join(dir_str,'pca_model.pkl'), 'wb'))

##---------------------------------------------------------------------------------------------------------
## ML SVM
param_grid = {'C':  [0.0001, 0.001,0.01, 0.1, 1, 10, 100],
                  'gamma' : ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 10, 100],
                  'class_weight': ['balanced', None],
                  'kernel' : ['sigmoid','poly','rbf']}
best_svm, report_svm = svm_churn(pca_data, param_grid)
## Save
pickle.dump(best_svm, open(os.path.join(dir_str,'svm.sav'), 'wb'))
pickle.dump(report_svm, open(os.path.join(dir_str,'svm_metrics.pkl'), 'wb'))


##----------------------------------------------------------------------------------------------------------
## ML Logistic Regression
grid_param = {"penalty": ["l1", "l2", "elasticnet", "none"],
              "C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              "solver": ["newton-cg", "lbfgs", "liblinear", 'saga'],
              "max_iter": [500]
              }
best_lr, report_lr = log_reg_churn(pca_data, grid_param)
## Save
pickle.dump(best_lr, open(os.path.join(dir_str,'lr.sav'), 'wb'))
pickle.dump(report_lr, open(os.path.join(dir_str,'lr_metrics.pkl'), 'wb'))

Fitting 5 folds for each of 336 candidates, totalling 1680 fits
----------Model report on all classes ----------
{'0': {'precision': 0.8309859154929577, 'recall': 0.686046511627907, 'f1-score': 0.751592356687898, 'support': 86}, '1': {'precision': 0.8738317757009346, 'recall': 0.9396984924623115, 'f1-score': 0.9055690072639225, 'support': 199}, 'accuracy': 0.8631578947368421, 'macro avg': {'precision': 0.8524088455969462, 'recall': 0.8128725020451093, 'f1-score': 0.8285806819759103, 'support': 285}, 'weighted avg': {'precision': 0.8609028494627381, 'recall': 0.8631578947368421, 'f1-score': 0.8591058776164204, 'support': 285}}
Fitting 5 folds for each of 112 candidates, totalling 560 fits


245 fits failed out of a total of 560.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------

{'C': 0.01, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
----------Model report on all classes ----------
{'0': {'precision': 0.828125, 'recall': 0.6162790697674418, 'f1-score': 0.7066666666666667, 'support': 86}, '1': {'precision': 0.8506787330316742, 'recall': 0.9447236180904522, 'f1-score': 0.8952380952380953, 'support': 199}, 'accuracy': 0.8456140350877193, 'macro avg': {'precision': 0.8394018665158371, 'recall': 0.780501343928947, 'f1-score': 0.800952380952381, 'support': 285}, 'weighted avg': {'precision': 0.8438730451694847, 'recall': 0.8456140350877193, 'f1-score': 0.8383358395989975, 'support': 285}}


## 4. Inference

This just works after you execute the queries to predict

In [None]:
df_from_queries = pd.read_csv('data_to_predict_2022-01-18 16:12:55.449522.csv', index_col=False)

def inference(df, model, common_lists, scaler, pca_model):

    ids = df['PERSONA']
    df.drop(['PERSONA'],axis = 1, inplace=True)

    df.drop(columns = common_lists, inplace = True)
    df_scaled = scaler.transform(df)
    data_pca = pca_model.transform(df_scaled)

    predictions = model.predict(data_pca)

    df_predictions = pd.DataFrame(predictions)
    df_predictions['PERSONAS'] = ids

    df_predictions.to_csv('predictions_' + str(datetime.datetime.now()) + '.csv')

FileNotFoundError: ignored

In [None]:
## EXAMPLE

folder = 'Results_2022-01-18 17:12:10.040628'
common_empty_columns = pickle.load(open(os.path.join(folder, 'common_empty_columns.pkl'),'rb'))
scaler = pickle.load(open(os.path.join(folder, 'scaler.pkl'),'rb'))
pca_model = pickle.load(open(os.path.join(folder, 'pca_model.pkl'),'rb'))
model = pickle.load(open(os.path.join(folder, 'lr.sav'),'rb'))

inference(df_from_queries, model, common_empty_columns, scaler, pca_model)