In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
import pandas as pd
import numpy as np

In [2]:
data_train = pd.read_csv(r'C:\Users\BTC\Downloads\csv_files (1)\data\data_train.csv')
data_test = pd.read_csv(r'C:\Users\BTC\Downloads\csv_files (1)\data\data_test.csv')

In [3]:
def identify_customers(data_train, data_test):
    
    for col in data_train.columns:
        if col not in ['age','account','duration','label']:
            data_train[col] = data_train[col].map({'yes': 1, 'no': 0})
            data_test[col] = data_test[col].map({'yes': 1, 'no': 0})
    
    onehot_train = data_train.copy()
    onehot_test = data_test.copy()

    # Calculate the proportion of positive class labels in onehot_train
    prop = round((onehot_train['label'] == 1).mean(), 3)

    # Build the LogisticRegression classifier
    lr_class_weights = {0: prop, 1: 1 - prop}
    lr_model = LogisticRegression(class_weight=lr_class_weights, random_state=0, max_iter=50)
    lr_model.fit(onehot_train.drop('label', axis=1), onehot_train['label'])

    # Build the RandomForest classifier
    rf_class_weights = {0: prop, 1: 1 - prop}
    rf_model = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=30, class_weight=rf_class_weights)
    rf_model.fit(onehot_train.drop('label', axis=1), onehot_train['label'])

    # Get negative impact variables from the LogisticRegression classifier
    coefs = pd.Series(lr_model.coef_[0], index=onehot_train.drop('label', axis=1).columns)
    negative_impact = list(coefs.sort_values().index)[:5]

    # Get feature importance from the RandomForest classifier
    feature_importance = list(zip(onehot_train.drop('label', axis=1).columns, rf_model.feature_importances_))
    feature_importance.sort(key=lambda x: x[1], reverse=True)
    feature_importance = feature_importance[:5]

    # Calculate recall scores for the classifiers on both train and test sets
    lr_train_recall = round(lr_model.score(onehot_train.drop('label', axis=1), onehot_train['label']), 3)
    lr_test_recall = round(lr_model.score(onehot_test.drop('label', axis=1), data_test['label']), 3)
    rf_train_recall = round(rf_model.score(onehot_train.drop('label', axis=1), onehot_train['label']), 3)
    rf_test_recall = round(rf_model.score(onehot_test.drop('label', axis=1), data_test['label']), 3)

    # Get sorted indexes of test set predictions for both classifiers
    lr_probs = lr_model.predict_proba(onehot_test.drop('label', axis=1))[:, 1]
    lr_obs = list(lr_probs.argsort()[::-1])
    rf_probs = rf_model.predict_proba(onehot_test.drop('label', axis=1))[:, 1]
    rf_obs = list(rf_probs.argsort()[::-1])
    

    return {
        'onehot_train': onehot_train.drop('label', axis=1),
        'onehot_test': onehot_test.drop('label', axis=1),
        'prop': prop,
        'negative_impact': negative_impact,
        'feature_importance': feature_importance,
        'lr_recall': (lr_train_recall, lr_test_recall),
        'rf_recall': (rf_train_recall,rf_test_recall),
        'lr_obs': lr_obs,
        'rf_obs': rf_obs,
    }


In [4]:
identify_customers(data_train,data_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'onehot_train':       age  account  if_marital  if_default  if_housing  if_loan  \
 0      41     2408           0           0           0        0   
 1      59     4007           1           0           0        0   
 2      35      482           1           0           0        0   
 3      49        0           1           0           1        0   
 4      23      834           0           0           1        0   
 ...   ...      ...         ...         ...         ...      ...   
 4395   24     -389           1           0           1        1   
 4396   27     1977           0           0           1        0   
 4397   25      448           1           0           0        0   
 4398   48        0           1           0           0        0   
 4399   33      136           0           0           1        0   
 
       if_active_selling  duration  occupation_cleaner  occupation_management  \
 0                     0       122                   0                      1   
 1  