In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


In [7]:
from collections import Counter


# создаем собственный класс, заменяющий отрицательные
# и нулевые значения на небольшие положительные
class Replacer(BaseEstimator, TransformerMixin):
    """
    Заменяет отрицательные и нулевые значения на
    небольшое положительное значение.
    
    Параметры
    ----------
    repl: float, по умолчанию 0.1
        Значение для замены.
    """
    def __init__(self, repl_value=0.1):
        self.repl_value = repl_value
    
    # fit здесь бездельничает
    def fit(self, X, y=None):
        return self
    
    # transform выполняет всю работу: применяет преобразование 
    # с помощью заданного значения параметра repl_value
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X[X <= 0] = self.repl_value
        else:
            X = np.where(X <= 0, self.repl_value, X)
        return X
    
    

In [8]:
pd.set_option('display.max_rows', 200)

In [9]:
data = pd.read_csv('/home/tigran/Downloads/code/Data/Verizon.csv', sep=';')
data.head()

Unnamed: 0,longdist,internat,local,int_disc,billtype,pay,age,gender,marital,children,income,churn
0,8.62,,8.49,Нет,Бюджетный,CH,43.0,Мужской,_Женат,0.0,33935.8,0
1,21.27,0.0,21812.0,Нет,Бюджетный,CH,60.0,,_Одинокий,2.0,959306.0,1
2,613.0,0.0,,Да,,,25.0,Женский,,2.0,29534.0,1
3,16.46,0.0,5766.0,Да,Бесплатный,,93.0,Женский,Одинокий,0.0,,1
4,,0.0,1601.0,Да,Бесплатный,CC,68.0,Женский&*,,0.0,998329.0,1


In [10]:
data.describe()

Unnamed: 0,age,children,churn
count,1475.0,1476.0,1477.0
mean,57.585763,0.988482,0.436019
std,22.833863,0.824541,0.496057
min,18.0,0.0,0.0
25%,38.0,0.0,0.0
50%,58.0,1.0,0.0
75%,77.0,2.0,1.0
max,97.0,2.0,1.0


In [11]:
data[data.duplicated(keep=False)]

Unnamed: 0,longdist,internat,local,int_disc,billtype,pay,age,gender,marital,children,income,churn
0,8.62,,8.49,Нет,Бюджетный,CH,43.0,Мужской,_Женат,0.0,33935.8,0
13,8.62,,8.49,Нет,Бюджетный,CH,43.0,Мужской,_Женат,0.0,33935.8,0
14,8.62,,8.49,Нет,Бюджетный,CH,43.0,Мужской,_Женат,0.0,33935.8,0


In [12]:
data.drop_duplicates(subset=None, keep='first', inplace=True)

In [14]:
cols_lst = data.columns.to_list()
uniq = [data[col].nunique() for col in cols_lst]
types = data.dtypes
pd.DataFrame({'type': types, 'n_uniq': uniq})

Unnamed: 0,type,n_uniq
longdist,object,1081
internat,object,218
local,object,1372
int_disc,object,2
billtype,object,2
pay,object,4
age,float64,80
gender,object,4
marital,object,5
children,float64,3


In [15]:
for col in ['longdist', 'internat', 'local', 'income']:
    data[col] = data[col].str.replace(',', '.').astype('float')

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1475 entries, 0 to 1476
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   longdist  1467 non-null   float64
 1   internat  1469 non-null   float64
 2   local     1466 non-null   float64
 3   int_disc  1471 non-null   object 
 4   billtype  1450 non-null   object 
 5   pay       1470 non-null   object 
 6   age       1473 non-null   float64
 7   gender    1469 non-null   object 
 8   marital   1471 non-null   object 
 9   children  1474 non-null   float64
 10  income    1471 non-null   float64
 11  churn     1475 non-null   int64  
dtypes: float64(6), int64(1), object(5)
memory usage: 149.8+ KB


In [17]:
data.isnull().sum()

longdist     8
internat     6
local        9
int_disc     4
billtype    25
pay          5
age          2
gender       6
marital      4
children     1
income       4
churn        0
dtype: int64

In [18]:
cat_cols = data.select_dtypes(include=['object']).columns.tolist()
for col in cat_cols:
    print(col, data[col].unique())

int_disc ['Нет' 'Да' nan]
billtype ['Бюджетный' nan 'Бесплатный']
pay ['CH' nan 'CC' 'CD' 'Auto']
gender ['Мужской' nan 'Женский' 'Женский&*' 'Мужской&*']
marital ['_Женат' '_Одинокий' nan 'Одинокий' 'Женат' 'Же&нат']


In [19]:
for col in ['gender', 'marital']:
    data[col] = data[col].str.replace('[*&_]', '')
for col in ['gender', 'marital']:
    print(col, data[col].unique())
    

gender ['Мужской' nan 'Женский']
marital ['Женат' 'Одинокий' nan]


  data[col] = data[col].str.replace('[*&_]', '')


In [20]:
for col in cat_cols:
    print(data[col].value_counts(dropna=False))
    print('')

Нет    1015
Да      456
NaN       4
Name: int_disc, dtype: int64

Бюджетный     731
Бесплатный    719
NaN            25
Name: billtype, dtype: int64

CC      846
CH      324
Auto    297
NaN       5
CD        3
Name: pay, dtype: int64

Женский    743
Мужской    726
NaN          6
Name: gender, dtype: int64

Женат       872
Одинокий    599
NaN           4
Name: marital, dtype: int64



In [21]:
data.loc[data['pay'] == 'CD', 'pay'] = 'CC'

In [22]:
data['gender_marital'] = np.where((data['gender'].isnull()) | (data['marital'].isnull()), np.NaN, data.apply(lambda x: f"{x['gender']} + {x['marital']}", axis=1))

In [23]:
data.describe

<bound method NDFrame.describe of       longdist  internat   local int_disc    billtype   pay   age   gender  \
0         8.62       NaN    8.49      Нет   Бюджетный    CH  43.0  Мужской   
1        21.27      0.00  218.12      Нет   Бюджетный    CH  60.0      NaN   
2         6.13      0.00     NaN       Да         NaN   NaN  25.0  Женский   
3        16.46      0.00   57.66       Да  Бесплатный   NaN  93.0  Женский   
4          NaN      0.00   16.01       Да  Бесплатный    CC  68.0  Женский   
...        ...       ...     ...      ...         ...   ...   ...      ...   
1472     25.46      0.00   43.91       Да  Бесплатный  Auto  46.0  Мужской   
1473      8.46      0.00   22.65      Нет  Бесплатный    CC  51.0  Женский   
1474      0.00      0.00    1.65      Нет  Бесплатный  Auto  88.0  Женский   
1475     12.86      0.04  156.51       Да   Бюджетный    CC  18.0  Женский   
1476     14.77      0.00   30.42      Нет  Бесплатный    CH  86.0  Мужской   

       marital  children    i

In [24]:
cond = (data['age'] == 0) | (data['longdist'] == 0)
data['ratio'] = np.where(cond, 0, data['age'] / data['longdist'])

cond = (data['longdist'] == 0) | (data['internat'] == 0)
data['ratio2'] = np.where(cond, 0, data['longdist'] / data['internat'])

cond = (data['income'] == 0) | (data['age'] == 0)
data['ratio3'] = np.where(cond, 0, data['income'] / data['age'])

cond = (data['age'] == 0) | (data['children'] == 0)
data['ratio4'] = np.where(cond, 0, data['age'] / data['children'])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('churn', axis=1), \
                                                    data['churn'], test_size=0.3, stratify=data['churn'],\
                                                    random_state=42)

In [27]:
X_train.head()

Unnamed: 0,longdist,internat,local,int_disc,billtype,pay,age,gender,marital,children,income,gender_marital,ratio,ratio2,ratio3,ratio4
1110,19.37,0.0,121.68,Да,Бюджетный,CH,59.0,Женский,Женат,0.0,29953.1,Женский + Женат,3.045947,0.0,507.679661,0.0
152,0.0,0.0,2.71,Нет,Бесплатный,CH,42.0,Женский,Женат,1.0,82666.3,Женский + Женат,0.0,0.0,1968.245238,42.0
700,23.24,0.0,87.62,Нет,Бюджетный,CC,25.0,Женский,Одинокий,0.0,91106.4,Женский + Одинокий,1.075731,0.0,3644.256,0.0
1473,8.46,0.0,22.65,Нет,Бесплатный,CC,51.0,Женский,Одинокий,2.0,2638.11,Женский + Одинокий,6.028369,0.0,51.727647,25.5
1357,13.62,0.0,188.56,Нет,Бюджетный,CC,91.0,Женский,Женат,1.0,82051.9,Женский + Женат,6.681351,0.0,901.669231,91.0


In [28]:
cat_columns = X_train.select_dtypes(include='object').columns.tolist()
num_columns = X_train.select_dtypes(exclude='object').columns.tolist()

In [29]:
cat_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

num_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('repl', Replacer(repl_value=0.1)),
    ('boxcox', PowerTransformer(method='box-cox', standardize=True))
])

In [30]:
transformers = [('cat', cat_pipe, cat_columns), ('num', num_pipe, num_columns)]

In [31]:
ct = ColumnTransformer(transformers=transformers)

In [32]:
ml_pipe = Pipeline([
    ('tr', ct),
    ('logreg', LogisticRegression(solver='liblinear'))
])

In [33]:
ml_pipe.fit(X_train, y_train)

Pipeline(steps=[('tr',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['int_disc', 'billtype',
                                                   'pay', 'gender', 'marital',
                                                   'gender_marital']),
                                                 ('num',
                                                  Pipeline(steps=[('imp',
                                                                   SimpleImputer()),
                 

In [34]:
print('AUC_ROC on train subset : {:.3f}'.format(roc_auc_score(y_train, ml_pipe.predict_proba(X_train)[:, 1])))

AUC_ROC on train subset : 0.882


In [35]:
print('AUC_ROC on test subset : {:.3f}'.format(roc_auc_score(y_test, ml_pipe.predict_proba(X_test)[:, 1])))

AUC_ROC on test subset : 0.874


In [37]:
param_grid = {
    'tr__num__imp__strategy': ['mean', 'median', 'constant'],
    'tr__num__repl__repl_value': [0.1, 0.2, 0.3],
    'tr__cat__imp__strategy': ['most_frequent', 'constant'],
    'logreg__C': np.logspace(-2, 1, 10)
}

In [38]:
gs = GridSearchCV(ml_pipe, param_grid, cv=5, scoring='roc_auc')
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tr',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imp',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         ['int_disc',
                                                                          'billtype',
                                                                          'pay',
                         

In [39]:
print('Hyperparameters best combination is: ', gs.best_params_)
print('Best ROC_AUC score: ', gs.best_score_)
print('AUC_ROC on test set: ', roc_auc_score(y_test, gs.predict_proba(X_test)[:, 1]))

Hyperparameters best combination is:  {'logreg__C': 0.021544346900318832, 'tr__cat__imp__strategy': 'constant', 'tr__num__imp__strategy': 'constant', 'tr__num__repl__repl_value': 0.3}
Best ROC_AUC score:  0.8727492175768038
AUC_ROC on test set:  0.8738031088082903


In [40]:
results = pd.DataFrame(gs.cv_results_)

In [41]:
table = results.pivot_table(values=['mean_test_score'], \
                           index=['param_logreg__C',\
                                  'param_tr__num__imp__strategy',\
                                  'param_tr__num__repl__repl_value',\
                                  'param_tr__cat__imp__strategy'])

In [42]:
table.sort_values('mean_test_score', ascending=False, inplace=True)

In [43]:
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean_test_score
param_logreg__C,param_tr__num__imp__strategy,param_tr__num__repl__repl_value,param_tr__cat__imp__strategy,Unnamed: 4_level_1
0.021544,constant,0.3,constant,0.872749
0.215443,constant,0.3,constant,0.872732
4.641589,constant,0.3,most_frequent,0.872709
10.0,constant,0.3,most_frequent,0.872672
2.154435,constant,0.3,most_frequent,0.87267
1.0,constant,0.3,most_frequent,0.872593
0.1,constant,0.3,constant,0.872521
0.021544,constant,0.3,most_frequent,0.872518
0.464159,constant,0.3,constant,0.872482
0.464159,constant,0.3,most_frequent,0.872478
