In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("churn_data.csv")
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
df['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


### Столбцы RowNumber CustomerId и Surname нам не нужны. Удаляем их.

In [5]:
num = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

cat = ['Geography', 'Gender', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']

target = ['Exited']

In [6]:
df = df[num+cat+target]
df.head(3)

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography,Gender,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,619,42,2,0.0,101348.88,France,Female,1,1,1,1
1,608,41,1,83807.86,112542.58,Spain,Female,1,0,1,0
2,502,42,8,159660.8,113931.57,France,Female,3,1,0,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, df[target], random_state=12)

In [8]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [9]:
final_transformers = list()

for cat_col in cat:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in num:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [10]:
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])

### RandomForestClassifier:

In [11]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=12)),
])

In [12]:
pipeline.fit(X_train, y_train)

In [13]:
preds = pipeline.predict(X_test)

In [14]:
dict_rez = {'RForest':[], 'PU_10':[], 'PU_25':[], 'PU_30':[]}

In [15]:
f1 = f1_score(y_test, preds)
roc = roc_auc_score(y_test, preds)
rec = recall_score(y_test, preds, average='binary')
prc = precision_score(y_test, preds, average='binary')

dict_rez['RForest'].append(f1)
dict_rez['RForest'].append(roc)
dict_rez['RForest'].append(rec)
dict_rez['RForest'].append(prc)

### PU learning(25%):

In [16]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 510/2037 as positives and unlabeling the rest


In [17]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    9490
 1     510
Name: class_test, dtype: int64


In [18]:
mod_data.head(5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,EstimatedSalary,Geography,Gender,NumOfProducts,HasCrCard,IsActiveMember,Exited,class_test
0,619,42,2,0.0,101348.88,France,Female,1,1,1,1,-1
1,608,41,1,83807.86,112542.58,Spain,Female,1,0,1,0,-1
2,502,42,8,159660.8,113931.57,France,Female,3,1,0,1,1
3,699,39,1,0.0,93826.63,France,Female,2,0,0,0,-1
4,850,43,2,125510.82,79084.1,Spain,Female,1,1,1,0,-1


In [19]:
x_data = mod_data.iloc[:,:-2] # just the X 
y_labeled = mod_data.iloc[:,-1] # new class (just the P & U)
y_positive = mod_data.iloc[:,-2] # original class

### Random negative sampling:

In [20]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(510, 12) (510, 12)


In [21]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=12)),
])

In [22]:
pipeline.fit(sample_train.iloc[:,:-2], sample_train.iloc[:,-2])

In [23]:
preds = pipeline.predict(sample_test.iloc[:,:-2])

In [24]:
f1_PU25 = f1_score(sample_test.iloc[:,-2].values, preds)
roc_PU25 = roc_auc_score(sample_test.iloc[:,-2].values, preds)
rec_PU25 = recall_score(sample_test.iloc[:,-2].values, preds, average='binary')
prc_PU25 = precision_score(sample_test.iloc[:,-2].values, preds, average='binary')

dict_rez['PU_25'].append(f1_PU25)
dict_rez['PU_25'].append(roc_PU25)
dict_rez['PU_25'].append(rec_PU25)
dict_rez['PU_25'].append(prc_PU25)

### PU learning(10%):

In [25]:
mod_data = df.copy()
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
np.random.shuffle(pos_ind)
pos_sample_len = int(np.ceil(0.10 * len(pos_ind)))
pos_sample = pos_ind[:pos_sample_len]

mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1

x_data = mod_data.iloc[:,:-2]
y_labeled = mod_data.iloc[:,-1]
y_positive = mod_data.iloc[:,-2]

mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

pipeline = Pipeline([('features', feats),('classifier', RandomForestClassifier(random_state=12)),])
pipeline.fit(sample_train.iloc[:,:-2], sample_train.iloc[:,-2])
preds = pipeline.predict(sample_test.iloc[:,:-2])

f1_PU10 = f1_score(sample_test.iloc[:,-2].values, preds)
roc_PU10 = roc_auc_score(sample_test.iloc[:,-2].values, preds)
rec_PU10 = recall_score(sample_test.iloc[:,-2].values, preds, average='binary')
prc_PU10 = precision_score(sample_test.iloc[:,-2].values, preds, average='binary')

dict_rez['PU_10'].append(f1_PU10)
dict_rez['PU_10'].append(roc_PU10)
dict_rez['PU_10'].append(rec_PU10)
dict_rez['PU_10'].append(prc_PU10)

### PU learning(30%):

In [26]:
mod_data = df.copy()
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
np.random.shuffle(pos_ind)
pos_sample_len = int(np.ceil(0.30 * len(pos_ind)))
pos_sample = pos_ind[:pos_sample_len]

mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1

x_data = mod_data.iloc[:,:-2]
y_labeled = mod_data.iloc[:,-1]
y_positive = mod_data.iloc[:,-2]

mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

pipeline = Pipeline([('features', feats),('classifier', RandomForestClassifier(random_state=12)),])
pipeline.fit(sample_train.iloc[:,:-2], sample_train.iloc[:,-2])
preds = pipeline.predict(sample_test.iloc[:,:-2])

f1_PU30 = f1_score(sample_test.iloc[:,-2].values, preds)
roc_PU30 = roc_auc_score(sample_test.iloc[:,-2].values, preds)
rec_PU30 = recall_score(sample_test.iloc[:,-2].values, preds, average='binary')
prc_PU30 = precision_score(sample_test.iloc[:,-2].values, preds, average='binary')

dict_rez['PU_30'].append(f1_PU30)
dict_rez['PU_30'].append(roc_PU30)
dict_rez['PU_30'].append(rec_PU30)
dict_rez['PU_30'].append(prc_PU30)

In [27]:
df_rez = pd.DataFrame(dict_rez, index=['F1','Roc', 'Recall', 'Precision'])
df_rez

Unnamed: 0,RForest,PU_10,PU_25,PU_30
F1,0.606335,0.503377,0.478341,0.464384
Roc,0.725575,0.739386,0.753314,0.754246
Recall,0.482014,0.788419,0.806787,0.813711
Precision,0.817073,0.369713,0.339947,0.324903


### Вывод: как видно по результирующей таблице, обычный классификатор RForest показывает лучшие результаты, чем PU learning(25%), значительно проигрывая лишь по Recall. В свою очередь, разница результатов при различном проценте позитивно размеченных данных у PU learning не очень значительная, но наилучший результат показывает все же изначальные 25%, проигрывая варианту с 10% только по Recall.