In [1]:
'''
written by @alon.agmon
based on code by 
and 
'''

import pandas as pd
import numpy as np
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
data = pd.read_csv(url, header=None)

### Load data set

#### The data set will contain 4 features and target var indicating whether the note is forged or authentic

In [2]:
print(data.shape)
data.head(10)

(1372, 5)


Unnamed: 0,0,1,2,3,4
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0
5,4.3684,9.6718,-3.9606,-3.1625,0
6,3.5912,3.0129,0.72888,0.56421,0
7,2.0922,-6.81,8.4636,-0.60216,0
8,3.2032,5.7588,-0.75345,-0.61251,0
9,1.5356,9.1772,-2.2718,-0.73535,0


#### Check how balanced the data set is in terms of postives and negatives

In [3]:
data.iloc[:, -1].value_counts()

0    762
1    610
Name: 4, dtype: int64

#### Train a classifier to create a baseline

In [4]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [5]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [6]:
y_predict = model.predict(x_test)

#### Establish our baseline

In [7]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 99.57%
roc: 99.57%
recall: 99.15%
precision: 100.00%


### Test the PU learning approach

#### Keep aside 20% of the positives -- they will be the only labeled samples

In [8]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 20% of the positives marked
pos_sample_len = int(np.ceil(0.20 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]


Using 122/610 as positives and unlabeling the rest


#### Create the target col 'class_test' that will be 1 for postive and -1 for unlabebed 

In [9]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1250
 1     122
Name: class_test, dtype: int64


#### We now have just 130 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1. 
#### Recall that col 4 still holds the actual label 

In [10]:
mod_data.head(10)

Unnamed: 0,0,1,2,3,4,class_test
0,3.6216,8.6661,-2.8073,-0.44699,0,-1
1,4.5459,8.1674,-2.4586,-1.4621,0,-1
2,3.866,-2.6383,1.9242,0.10645,0,-1
3,3.4566,9.5228,-4.0112,-3.5944,0,-1
4,0.32924,-4.4552,4.5718,-0.9888,0,-1
5,4.3684,9.6718,-3.9606,-3.1625,0,-1
6,3.5912,3.0129,0.72888,0.56421,0,-1
7,2.0922,-6.81,8.4636,-0.60216,0,-1
8,3.2032,5.7588,-0.75345,-0.61251,0,-1
9,1.5356,9.1772,-2.2718,-0.73535,0,-1


#### Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results
[:-2] is the original class label for positive and negative data
[:-1] is the new class for positive and unlabeled data

In [11]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

#### The training set will be divided into a fitting-set that will be used to fit the estimator in order to estimate P(s=1|X) and a held-out set of positive samples that will be used to estimate P(s=1|y=1)
   

In [12]:
def fit_PU_estimator(X,y, hold_out_ratio, estimator):
    
    # find the indices of the positive/labeled elements
    assert (type(y) == np.ndarray), "Must pass np.ndarray rather than list as y"
    positives = np.where(y == 1.)[0] 
    # hold_out_size = the *number* of positives/labeled samples 
    # that we will use later to estimate P(s=1|y=1)
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    np.random.shuffle(positives)
    # hold_out = the *indices* of the positive elements 
    # that we will later use  to estimate P(s=1|y=1)
    hold_out = positives[:hold_out_size] 
    # the actual positive *elements* that we will keep aside
    X_hold_out = X[hold_out] 
    # remove the held out elements from X and y
    X = np.delete(X, hold_out,0) 
    y = np.delete(y, hold_out)
    # We fit the estimator on the unlabeled samples + (part of the) positive and labeled ones.
    # In order to estimate P(s=1|X) or  what is the probablity that an element is *labeled*
    estimator.fit(X, y)
    # We then use the estimator for prediction of the positive held-out set 
    # in order to estimate P(s=1|y=1)
    hold_out_predictions = estimator.predict_proba(X_hold_out)
    #take the probability that it is 1
    hold_out_predictions = hold_out_predictions[:,1]
    # save the mean probability 
    c = np.mean(hold_out_predictions)
    return estimator, c

def predict_PU_prob(X, estimator, prob_s1y1):
    predicted_s = estimator.predict_proba(X)
    predicted_s = predicted_s[:,1]
    return predicted_s / prob_s1y1

#### test the PU estimation approach

In [13]:
predicted = np.zeros(len(x_data))
learning_iterations = 12
for index in range(learning_iterations):
    pu_estimator, probs1y1 = fit_PU_estimator(x_data, y_labeled, 0.2, xgb.XGBClassifier())
    predicted += predict_PU_prob(x_data, pu_estimator, probs1y1)
    if(index%3 == 0): 
        print(f'Learning Iteration::{index}/{learning_iterations} => P(s=1|y=1)={round(probs1y1,2)}')

Learning Iteration::0/12 => P(s=1|y=1)=0.12999999523162842
Learning Iteration::3/12 => P(s=1|y=1)=0.1899999976158142
Learning Iteration::6/12 => P(s=1|y=1)=0.15000000596046448
Learning Iteration::9/12 => P(s=1|y=1)=0.15000000596046448


#### compare the performance of the predictions of the PU approacj (y_predict) with the actuall original classes (y_positive) that we have saved aside

In [14]:
y_predict = [1 if x > 0.5 else 0 for x in (predicted/learning_iterations)]
evaluate_results(y_positive, y_predict)

Classification results:
f1: 92.39%
roc: 92.95%
recall: 86.56%
precision: 99.06%


## Use bagging and LGBMClassifier

In [33]:
mod_data = data.copy()
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1250
 1     122
Name: class_test, dtype: int64


In [35]:
y_data = mod_data.iloc[:,-1]
df_orig_positive  = mod_data.iloc[y_data.values == 1]
df_orig_unlabeled = mod_data.iloc[y_data.values != 1]

In [36]:
x_data_pos = df_orig_positive.iloc[:,:-3].values
x_data_unl = df_orig_unlabeled.iloc[:,:-3].values

In [37]:
len_pos = x_data_pos.shape[0] # size of positives
len_unlabeled = x_data_unl.shape[0] #size of unlabeled
learners_num = 128 #learners
bootstrap_sample_size = len_pos # random bootstrap sample size


In [38]:
#create a label set for each learning cycle
train_labels = np.zeros(shape=(len_pos + bootstrap_sample_size,))
#populate the first part of the set with the positive label, 
train_labels[:len_pos] = 1.0
#place holder array for the number of times the datapoint is predicted
n_oob = np.zeros(shape=(len_unlabeled,))
#holds the results of the prediction of the data point
f_oob = np.zeros(shape=(len_unlabeled, 2))

In [39]:
import lightgbm as lgb

for i in range(learners_num):
    # Bootstrap resample
    bootstrap_sample = np.random.choice(np.arange(len_unlabeled), replace=True, size=bootstrap_sample_size)
    # Positive set + bootstrapped unlabeled set
    data_bootstrap = np.concatenate((x_data_pos,x_data_unl[bootstrap_sample, :]), axis=0)
    # Train model
    model = lgb.LGBMClassifier()
    model.fit(data_bootstrap, train_labels)
    # Index for the out of the bag (oob) samples
    idx_oob = sorted(set(range(len_unlabeled)) - set(np.unique(bootstrap_sample)))
    # Transductive learning of oob samples
    f_oob[idx_oob] += model.predict_proba(x_data_unl[idx_oob])
    n_oob[idx_oob] += 1
    if(i%10 == 0): print(f'learner {i}/{learners_num} completed')
        
predicted = f_oob[:, 1]/n_oob

learner 0/128 completed
learner 10/128 completed
learner 20/128 completed
learner 30/128 completed
learner 40/128 completed
learner 50/128 completed
learner 60/128 completed
learner 70/128 completed
learner 80/128 completed
learner 90/128 completed
learner 100/128 completed
learner 110/128 completed
learner 120/128 completed


In [40]:
df_orig_predicted = df_orig_unlabeled.copy()
df_orig_predicted['pred'] = [1 if x > 0.5 else 0 for x in predicted]
df_orig_positive.loc[:,'pred'] = 1
df_outcome = pd.concat([df_orig_positive,df_orig_predicted])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
df_outcome

Unnamed: 0,0,1,2,3,4,class_test,pred
762,-1.39710,3.31910,-1.392700,-1.99480,1,-1,1
763,0.39012,-0.14279,-0.031994,0.35084,1,-1,1
764,-1.66770,-7.15350,7.892900,0.96765,1,-1,1
765,-3.84830,-12.80470,15.682400,-1.28100,1,-1,1
766,-3.56810,-8.21300,10.083000,0.96765,1,-1,1
...,...,...,...,...,...,...,...
757,2.66060,3.16810,1.961900,0.18662,0,-1,0
758,3.93100,1.85410,-0.023425,1.23140,0,-1,0
759,0.01727,8.69300,1.398900,-3.96680,0,-1,0
760,3.24140,0.40971,1.401500,1.19520,0,-1,0


In [45]:
evaluate_results(df_orig_predicted.iloc[:,-3].values, df_orig_predicted.iloc[:,-1].values)

Classification results:
f1: 98.34%
roc: 98.40%
recall: 96.93%
precision: 99.79%
