###### 1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
###### 2. сделать feature engineering
###### 3. обучить любой классификатор (какой вам нравится)
###### 4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
###### 5. применить random negative sampling для построения классификатора в новых условиях
###### 6. сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

Для работы был скачан dataset отсюда https://www.kaggle.com/keesvb/balanced-bank

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import lightgbm
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization

In [4]:
def evaluate_results(y_test, y_predict):
    """Print F1, ROC AUC, recall and precision (as percentages) for the predictions.

    Parameters
    ----------
    y_test : true binary labels.
    y_predict : predicted labels, passed unchanged to every metric
        (including `roc_auc_score`).

    Returns
    -------
    None; the scores are only printed.
    """
    print('Classification results:')
    # (output template, metric function, extra keyword arguments), in print order
    report = (
        ("f1: %.2f%%", f1_score, {}),
        ("roc: %.2f%%", roc_auc_score, {}),
        ("recall: %.2f%%", recall_score, {'average': 'binary'}),
        ("precision: %.2f%%", precision_score, {'average': 'binary'}),
    )
    for template, metric, options in report:
        score = metric(y_test, y_predict, **options)
        print(template % (score * 100.0))

In [5]:
# Dataset description: https://archive.ics.uci.edu/ml/datasets/bank+marketing
# The dataset turned out to be already cleaned and class-balanced
# The CSV path is relative to the notebook's working directory
data = pd.read_csv('balanced_bank.csv')

In [6]:
# Preview the first two rows to check the columns that were loaded
data.head(2)

Unnamed: 0.1,Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,34579,35,admin.,single,university.degree,no,yes,no,cellular,may,...,1,999,1,failure,-1.8,92.893,-46.2,1.266,5099.1,no
1,446,42,technician,married,professional.course,no,no,no,telephone,may,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes


In [7]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier CSV export - TODO confirm)
data = data.drop(['Unnamed: 0'], axis=1)

In [8]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
# One whole-column assignment replaces the chained `data['y'].loc[mask] = ...`
# writes, which go through an intermediate Series (SettingWithCopyWarning, and
# no effect on `data` under pandas Copy-on-Write) and leave `y` as object dtype.
# 0/1 are mapped to themselves so the cell can be re-run; any other value becomes
# NaN and makes the int64 cast fail loudly instead of passing silently.
data['y'] = data['y'].map({'no': 0, 'yes': 1, 0: 0, 1: 1}).astype('int64')
data['y'].value_counts()

1    4640
0    4640
Name: y, dtype: int64

In [9]:
def age_lab(data) :
    """Return the age-bucket label for one record.

    Parameters
    ----------
    data : mapping-like object (e.g. a DataFrame row) with an "age" entry.

    Returns
    -------
    str or None
        "age_0-21", "age_21-35", "age_35-50", "age_50-65" or "age_gt_65";
        None when the age compares false against every bound (e.g. NaN).
    """
    age = data["age"]
    # Upper bounds are inclusive and checked in ascending order, so each test
    # only needs the upper limit of its bucket.
    if age <= 21 :
        # Was "Tenure_0-21": a leftover label that did not match the other buckets
        return "age_0-21"
    if age <= 35 :
        return "age_21-35"
    if age <= 50 :
        return "age_35-50"
    if age <= 65 :
        return "age_50-65"
    if age > 65 :
        return "age_gt_65"
    # Reached only when every comparison is false (NaN age)
    return None
    
# Label every row with its age bucket, then retire the raw numeric column
data["age_group"] = data.apply(age_lab, axis = 1)

data.drop(columns=['age'], inplace=True)

In [10]:
# Show column dtypes and non-null counts after the feature changes above
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9280 entries, 0 to 9279
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             9280 non-null   object 
 1   marital         9280 non-null   object 
 2   education       9280 non-null   object 
 3   default         9280 non-null   object 
 4   housing         9280 non-null   object 
 5   loan            9280 non-null   object 
 6   contact         9280 non-null   object 
 7   month           9280 non-null   object 
 8   day_of_week     9280 non-null   object 
 9   duration        9280 non-null   int64  
 10  campaign        9280 non-null   int64  
 11  pdays           9280 non-null   int64  
 12  previous        9280 non-null   int64  
 13  poutcome        9280 non-null   object 
 14  emp.var.rate    9280 non-null   float64
 15  cons.price.idx  9280 non-null   float64
 16  cons.conf.idx   9280 non-null   float64
 17  euribor3m       9280 non-null   f

In [11]:
# Count distinct values per column
data.nunique()

job                 12
marital              4
education            8
default              2
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1395
campaign            30
pdays               26
previous             7
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          293
nr.employed         11
y                    2
age_group            5
dtype: int64

In [12]:
# Partition the column names by dtype: object columns are treated as categorical,
# float64/int64 columns as numeric
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
# Convert the object columns to pandas 'category' dtype
data[cat_cols] = data[cat_cols].astype('category')

In [13]:
# Rows with target 1 and target 0, used by the plotting helpers below
pos = data[data['y'] == 1]
neg = data[data['y'] == 0]

# Donut charts of one column's value counts, positives vs negatives
def plot_pie(column) :
    """Show two side-by-side donut charts of `column`'s value counts.

    The left chart is built from the module-level `pos` frame ("Subscribed"),
    the right one from `neg` ("Non subscribed"). The figure is displayed with
    `py.iplot`; nothing is returned.
    """

    def donut(frame, trace_name, x_range) :
        # One value_counts() call supplies both the slice sizes and their labels
        counts = frame[column].value_counts()
        return go.Pie(values    = counts.values.tolist(),
                      labels    = counts.keys().tolist(),
                      hoverinfo = "label+percent+name",
                      domain    = dict(x = x_range),
                      name      = trace_name,
                      marker    = dict(line = dict(width = 2,
                                                   color = "rgb(243,243,243)")),
                      hole      = .6)

    def caption(text, x_pos) :
        # Text annotation at mid-height of the figure
        return dict(text = text,
                    font = dict(size = 13),
                    showarrow = False,
                    x = x_pos, y = .5)

    traces = [donut(pos, "Subscribed", [0,.48]),
              donut(neg, "Non subscribed", [.52,1])]

    layout = go.Layout(dict(title = column + " distribution in subscribtion ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [caption("subs", .15),
                                           caption("Non subs", .88)]
                           )
                      )
    fig = go.Figure(data = traces, layout = layout)
    py.iplot(fig)


# Overlaid percent histograms of one column, positives vs negatives
def histogram(column) :
    """Show percent-normalised histograms of `column` for `pos` and `neg`.

    Both traces are read from the module-level `pos` ("Subs") and `neg`
    ("Non subs") frames. The figure is displayed with `py.iplot`; nothing is
    returned.
    """

    def bars(frame, trace_name) :
        return go.Histogram(x = frame[column],
                            histnorm = "percent",
                            name = trace_name,
                            marker = dict(line = dict(width = .5,
                                                      color = "black")),
                            opacity = .9)

    def axis(axis_title) :
        # Both axes share the same grid styling and differ only in their title
        return dict(gridcolor = 'rgb(255, 255, 255)',
                    title = axis_title,
                    zerolinewidth=1,
                    ticklen=5,
                    gridwidth=2)

    traces = [bars(pos, "Subs"), bars(neg, "Non subs")]
    layout = go.Layout(dict(title = column + " distribution in subs ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = axis(column),
                            yaxis = axis("percent"),
                           )
                      )
    fig = go.Figure(data=traces, layout=layout)

    py.iplot(fig)
    
# Draw the positives-vs-negatives donut charts for every categorical column
for column_name in cat_cols :
    plot_pie(column_name)


In [14]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable
train, test = train_test_split(data, test_size=0.3, random_state=10)

In [15]:
# Separate the target column 'y' from the features for both partitions
y_train, y_test = train['y'], test['y']
x_train = train.drop(columns=['y'])
x_test = test.drop(columns=['y'])

In [16]:
# Baseline classifier: LightGBM with default hyper-parameters and a fixed seed
model = lightgbm.LGBMClassifier(random_state=10)

In [17]:
# Train the baseline on the fully labeled training partition
model.fit(x_train, y_train)

LGBMClassifier(random_state=10)

In [18]:
# Predict class labels for the held-out rows
pred = model.predict(x_test)

In [19]:
# Print the baseline metrics (fully supervised setting)
evaluate_results(y_test, pred)

Classification results:
f1: 88.71%
roc: 88.16%
recall: 92.42%
precision: 85.28%


In [20]:
# Work on a copy so the PU-learning columns added below do not alter `data`
mod_data = data.copy()

In [21]:
# Positional (0-based) row indices of all rows whose true label is 1
pos_ind = np.where(mod_data['y'].values == 1)[0]

In [22]:
# Number of positives that keep their label: 25% of all positives, rounded up
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))

In [23]:
# Keep the first `pos_sample_len` positives in row order (a prefix, not a random draw)
# NOTE(review): `pos_sample` is later rebound to a DataFrame in the sampling cell
pos_sample = pos_ind[:pos_sample_len]

In [24]:
# PU labels: start with every row unlabeled (-1), then flag the chosen positives (1).
# `pos_sample` holds positional indices produced by np.where, while `.loc` selects
# by index label, so translate positions to labels first. With a default RangeIndex
# the two coincide; the translation keeps this correct for any other index.
mod_data['class_test'] = -1
mod_data.loc[mod_data.index[pos_sample], 'class_test'] = 1

In [25]:
# x_data = mod_data.drop(['y','class_test'], axis=1) # just the X 
# y_labeled = mod_data['class_test'] # new class (just the P & U)
# y_positive = mod_data['y'] # original class

In [26]:
# Random negative sampling: shuffle, then draw as many unlabeled rows as there are
# labeled positives and treat them as negatives. Both shuffles are seeded (same
# seed as the rest of the notebook) so the split and the reported metrics are
# reproducible on Restart & Run All.
mod_data = mod_data.sample(frac=1, random_state=10)

pos_sample = mod_data[mod_data['class_test'] == 1]
unlabeled = mod_data[mod_data['class_test'] == -1]
n_labeled = len(pos_sample)

# .copy() so that overwriting 'y' writes to an independent frame, not a slice of mod_data
neg_sample = unlabeled.iloc[:n_labeled].copy()
neg_sample['y'] = 0  # treat the sampled unlabeled rows as class 0
# The remaining unlabeled rows keep their true 'y' and serve as the evaluation set
sample_test = unlabeled.iloc[n_labeled:]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=10)

(1160, 22) (1160, 22)


In [27]:
# Train a separate classifier for the PU setting. Refitting the shared `model`
# would silently overwrite the baseline, so re-running the baseline prediction
# cell afterwards would report PU results.
pu_model = lightgbm.LGBMClassifier(random_state=10)

# 'y' is the target and 'class_test' is the PU bookkeeping flag; neither is a feature
non_feature_cols = ['y', 'class_test']

# Cast labels to int: after concatenating frames 'y' may be object/category dtype
pu_model.fit(sample_train.drop(columns=non_feature_cols),
             sample_train['y'].astype(int).values)

y_predict = pu_model.predict(sample_test.drop(columns=non_feature_cols))

# Score against the true labels of the held-out unlabeled rows
evaluate_results(sample_test['y'].astype(int), y_predict)

Classification results:
f1: 78.35%
roc: 81.26%
recall: 75.38%
precision: 81.57%
