In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [2]:
data = pd.read_csv("heart.csv")
#data = data.drop(data.index[[0]])
data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [3]:
#data.columns

In [4]:
print(data.shape)

(303, 14)


In [5]:
data.iloc[:, 1].value_counts()

1    207
0     96
Name: sex, dtype: int64

In [6]:
data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [7]:
from sklearn.model_selection import train_test_split

x_data = data.iloc[:,:13]
#x_data.drop(["Gender"],axis='columns', inplace=True)
y_data = data.iloc[:,13]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [8]:
x_test.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
125,34,0,1,118,210,0,1,192,0,0.7,2,0,2
57,45,1,0,115,260,0,0,185,0,0.0,2,0,2
282,59,1,2,126,218,1,1,134,0,2.2,1,1,1
66,51,1,2,100,222,0,1,143,1,1.2,1,0,2
114,55,1,1,130,262,0,1,155,0,0.0,2,0,2
281,52,1,0,128,204,1,1,156,1,1.0,1,0,0
132,42,1,1,120,295,0,1,162,0,0.0,2,0,2
255,45,1,0,142,309,0,0,147,1,0.0,1,3,3
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3
273,58,1,0,100,234,0,1,156,0,0.1,2,1,3


In [9]:
model = AdaBoostClassifier(n_estimators=10, random_state=42)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

Проверяем качество

In [10]:
def evaluate_results(y_test, y_predict, p=0):
    print('Classification results(', p, ')')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results( 0 )
f1: 74.63%
roc: 71.99%
recall: 80.65%
precision: 69.44%


### PU learning (доля P - 0.85)

In [11]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 85% of the positives marked
pos_sample_len = int(np.ceil(0.85 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 141/165 as positives and unlabeling the rest


In [12]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    162
 1    141
Name: class_test, dtype: int64


In [13]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### random negative sampling (доля P - 0.85)

In [14]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(141, 15) (141, 15)


In [15]:
model = AdaBoostClassifier(n_estimators=10, random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
y_test_1 = sample_test.iloc[:,-2].values
y_predict_1 = y_predict
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results( 0 )
f1: 71.43%
roc: 87.50%
recall: 100.00%
precision: 55.56%


### PU learning (доля P - 0.45)

In [16]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 85% of the positives marked
pos_sample_len = int(np.ceil(0.45 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 75/165 as positives and unlabeling the rest


In [17]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    228
 1     75
Name: class_test, dtype: int64


In [18]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### random negative sampling (доля P - 0.85)

In [19]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(75, 15) (75, 15)


In [20]:
model = AdaBoostClassifier(n_estimators=10, random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
y_test_4 = sample_test.iloc[:,-2].values
y_predict_4 = y_predict
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results( 0 )
f1: 74.32%
roc: 76.98%
recall: 87.30%
precision: 64.71%


### PU learning (доля P - 0.25)

In [21]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 42/165 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [22]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    261
 1     42
Name: class_test, dtype: int64


In [23]:
mod_data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,class_test
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,-1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,-1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,-1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,-1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,-1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1,-1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1,-1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1,-1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1,-1


In [24]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### random negative sampling (доля P - 0.25)

In [25]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(42, 15) (42, 15)


In [26]:
model = AdaBoostClassifier(n_estimators=10, random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
y_test_2 = sample_test.iloc[:,-2].values
y_predict_2 = y_predict
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results( 0 )
f1: 74.59%
roc: 73.23%
recall: 91.00%
precision: 63.19%


### PU learning (доля P - 0.15)

In [27]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.15 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 25/165 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [28]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    278
 1     25
Name: class_test, dtype: int64


In [29]:
mod_data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,class_test
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,-1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,-1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1,-1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1,-1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1,-1


In [30]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### random negative sampling (доля P - 0.15)

In [31]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(25, 15) (25, 15)


In [32]:
model = AdaBoostClassifier(n_estimators=10, random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
y_test_3 = sample_test.iloc[:,-2].values
y_predict_3 = y_predict
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results( 0 )
f1: 81.21%
roc: 77.80%
recall: 95.28%
precision: 70.76%


In [33]:
evaluate_results(y_test_1, y_predict_1, 0.85)
evaluate_results(y_test_4, y_predict_4, 0.45)
evaluate_results(y_test_2, y_predict_2, 0.25)
evaluate_results(y_test_3, y_predict_3, 0.15)

Classification results( 0.85 )
f1: 71.43%
roc: 87.50%
recall: 100.00%
precision: 55.56%
Classification results( 0.45 )
f1: 74.32%
roc: 76.98%
recall: 87.30%
precision: 64.71%
Classification results( 0.25 )
f1: 74.59%
roc: 73.23%
recall: 91.00%
precision: 63.19%
Classification results( 0.15 )
f1: 81.21%
roc: 77.80%
recall: 95.28%
precision: 70.76%
