In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
# data loding - already preprocessed in Dataset_Preprocessing.ipynb

data_gini = pd.read_csv('gini_split_data', index_col=0)
data_quantiles = pd.read_csv('quantile_split_data', index_col=0)

# Cropping dataset

After some experiments it turns out that doing lazy classification on 60k+ objects is not a good idea from a running time perspective. So I decided to cut some (a lot of) objects out of it in order to be able to apply lazy classifier with appropriate amount of running time.

In [3]:
data_gini['RainTomorrowNOT'] = (data_gini['RainTomorrow'] + 1) % 2

df1 = data_gini.sample(500, random_state=13, weights='RainTomorrow')
df2 = data_gini.sample(500, random_state=17, weights='RainTomorrowNOT')

In [4]:
new_data_gini = pd.concat((df1, df2)).drop('RainTomorrowNOT', axis=1)
new_data_gini = new_data_gini.sample(frac=1).reset_index(drop=True)
new_data_gini

Unnamed: 0,RainToday,RainTomorrow,Location_AliceSprings,Location_Brisbane,Location_Cairns,Location_Canberra,Location_Cobar,Location_CoffsHarbour,Location_Darwin,Location_Hobart,...,Pressure3pm_l,Pressure3pm_r,Cloud9am_l,Cloud9am_r,Cloud3pm_l,Cloud3pm_r,Temp9am_l,Temp9am_r,Temp3pm_l,Temp3pm_r
0,0,1,0,1,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1
2,0,1,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,1,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,1,1,0,0,1
996,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,1,0,1
997,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,1,0,0,1
998,0,0,0,0,0,0,0,0,1,0,...,0,1,1,0,1,0,0,1,0,1


In [5]:
data_quantiles['RainTomorrowNOT'] = (data_quantiles['RainTomorrow'] + 1) % 2

df1 = data_quantiles.sample(500, random_state=13, weights='RainTomorrow')
df2 = data_quantiles.sample(500, random_state=17, weights='RainTomorrowNOT')

In [6]:
new_data_quantiles = pd.concat((df1, df2)).drop('RainTomorrowNOT', axis=1)
new_data_quantiles = new_data_quantiles.sample(frac=1).reset_index(drop=True)
new_data_quantiles

Unnamed: 0,RainToday,RainTomorrow,Location_AliceSprings,Location_Brisbane,Location_Cairns,Location_Canberra,Location_Cobar,Location_CoffsHarbour,Location_Darwin,Location_Hobart,...,Cloud9am_3,Cloud3pm_1,Cloud3pm_2,Cloud3pm_3,Temp9am_1,Temp9am_2,Temp9am_3,Temp3pm_1,Temp3pm_2,Temp3pm_3
0,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,1,1,0,1,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,2,0,0,2,0
2,0,1,0,0,0,0,0,0,1,0,...,0,0,1,1,0,1,1,0,1,1
3,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,2,0,1,1,0
4,1,1,0,0,0,0,0,0,0,0,...,1,0,1,1,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0,2,0,1,1,0
996,0,1,0,0,0,0,0,0,0,0,...,1,0,1,1,0,2,0,0,1,1
997,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,2,0,0,2,0
998,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,1,1,0


# Lazy Classifier 1

In [5]:
# both train_positive and train_negative come without target variable

def lazy_classifier1(train_negative, train_positive, test, treshold):
    
    progress = 0
    # classification labels for test samples as a result
    result = []
    
    for _, test_sample in test.iterrows():
        
        count_negative = 0
        count_positive = 0
        
        if progress % 50 == 0:
            print("objects classified:", progress)
        
        for __, train_sample in train_negative.iterrows():
            
            # cardinality of  the intersection of: current test sample and current train sample
            intersection_count = np.sum(np.multiply(np.array(test_sample), np.array(train_sample)))
            
            if intersection_count >= treshold:
                
                count_negative += 1  
                
        for __, train_sample in train_positive.iterrows():
            
            # cardinality of  the intersection of: current test sample and current train sample
            intersection_count = np.sum(np.multiply(np.array(test_sample), np.array(train_sample)))
            
            if intersection_count >= treshold:
                
                count_positive += 1
        
        # voting process: if count_positive >= count_negative => test sample labeled by class 0, else by class 1
        result.append(int(count_positive >= count_negative))
        
        # number of objects classified
        progress += 1
    
    return np.array(result)
        
    

In [7]:
X_train, X_test, y_train, y_test = train_test_split(new_data_gini, 
                                                    new_data_gini['RainTomorrow'], 
                                                    test_size=0.2, 
                                                    random_state=42)

X_test.drop('RainTomorrow', axis=1, inplace=True)
train_negative = X_train[X_train['RainTomorrow'] == 0].drop('RainTomorrow', axis=1)
train_positive = X_train[X_train['RainTomorrow'] == 1].drop('RainTomorrow', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
for treshold in range(0, 15):

    pred = lazy_classifier1(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 100
160000
treshold: 0
accuracy score:  0.49
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


objects classified: 100
159994
treshold: 1
accuracy score:  0.49
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 100
159940
treshold: 2
accuracy score:  0.495
precision_score:  1.0
recall score:  0.00980392156862745
f1 score:  0.01941747572815534
------------------------------------------------------
objects classified: 0
objects classified: 100
159680
treshold: 3
accuracy score:  0.505
precision_score:  1.0
recall score:  0.029411764705882353
f1 score:  0.05714285714285715
------------------------------------------------------
objects classified: 0
objects classified: 100
158857
treshold: 4
accuracy score:  0.56
precision_score:  0.8888888888888888
recall score:  0.1568627450980392
f1 score:  0.26666666666666666
------------------------------------------------------
objects classified: 0
objects classified: 100
156853
treshold: 5
accuracy score:  0.585
precision_score:  0.851851851

In [9]:
for treshold in range(15, 40):

    pred = lazy_classifier1(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 100
9815
treshold: 15
accuracy score:  0.685
precision_score:  0.693069306930693
recall score:  0.6862745098039216
f1 score:  0.6896551724137931
------------------------------------------------------
objects classified: 0
objects classified: 100
3519
treshold: 16
accuracy score:  0.665
precision_score:  0.6605504587155964
recall score:  0.7058823529411765
f1 score:  0.6824644549763033
------------------------------------------------------
objects classified: 0
objects classified: 100
833
treshold: 17
accuracy score:  0.66
precision_score:  0.6287878787878788
recall score:  0.8137254901960784
f1 score:  0.7094017094017094
------------------------------------------------------
objects classified: 0
objects classified: 100
144
treshold: 18
accuracy score:  0.6
precision_score:  0.5647058823529412
recall score:  0.9411764705882353
f1 score:  0.7058823529411765
------------------------------------------------------
objects classified: 0
objects clas

KeyboardInterrupt: 

### best result for gini numerical split data:

- treshold: 15

- accuracy score:  0.685

- precision_score:  0.693069306930693

- recall score:  0.6862745098039216

- f1 score:  0.6896551724137931

### Now, lazy_classifier1 for quantiles dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(new_data_quantiles, 
                                                    new_data_quantiles['RainTomorrow'], 
                                                    test_size=0.2, 
                                                    random_state=42)

X_test.drop('RainTomorrow', axis=1, inplace=True)
train_negative = X_train[X_train['RainTomorrow'] == 0].drop('RainTomorrow', axis=1)
train_positive = X_train[X_train['RainTomorrow'] == 1].drop('RainTomorrow', axis=1)

In [7]:
for treshold in range(0, 15):

    pred = lazy_classifier1(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 0
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 1
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 2
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 3
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
-----------------------------------------------

In [13]:
for treshold in range(15, 50):

    pred = lazy_classifier1(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 15
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 16
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 17
accuracy score:  0.455
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
----------------------

objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 40
accuracy score:  0.645
precision_score:  0.6397058823529411
recall score:  0.7981651376146789
f1 score:  0.710204081632653
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 41
accuracy score:  0.62
precision_score:  0.6137931034482759
recall score:  0.8165137614678899
f1 score:  0.7007874015748031
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 42
accuracy score:  0.605
precision_score:  0.5961538461538461
recall score:  0.8532110091743119
f1 score:  0.7018867924528301
------------------------------------------------------
obje

KeyboardInterrupt: 

### best result for quantiles split data:

- treshold: 33
    
- accuracy score:  0.745
    
- precision_score:  0.7543859649122807
    
- recall score:  0.7889908256880734
    
- f1 score:  0.7713004484304933

# Lazy Classifier 2

In [17]:
# both train_positive and train_negative come without target variable

def lazy_classifier2(train_negative, train_positive, test, treshold):
    
    progress = 0
    # classification labels for test samples as a result
    result = []
    
    for _, test_sample in test.iterrows():
        
        count_negative = 0
        count_positive = 0
        
        intersection_neg = 0
        intersection_pos = 0
        
        if progress % 50 == 0:
            print("objects classified:", progress)
        
        for __, train_sample in train_negative.iterrows():
            
            # cardinality of  the intersection of: current test sample and current train sample
            intersection_count = np.sum(np.multiply(np.array(test_sample), np.array(train_sample)))
            
            if intersection_count >= treshold:
                
                count_negative += 1  
            
            intersection_neg += intersection_count
                
        for __, train_sample in train_positive.iterrows():
            
            # cardinality of  the intersection of: current test sample and current train sample
            intersection_count = np.sum(np.multiply(np.array(test_sample), np.array(train_sample)))
            
            if intersection_count >= treshold:
                
                count_positive += 1
            
            intersection_pos += intersection_count
        
        # fraction of non-empty intersections with + and - classes plus 1
        weight_neg = ((count_negative/train_negative.shape[0]) + 1)*(intersection_neg/train_negative.shape[1])
        weight_pos = ((count_positive/train_positive.shape[0]) + 1)*(intersection_pos/train_positive.shape[1])
        
        result.append(int(weight_pos >= weight_neg))
        
        # number of objects classified
        progress += 1
    
    return np.array(result)
        
    

In [9]:
for treshold in range(0, 20):

    pred = lazy_classifier2(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold: ", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  0
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  1
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  2
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
--------------------------------------

In [10]:
for treshold in range(20, 50):

    pred = lazy_classifier2(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold: ", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  20
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  21
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  22
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
-----------------------------------

objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  44
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  45
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  46
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
obj

In [11]:
for treshold in range(50, 100):

    pred = lazy_classifier2(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold: ", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  50
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  51
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  52
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
-----------------------------------

objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  74
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  75
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  76
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
obj

------------------------------------------------------
treshold:  98
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold:  99
accuracy score:  0.76
precision_score:  0.7654320987654321
recall score:  0.6813186813186813
f1 score:  0.7209302325581395
------------------------------------------------------


### best result, ginisplit dataset:

- treshold: 50

- accuracy score:  0.76

- precision_score:  0.7654320987654321

- recall score:  0.6813186813186813

- f1 score: 0.7209302325581395

### Now, lazy_classifier2 for quantiles dataset

In [12]:
for treshold in range(0, 15):

    pred = lazy_classifier2(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 0
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 1
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 2
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
-----------------------------------------

In [13]:
for treshold in range(15, 30):

    pred = lazy_classifier2(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 15
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 16
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 17
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
--------------------------------------

In [14]:
for treshold in range(30, 50):

    pred = lazy_classifier2(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 30
accuracy score:  0.72
precision_score:  0.7247706422018348
recall score:  0.7523809523809524
f1 score:  0.7383177570093459
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 31
accuracy score:  0.735
precision_score:  0.7363636363636363
recall score:  0.7714285714285715
f1 score:  0.7534883720930233
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 32
accuracy score:  0.745
precision_score:  0.7454545454545455
recall score:  0.780952380952381
f1 score:  0.7627906976744186
-------------------------------------

### best result, quantiles dataset:

- treshold: 34

- accuracy score:  0.765

- precision_score:  0.7735849056603774

- recall score:  0.780952380952381

- f1 score:  0.7772511848341234

# Lazy classifier 3

In [8]:
# both train_positive and train_negative come without target variable

def lazy_classifier3(train_negative, train_positive, test, treshold):
    
    progress = 0
    # classification labels for test samples as a result
    result = []
    
    for _, test_sample in test.iterrows():
        
        intersection_neg = 0
        intersection_pos = 0
        
        if progress % 50 == 0:
            print("objects classified:", progress)
        
        for __, train_sample in train_negative.iterrows():
            
            # cardinality of  the intersection of: current test sample and current train sample
            intersection_count = np.sum(np.multiply(np.array(test_sample), np.array(train_sample)))
            intersection_neg += intersection_count
                
        for __, train_sample in train_positive.iterrows():
            
            # cardinality of  the intersection of: current test sample and current train sample
            intersection_count = np.sum(np.multiply(np.array(test_sample), np.array(train_sample)))
            intersection_pos += intersection_count
        
        # fraction of non-empty intersections with + and - classes plus 1
        weight_neg = (intersection_neg/train_negative.shape[1])
        weight_pos = (intersection_pos/train_positive.shape[1])
        
        result.append(int((weight_pos/weight_neg) >= treshold))
        
        # number of objects classified
        progress += 1
    
    return np.array(result)

In [9]:
for treshold in range(0, 50):


    pred = lazy_classifier3(train_negative, train_positive, X_test, treshold)

    print("------------------------------------------------------")
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.445
precision_score:  0.445
recall score:  1.0
f1 score:  0.615916955017301
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.745
precision_score:  0.7209302325581395
recall score:  0.6966292134831461
f1 score:  0.7085714285714286
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
preci

objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
accuracy score:  0.555
precision_score:  0.0
recall

### best result, ginisplit dataset:

- accuracy score:  0.745

- precision_score:  0.7209302325581395

- recall score:  0.6966292134831461

- f1 score: 0.7085714285714286

### Now, lazy_classifier3 for quantiles dataset

In [16]:
for treshold in range(0, 50):

    pred = lazy_classifier3(train_negative, train_positive, X_test, treshold)
    
    print("------------------------------------------------------")
    print("treshold:", treshold)
    print("accuracy score: ", accuracy_score(np.array(y_test), pred))
    print("precision_score: ", precision_score(np.array(y_test), pred))
    print("recall score: ", recall_score(np.array(y_test), pred))
    print("f1 score: ", f1_score(np.array(y_test), pred))
    print("------------------------------------------------------")

objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 0
accuracy score:  0.525
precision_score:  0.525
recall score:  1.0
f1 score:  0.6885245901639345
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 1
accuracy score:  0.73
precision_score:  0.7628865979381443
recall score:  0.7047619047619048
f1 score:  0.7326732673267325
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 2
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 3
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 4
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 5
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
-----------------------------------------------

objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 31
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 32
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
------------------------------------------------------
treshold: 33
accuracy score:  0.475
precision_score:  0.0
recall score:  0.0
f1 score:  0.0
------------------------------------------------------
objects classified: 0
objects classified: 50
objects classified: 100
objects classified: 150
--------------------------------------------

### best result, quantiles split dataset:

- accuracy score:  0.73

- precision_score:  0.7628865979381443

- recall score:  0.7047619047619048

- f1 score: 0.7326732673267325