In [249]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import random
from statistics import mean
from sklearn.model_selection import train_test_split

In [263]:
df_train = pd.read_csv('/content/drive/MyDrive/HSE/OSDA/train1.csv')
df_train.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,o,x,positive
2,x,x,x,x,o,o,o,b,b,positive
3,x,x,x,x,o,o,b,o,b,positive
4,x,x,x,x,o,o,b,b,o,positive


In [251]:
df_test = pd.read_csv('/content/drive/MyDrive/HSE/OSDA/test1.csv')
df_test.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,o,x,o,positive
1,x,x,x,x,o,b,o,b,o,positive
2,x,x,x,o,o,x,o,x,o,positive
3,x,x,x,o,o,b,x,o,b,positive
4,x,x,x,b,o,b,o,o,x,positive


In [252]:
def scaling(df):
    for i in range(9):
        str_i = str(i + 1)
        df['v' + str_i] = (df['V' + str_i] == 'x').astype(int)
    df['v10'] = (df['V10'] == 'positive').astype(int)
    df.drop(['V' + str(i+1) for i in range(10)], axis=1, inplace = True)
    return df

In [253]:
def intersection(dict_1, dict_2):
  return dict(set(dict_1.items()) & set(dict_2.items()))

In [254]:
def subset(dict_1, dict_2):
  return dict_1.items() <= dict_2.items()

In [255]:
df_train_scaled = scaling(df_train)
pos_examples = df_train_scaled[df_train_scaled['v10'] == 1].drop('v10', axis = 1).to_dict('records')
neg_examples = df_train_scaled[df_train_scaled['v10'] == 0].drop('v10', axis = 1).to_dict('records')
df_test_scaled = scaling(df_test)
x_test = df_test_scaled.drop('v10', axis = 1).to_dict('records')
y_test = df_test_scaled['v10']

In [256]:
def baseline_algorithm(X_test, X_pos, X_neg):  
  predictions = []
  for i_obj in range(len(X_test)):

    test_extent = X_test[i_obj]
    support_pos  = 0
    support_neg  = 0
    for j_extent in range(len(X_pos)):
      inter = intersection(test_extent, X_pos[j_extent])
      if len(inter) != 0:
        counter = 0
        for min_extent in range(len(X_neg)):
          if subset(inter, X_neg[min_extent]):
            counter += 1
        if counter == 0:
          support_pos += 1
    for k_extent in range(len(X_neg)):
      inter = intersection(test_extent, X_neg[k_extent])
      if len(inter) != 0:
        counter = 0
        for max_extent in range(len(X_pos)):
          if subset(inter, X_pos[max_extent]):
            counter += 1
        if counter == 0:
          support_neg += 1
    if support_neg == support_pos:
      predictions.append(random.choice([1, 0]))
    elif support_pos > support_neg:
      predictions.append(1)
    else:
      predictions.append(0)
  return predictions

In [None]:
def algorithm_with_threshold(X_test, X_pos, X_neg, threshold = 5):  
  predictions = []
  for i_obj in range(len(X_test)):

    test_extent = X_test[i_obj]
    support_pos  = 0
    support_neg  = 0
    for j_extent in range(len(X_pos)):
      inter = intersection(test_extent, X_pos[j_extent])
      if len(inter) != 0:
        counter = 0
        for min_extent in range(len(X_neg)):
          if subset(inter, X_neg[min_extent]):
            counter += 1
        if counter < threshold:
          support_pos += 1


    for k_extent in range(len(X_neg)):
      inter = intersection(test_extent, X_neg[k_extent])
      if len(inter) != 0:
        counter = 0
        for max_extent in range(len(X_pos)):
          if subset(inter, X_pos[max_extent]):
            counter += 1
        if counter < threshold:
          support_neg += 1


    if support_neg == support_pos:
      predictions.append(random.choice([1, 0]))
    elif support_pos > support_neg:
      predictions.append(1)
    else:
      predictions.append(0)
  return predictions

In [257]:
y_pred = baseline_algorithm(x_test, pos_examples, neg_examples)

In [261]:
kf = KFold(n_splits=5, random_state=None, shuffle=True)
acc_res = []
roc_auc_res = []
prc_res = []
rcl_res = []
for train_index, test_index in kf.split(df_train_scaled):
    df_dev = df_train_scaled.iloc[train_index]
    df_test = df_train_scaled.iloc[test_index]
 
    plus_context = df_dev[df_dev['v10'] == 1]
    minus_context = df_dev[df_dev['v10'] == 0]
    
    X_plus = plus_context.drop("v10", axis = 1).to_dict('records')
    X_minus = minus_context.drop("v10", axis = 1).to_dict('records')
 
    X_test = df_test.drop("v10", axis = 1).to_dict('records')
    y_test = df_test["v10"].tolist()

    answers = baseline_algorithm(X_test, X_plus, X_minus)
    acc = metrics.accuracy_score(y_test, answers)
    acc_res.append(acc)
    roc_auc = metrics.roc_auc_score(y_test, answers)
    roc_auc_res.append(roc_auc)
    prc = metrics.precision_score(y_test, answers)
    prc_res.append(prc)
    rcl = metrics.recall_score(y_test, answers) 
    rcl_res.append(rcl)
print(mean(acc_res), mean(roc_auc_res), mean(prc_res), mean(rcl_res))

0.9491329479768786 0.9252195445837688 0.9288327916775688 1.0


In [258]:
kf = KFold(n_splits=5, random_state=None, shuffle=True)
acc_res = []
roc_auc_res = []
prc_res = []
rcl_res = []
for train_index, test_index in kf.split(df_train_scaled):
    df_dev = df_train_scaled.iloc[train_index]
    df_test = df_train_scaled.iloc[test_index]
 
    plus_context = df_dev[df_dev['v10'] == 1]
    minus_context = df_dev[df_dev['v10'] == 0]
    
    X_plus = plus_context.drop("v10", axis = 1).to_dict('records')
    X_minus = minus_context.drop("v10", axis = 1).to_dict('records')
 
    X_test = df_test.drop("v10", axis = 1).to_dict('records')
    y_test = df_test["v10"].tolist()

    answers = algorithm_with_threshold(X_test, X_plus, X_minus)
    acc = metrics.accuracy_score(y_test, answers)
    acc_res.append(acc)
    roc_auc = metrics.roc_auc_score(y_test, answers)
    roc_auc_res.append(roc_auc)
    prc = metrics.precision_score(y_test, answers)
    prc_res.append(prc)
    rcl = metrics.recall_score(y_test, answers) 
    rcl_res.append(rcl)
print(mean(acc_res), mean(roc_auc_res), mean(prc_res), mean(rcl_res))

0.976878612716763 0.9663685787710948 0.9665356561514469 1.0


In [270]:
x_train, x_test, y_train, y_test = train_test_split(df_train_scaled.drop('v10', axis = 1), df_train_scaled['v10'], train_size=0.8, random_state=42)

In [271]:
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred), metrics.roc_auc_score(y_test, y_pred), metrics.precision_score(y_test, y_pred), metrics.recall_score(y_test, y_pred))

0.7687861271676301 0.7137464387464387 0.753731343283582 0.9351851851851852
