In [219]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import random
from statistics import mean
from sklearn.model_selection import train_test_split

In [220]:
column_names = ['en_speaker', 'instructor', 'course', 'semestr', 'size', 'target']
df = pd.read_csv('/content/drive/MyDrive/HSE/OSDA/tae.data', names = column_names, sep = '\t')

In [198]:
df['size'] = pd.qcut(df['size'], 5, ['Very_Low','Low','Medium','High','Very_High'])

In [199]:
for attr in ['en_speaker', 'instructor', 'course', 'semestr', 'size']:
    for i in df[attr].unique():
        df[attr + '_' + str(i)] = (df[attr] == i).astype(int)
    df = df.drop(attr, axis=1)

In [201]:
def intersection(dict_1, dict_2):
  return dict(set(dict_1.items()) & set(dict_2.items()))

In [202]:
def subset(dict_1, dict_2):
  return dict_1.items() <= dict_2.items()

In [203]:
def baseline_algorithm(X_test, X_pos, X_neg):  
  predictions = []
  for i_obj in range(len(X_test)):

    test_extent = X_test[i_obj]
    support_pos  = 0
    support_neg  = 0
    for j_extent in range(len(X_pos)):
      inter = intersection(test_extent, X_pos[j_extent])
      if len(inter) != 0:
        counter = 0
        for min_extent in range(len(X_neg)):
          if subset(inter, X_neg[min_extent]):
            counter += 1
        if counter == 0:
          support_pos += 1
    for k_extent in range(len(X_neg)):
      inter = intersection(test_extent, X_neg[k_extent])
      if len(inter) != 0:
        counter = 0
        for max_extent in range(len(X_pos)):
          if subset(inter, X_pos[max_extent]):
            counter += 1
        if counter == 0:
          support_neg += 1
    if support_neg == support_pos:
      predictions.append(random.choice([1, 0]))
    elif support_pos > support_neg:
      predictions.append(1)
    else:
      predictions.append(0)
  return predictions

In [224]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)
acc_res = []
roc_auc_res = []
prc_res = []
rcl_res = []
for train_index, test_index in kf.split(df):
    df_dev = df.iloc[train_index]
    df_test = df.iloc[test_index]
 
    plus_context = df_dev[df_dev['target'] == 1]
    minus_context = df_dev[df_dev['target'] == 0]
    
    X_plus = plus_context.drop("target", axis = 1).to_dict('records')
    X_minus = minus_context.drop("target", axis = 1).to_dict('records')
 
    X_test = df_test.drop("target", axis = 1).to_dict('records')
    y_test = df_test["target"].tolist()

    answers = baseline_algorithm(X_test, X_plus, X_minus)
    acc = metrics.accuracy_score(y_test, answers)
    acc_res.append(acc)
    roc_auc = metrics.roc_auc_score(y_test, answers)
    roc_auc_res.append(roc_auc)
    prc = metrics.precision_score(y_test, answers)
    prc_res.append(prc)
    rcl = metrics.recall_score(y_test, answers) 
    rcl_res.append(rcl)
print(mean(acc_res), mean(roc_auc_res), mean(prc_res), mean(rcl_res))

0.6242857142857142 0.6698232323232323 0.6518648018648019 0.6888888888888889


In [210]:
def algorithm_with_threshold(X_test, X_pos, X_neg, threshold = 5):  
  predictions = []
  for i_obj in range(len(X_test)):

    test_extent = X_test[i_obj]
    support_pos  = 0
    support_neg  = 0
    for j_extent in range(len(X_pos)):
      inter = intersection(test_extent, X_pos[j_extent])
      if len(inter) != 0:
        counter = 0
        for min_extent in range(len(X_neg)):
          if subset(inter, X_neg[min_extent]):
            counter += 1
        if counter < threshold:
          support_pos += 1


    for k_extent in range(len(X_neg)):
      inter = intersection(test_extent, X_neg[k_extent])
      if len(inter) != 0:
        counter = 0
        for max_extent in range(len(X_pos)):
          if subset(inter, X_pos[max_extent]):
            counter += 1
        if counter < threshold:
          support_neg += 1


    if support_neg == support_pos:
      predictions.append(random.choice([1, 0]))
    elif support_pos > support_neg:
      predictions.append(1)
    else:
      predictions.append(0)
  return predictions

0.7519047619047619 0.8016594516594516 0.8405194805194806 0.7277777777777777


In [215]:
x_train, x_test, y_train, y_test = train_test_split(df.drop('target', axis = 1), df['target'], train_size=0.8, random_state=42)

In [218]:
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred), metrics.roc_auc_score(y_test, y_pred), metrics.precision_score(y_test, y_pred), metrics.recall_score(y_test, y_pred))

0.7142857142857143 0.6944444444444444 0.7142857142857143 0.5555555555555556
