In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell runs.
%load_ext autoreload

%autoreload 2


import warnings


# Suppress every warning for the rest of the session (blanket 'ignore' filter).
warnings.filterwarnings('ignore')

### Load tfidf model

Download the TF-IDF model from https://russiansuperglue.com/tasks/tf_idf and save it as `tfidf.pkl` next to this notebook.

In [2]:
import pickle
import codecs
import joblib

In [3]:
# Load the TF-IDF model saved as tfidf.pkl in the working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load a tfidf.pkl obtained from a trusted source.
vect = joblib.load("tfidf.pkl")

### Load data

In [None]:
# Fetch the benchmark archive, unpack it into the working directory, then
# delete the downloaded archive.
# NOTE(review): the notebook later reads from "combined/", so the archive
# presumably unpacks to a folder with that name — confirm.
!wget https://russiansuperglue.com/tasks/download
!unzip download
!rm download

In [4]:
from pathlib import Path

In [5]:
# Root folder holding one sub-directory of .jsonl files per task.
data_dir = Path("combined/")

In [6]:
# Column-oriented accumulator for the summary table: one independent, initially
# empty list per column.
all_results = {column: [] for column in ("name", "train", "val", "test")}

In [7]:
# Folder that collects one prediction file per task for the submission.
output_dir = Path("baseline_submission")

In [9]:
# Create the submission folder in pure Python. Unlike the shell `mkdir`, this
# does not fail when the folder already exists, so the cell can be re-run.
output_dir.mkdir(parents=True, exist_ok=True)

In [10]:
import json


def save_output(data, path):
    """Write prediction records to ``path`` as JSON Lines, ordered by ``idx``.

    Args:
        data: Iterable of dict records. Each record's ``idx`` must be
            convertible with ``int`` (for example ``3`` or ``"3"``).
        path: Destination file (``str`` or ``pathlib.Path``). It is overwritten.

    Raises:
        TypeError: If a record has no ``idx`` (``int(None)``).
        ValueError: If an ``idx`` cannot be parsed as an integer.

    Every record is written on its own line with ``idx`` coerced to ``int`` and
    non-ASCII characters kept verbatim. The caller's records are not modified.
    """
    # Coerce idx on a shallow copy so the input stays untouched; re-assigning an
    # existing key keeps its original position, so the key order is unchanged.
    records = [{**line, "idx": int(line.get("idx"))} for line in data]
    # Validate and sort before opening, so a bad record cannot leave behind a
    # truncated output file. list.sort is stable, like sorted().
    records.sort(key=lambda record: record["idx"])
    # ensure_ascii=False emits raw Cyrillic, so the encoding must be pinned to
    # UTF-8 instead of being left to the platform default.
    with open(path, mode="w", encoding="utf-8") as file:
        for record in records:
            file.write(f"{json.dumps(record, ensure_ascii=False)}\n")

### PARus

1. build text: "{premise} {question} {choice1} {choice2}"
2. get tfidf
3. fit logreg


In [11]:
# PARus split files: <data_dir>/PARus/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "PARus" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [12]:
from tfidf_baseline.PARus import eval_PARus

In [13]:
# Run the PARus baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, PARus_scores = eval_PARus(train_path, val_path, test_path, vect)

In [14]:
# Display the PARus train and validation scores side by side.
PARus_scores["train"], PARus_scores["val"]

(0.7725, 0.44)

In [15]:
# Record the PARus row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("PARus")
all_results["train"].append(PARus_scores["train"])
all_results["val"].append(PARus_scores["val"])

In [16]:
# Write the PARus test predictions into the submission folder as PARus.jsonl.
save_output(PARus_scores["test_pred"], output_dir / "PARus.jsonl")

### RCB
1. build text: "{premise} {hypothesis}"
2. get tfidf
3. fit logreg

In [17]:
# RCB split files: <data_dir>/RCB/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "RCB" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [18]:
from tfidf_baseline.RCB import eval_RCB

In [19]:
# Run the RCB baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, RCB_scores = eval_RCB(train_path, val_path, test_path, vect)

In [20]:
# Display the RCB train and validation scores side by side.
RCB_scores["train"], RCB_scores["val"]

(0.7968036529680366, 0.5136363636363637)

In [21]:
# Record the RCB row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("RCB")
all_results["train"].append(RCB_scores["train"])
all_results["val"].append(RCB_scores["val"])

In [22]:
# Write the RCB test predictions into the submission folder as RCB.jsonl.
save_output(RCB_scores["test_pred"], output_dir / "RCB.jsonl")

### DaNetQA
1. build text: "{question}"
2. get tfidf
3. fit logreg

In [23]:
# DaNetQA split files: <data_dir>/DaNetQA/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "DaNetQA" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [24]:
from tfidf_baseline.DaNetQA import eval_DaNetQA

In [25]:
# Run the DaNetQA baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, DaNetQA_scores = eval_DaNetQA(train_path, val_path, test_path, vect)

In [26]:
# Display the DaNetQA train and validation scores side by side.
DaNetQA_scores["train"], DaNetQA_scores["val"]

(0.8010291595197255, 0.5907429963459196)

In [27]:
# Record the DaNetQA row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("DaNetQA")
all_results["train"].append(DaNetQA_scores["train"])
all_results["val"].append(DaNetQA_scores["val"])

In [28]:
# Write the DaNetQA test predictions into the submission folder as DaNetQA.jsonl.
save_output(DaNetQA_scores["test_pred"], output_dir / "DaNetQA.jsonl")

### TERRa
1. build text: "{premise} {hypothesis}"
2. get tfidf
3. fit logreg


In [29]:
# TERRa split files: <data_dir>/TERRa/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "TERRa" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [30]:
from tfidf_baseline.TERRa import eval_TERRa

In [31]:
# Run the TERRa baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, TERRa_scores = eval_TERRa(train_path, val_path, test_path, vect)

In [32]:
# Display the TERRa train and validation scores side by side.
TERRa_scores["train"], TERRa_scores["val"]

(0.7152140672782875, 0.46579804560260585)

In [33]:
# Record the TERRa row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("TERRa")
all_results["train"].append(TERRa_scores["train"])
all_results["val"].append(TERRa_scores["val"])

In [34]:
# Write the TERRa test predictions into the submission folder as TERRa.jsonl.
save_output(TERRa_scores["test_pred"], output_dir / "TERRa.jsonl")

### RWSD
1. build text: "{premise} {span1} {span2}"
2. get tfidf
3. fit logreg


In [35]:
# RWSD split files: <data_dir>/RWSD/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "RWSD" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [36]:
from tfidf_baseline.RWSD import eval_RWSD

In [37]:
# Run the RWSD baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, RWSD_scores = eval_RWSD(train_path, val_path, test_path, vect)

In [38]:
# Display the RWSD train and validation scores side by side.
RWSD_scores["train"], RWSD_scores["val"]

(0.5115511551155115, 0.553921568627451)

In [39]:
# Record the RWSD row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("RWSD")
all_results["train"].append(RWSD_scores["train"])
all_results["val"].append(RWSD_scores["val"])

In [40]:
# Write the RWSD test predictions into the submission folder as RWSD.jsonl.
save_output(RWSD_scores["test_pred"], output_dir / "RWSD.jsonl")

### RUSSE
1. build text: "{sentence1} {sentence2} {word}"
2. get tfidf
3. fit logreg

In [41]:
# RUSSE split files: <data_dir>/RUSSE/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "RUSSE" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [42]:
from tfidf_baseline.RUSSE import eval_RUSSE

In [43]:
# Run the RUSSE baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, RUSSE_scores = eval_RUSSE(train_path, val_path, test_path, vect)

In [44]:
# Display the RUSSE train and validation scores side by side.
RUSSE_scores["train"], RUSSE_scores["val"]

(0.7103552532123961, 0.6654908877131099)

In [45]:
# Record the RUSSE row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("RUSSE")
all_results["train"].append(RUSSE_scores["train"])
all_results["val"].append(RUSSE_scores["val"])

In [46]:
# Write the RUSSE test predictions into the submission folder as RUSSE.jsonl.
save_output(RUSSE_scores["test_pred"], output_dir / "RUSSE.jsonl")

### LiDiRus
1. build text: "{sentence1} {sentence2}"
2. get tfidf
3. fit logreg

In [47]:
# The train and validation splits are read from the TERRa folder; only the
# evaluated file comes from LiDiRus, which is referenced as a single .jsonl.
# NOTE(review): this looks intentional (LiDiRus appears to ship no train/val
# split of its own) — confirm before "fixing" the TERRa paths.
train_path = data_dir / "TERRa/train.jsonl"
val_path = data_dir / "TERRa/val.jsonl"
test_path = data_dir / "LiDiRus/LiDiRus.jsonl"

In [48]:
from tfidf_baseline.LiDiRus import eval_LiDiRus

In [49]:
# Run the LiDiRus baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val", "test" and "test_pred".
_, LiDiRus_scores = eval_LiDiRus(train_path, val_path, test_path, vect)

In [50]:
# Display the LiDiRus train score next to the "test" score (not "val", unlike
# the other task sections).
LiDiRus_scores["train"], LiDiRus_scores["test"]

(0.4294719661883857, 0.05974021843803689)

In [51]:
# Record the LiDiRus row for the summary table.
# Appending is not idempotent: re-running this cell adds a duplicate row.
# NOTE(review): the cell above displays LiDiRus_scores["test"], but the value
# stored in the "val" column here is LiDiRus_scores["val"], which was computed
# from the TERRa validation file — confirm which number the table should carry.
all_results["name"].append("LiDiRus")
all_results["train"].append(LiDiRus_scores["train"])
all_results["val"].append(LiDiRus_scores["val"])

In [52]:
# Write the LiDiRus test predictions into the submission folder as LiDiRus.jsonl.
save_output(LiDiRus_scores["test_pred"], output_dir / "LiDiRus.jsonl")

### RuCoS
1. build text of passage and queries
2. get tfidf of passage and queries
3. calculate cosine similarities between passage and queries
4. select the best one by cosine similarity

In [53]:
# RuCoS split files: <data_dir>/RuCoS/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "RuCoS" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [54]:
from tfidf_baseline.RuCoS import eval_RuCoS

In [55]:
# Run the RuCoS baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, RuCoS_scores = eval_RuCoS(train_path, val_path, test_path, vect)

In [56]:
# Display the RuCoS train and validation scores. Each one is a pair of numbers
# here rather than a single value (presumably two metrics, e.g. F1 and exact
# match — confirm against tfidf_baseline.RuCoS).
RuCoS_scores["train"], RuCoS_scores["val"]

((0.20824733699943207, 0.2263773335525391),
 (0.22964233865646033, 0.235315208305838))

In [57]:
# Record the RuCoS row for the summary table. The stored scores are tuples, so
# the "train" and "val" columns end up mixing floats and tuples.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("RuCoS")
all_results["train"].append(RuCoS_scores["train"])
all_results["val"].append(RuCoS_scores["val"])

In [59]:
# Write the RuCoS test predictions into the submission folder as RuCoS.jsonl.
save_output(RuCoS_scores["test_pred"], output_dir / "RuCoS.jsonl")

### MuSeRC
1. build text of passage and queries
2. get tfidf of passage and queries
3. calculate cosine similarities between passage and queries
4. select the best 2 by cosine similarity

In [60]:
# MuSeRC split files: <data_dir>/MuSeRC/{train,val,test}.jsonl
train_path, val_path, test_path = (
    data_dir / "MuSeRC" / f"{split}.jsonl" for split in ("train", "val", "test")
)

In [61]:
from tfidf_baseline.MuSeRC import eval_MuSeRC

In [62]:
# Run the MuSeRC baseline. The first returned value is discarded; the second is
# the scores mapping read below via the keys "train", "val" and "test_pred".
_, MuSeRC_scores = eval_MuSeRC(train_path, val_path, test_path, vect)

In [63]:
# Display the MuSeRC train and validation scores. Each one is a pair of numbers
# here rather than a single value (presumably two metrics — confirm against
# tfidf_baseline.MuSeRC).
MuSeRC_scores["train"], MuSeRC_scores["val"]

((0.2564722126337591, 0.5966356478167503),
 (0.2495274102079395, 0.5841053144807411))

In [64]:
# Record the MuSeRC row for the summary table. The stored scores are tuples, so
# the "train" and "val" columns end up mixing floats and tuples.
# Appending is not idempotent: re-running this cell adds a duplicate row.
all_results["name"].append("MuSeRC")
all_results["train"].append(MuSeRC_scores["train"])
all_results["val"].append(MuSeRC_scores["val"])

In [65]:
# Write the MuSeRC test predictions into the submission folder as MuSeRC.jsonl.
save_output(MuSeRC_scores["test_pred"], output_dir / "MuSeRC.jsonl")

### Overall results

In [66]:
import pandas as pd

In [67]:
# Drop the never-filled "test" column so all remaining columns have equal
# length. The default makes this safe to re-run after the key is already gone.
all_results.pop("test", None)

[]

In [68]:
# Turn the column lists into a table with one row per recorded task.
results = pd.DataFrame(all_results)

In [69]:
# Display the summary table.
results

Unnamed: 0,name,train,val
0,PARus,0.7725,0.44
1,RCB,0.796804,0.513636
2,DaNetQA,0.801029,0.590743
3,TERRa,0.715214,0.465798
4,RWSD,0.511551,0.553922
5,RUSSE,0.710355,0.665491
6,LiDiRus,0.429472,-0.0683523
7,RuCoS,"(0.20824733699943207, 0.2263773335525391)","(0.22964233865646033, 0.235315208305838)"
8,MuSeRC,"(0.2564722126337591, 0.5966356478167503)","(0.2495274102079395, 0.5841053144807411)"


In [70]:
# Save the summary table. Note the file is tab-separated despite the .csv
# extension, so read it back with sep="\t".
results.to_csv("results.csv", sep="\t")

#### Make submission file

In [71]:
# Pack the submission folder into baseline_submission.zip using the external
# 7z command-line tool, which must be installed.
# NOTE(review): `7z a` adds to an archive that already exists, so a stale zip
# from an earlier run should presumably be deleted first — confirm.
!7z a "baseline_submission.zip" $output_dir


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=C.UTF-8,Utf16=on,HugeFiles=on,64 bits,96 CPUs Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz (50654),ASM,AES-NI)

Scanning the drive:
  0M Sca        1 folder, 9 files, 1403214 bytes (1371 KiB)

Creating archive: baseline_submission.zip

Items to compress: 10

    
Files read from disk: 9
Archive size: 129050 bytes (127 KiB)
Everything is Ok


Submit at https://russiansuperglue.com/login/start_submit/

This submission should receive a total score of 0.434.