In [1]:
%load_ext autoreload

%autoreload 2


import warnings


# Silence every Python warning for the rest of the session so cell outputs stay short.
# NOTE(review): this also hides any warnings raised inside the tfidf_baseline evaluators.
warnings.filterwarnings('ignore')

### Load tfidf model

In [2]:
import pickle
import codecs
import joblib

In [3]:
# Load the vectorizer object that is handed to every eval_* call in this notebook
# (presumably an already-fitted TF-IDF vectorizer — confirm against how tfidf.pkl was built).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load tfidf.pkl obtained from a trusted source.
vect = joblib.load("ruentfidf/tfidf.pkl")

### Load data

In [None]:
!wget https://russiansuperglue.com/tasks/download
!unzip download
!rm download

In [4]:
from pathlib import Path

In [5]:
# Root folder of the task data; every task section reads "<Task>/<split>.jsonl" below it.
data_dir = Path("combined/")

In [6]:
# Column-oriented accumulator for the summary table: one independent list per
# column, to which each task section appends the entries it has.
all_results = {column: [] for column in ("name", "train", "val", "test")}

In [7]:
# Folder that receives one "<Task>.jsonl" file of test predictions per task.
output_dir = Path("baseline_submission")

In [None]:
!mkdir $output_dir

In [8]:
import jsonlines


def save_output(data, path):
    """Write the items of ``data`` to ``path`` in JSON Lines format.

    Args:
        data: Iterable of JSON-serialisable objects; each becomes one line.
        path: Destination file; it is opened in write mode, so existing
            content is replaced.
    """
    with jsonlines.open(path, mode="w") as sink:
        for record in data:
            sink.write(record)

### PARus

1. build text: "{premise} {question} {choice1} {choice2}"
2. get tfidf
3. fit logreg


In [9]:
# Point the shared split variables at the PARus train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "PARus/train.jsonl",
    data_dir / "PARus/val.jsonl",
    data_dir / "PARus/test.jsonl",
)

In [10]:
from tfidf_baseline.PARus import eval_PARus

In [11]:
_, PARus_scores = eval_PARus(train_path, val_path, test_path, vect)

In [12]:
PARus_scores["train"], PARus_scores["val"]

(0.775, 0.45)

In [13]:
# Add the PARus row to the summary: task name plus its train and val scores.
all_results["name"] += ["PARus"]
all_results["train"] += [PARus_scores["train"]]
all_results["val"] += [PARus_scores["val"]]

In [14]:
save_output(PARus_scores["test_pred"], output_dir / "PARus.jsonl")

### RCB
1. build text: "{premise} {hypothesis}"
2. get tfidf
3. fit logreg

In [15]:
# Point the shared split variables at the RCB train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "RCB/train.jsonl",
    data_dir / "RCB/val.jsonl",
    data_dir / "RCB/test.jsonl",
)

In [16]:
from tfidf_baseline.RCB import eval_RCB

In [17]:
_, RCB_scores = eval_RCB(train_path, val_path, test_path, vect)

In [18]:
RCB_scores["train"], RCB_scores["val"]

(0.7420091324200914, 0.5227272727272727)

In [19]:
# Add the RCB row to the summary: task name plus its train and val scores.
all_results["name"] += ["RCB"]
all_results["train"] += [RCB_scores["train"]]
all_results["val"] += [RCB_scores["val"]]

In [20]:
save_output(RCB_scores["test_pred"], output_dir / "RCB.jsonl")

### DaNetQA
1. build text: "{question}"
2. get tfidf
3. fit logreg

In [21]:
# Point the shared split variables at the DaNetQA train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "DaNetQA/train.jsonl",
    data_dir / "DaNetQA/val.jsonl",
    data_dir / "DaNetQA/test.jsonl",
)

In [22]:
from tfidf_baseline.DaNetQA import eval_DaNetQA

In [23]:
_, DaNetQA_scores = eval_DaNetQA(train_path, val_path, test_path, vect)

In [24]:
DaNetQA_scores["train"], DaNetQA_scores["val"]

(0.7755102040816326, 0.7457627118644068)

In [25]:
# Add the DaNetQA row to the summary: task name plus its train and val scores.
all_results["name"] += ["DaNetQA"]
all_results["train"] += [DaNetQA_scores["train"]]
all_results["val"] += [DaNetQA_scores["val"]]

In [26]:
save_output(DaNetQA_scores["test_pred"], output_dir / "DaNetQA.jsonl")

### TERRa
1. build text: "{premise} {hypothesis}"
2. get tfidf
3. fit logreg


In [27]:
# Point the shared split variables at the TERRa train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "TERRa/train.jsonl",
    data_dir / "TERRa/val.jsonl",
    data_dir / "TERRa/test.jsonl",
)

In [28]:
from tfidf_baseline.TERRa import eval_TERRa

In [29]:
_, TERRa_scores = eval_TERRa(train_path, val_path, test_path, vect)

In [30]:
TERRa_scores["train"], TERRa_scores["val"]

(0.7152140672782875, 0.46579804560260585)

In [31]:
# Add the TERRa row to the summary: task name plus its train and val scores.
all_results["name"] += ["TERRa"]
all_results["train"] += [TERRa_scores["train"]]
all_results["val"] += [TERRa_scores["val"]]

In [38]:
save_output(TERRa_scores["test_pred"], output_dir / "TERRa.jsonl")

### RWSD
1. build text: "{premise} {span1} {span2}"
2. get tfidf
3. fit logreg


In [33]:
# Point the shared split variables at the RWSD train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "RWSD/train.jsonl",
    data_dir / "RWSD/val.jsonl",
    data_dir / "RWSD/test.jsonl",
)

In [34]:
from tfidf_baseline.RWSD import eval_RWSD

In [35]:
_, RWSD_scores = eval_RWSD(train_path, val_path, test_path, vect)

In [36]:
RWSD_scores["train"], RWSD_scores["val"]

(0.5115511551155115, 0.553921568627451)

In [37]:
# Add the RWSD row to the summary: task name plus its train and val scores.
all_results["name"] += ["RWSD"]
all_results["train"] += [RWSD_scores["train"]]
all_results["val"] += [RWSD_scores["val"]]

In [39]:
save_output(RWSD_scores["test_pred"], output_dir / "RWSD.jsonl")

### RUSSE
1. build text: "{sentence1} {sentence2} {word}"
2. get tfidf
3. fit logreg

In [40]:
# Point the shared split variables at the RUSSE train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "RUSSE/train.jsonl",
    data_dir / "RUSSE/val.jsonl",
    data_dir / "RUSSE/test.jsonl",
)

In [41]:
from tfidf_baseline.RUSSE import eval_RUSSE

In [42]:
_, RUSSE_scores = eval_RUSSE(train_path, val_path, test_path, vect)

In [43]:
RUSSE_scores["train"], RUSSE_scores["val"]

(0.7103552532123961, 0.6653733098177542)

In [44]:
# Add the RUSSE row to the summary: task name plus its train and val scores.
all_results["name"] += ["RUSSE"]
all_results["train"] += [RUSSE_scores["train"]]
all_results["val"] += [RUSSE_scores["val"]]

In [45]:
save_output(RUSSE_scores["test_pred"], output_dir / "RUSSE.jsonl")

### LiDiRus
1. build text: "{sentence1} {sentence2}"
2. get tfidf
3. fit logreg

In [46]:
# LiDiRus is evaluated with a model fitted on the TERRa splits: train/val point
# at TERRa, and only the test file comes from the LiDiRus folder.
# NOTE(review): presumably intentional because LiDiRus ships a single
# diagnostic file and no training split of its own — confirm.
train_path, val_path, test_path = (
    data_dir / "TERRa/train.jsonl",
    data_dir / "TERRa/val.jsonl",
    data_dir / "LiDiRus/LiDiRuS.jsonl",
)

In [47]:
from tfidf_baseline.LiDiRus import eval_LiDiRus

In [48]:
_, LiDiRus_scores = eval_LiDiRus(train_path, val_path, test_path, vect)

In [50]:
LiDiRus_scores["train"], LiDiRus_scores["test"]

(0.4294719661883857, 0.05974021843803689)

In [51]:
# Add the LiDiRus row to the summary: task name plus its train and val scores.
# NOTE(review): the preceding display cell shows LiDiRus_scores["test"], while
# the "val" score is what gets recorded here — confirm which one the summary
# table is meant to hold.
all_results["name"] += ["LiDiRus"]
all_results["train"] += [LiDiRus_scores["train"]]
all_results["val"] += [LiDiRus_scores["val"]]

In [52]:
save_output(LiDiRus_scores["test_pred"], output_dir / "LiDiRus.jsonl")

### RuCoS
1. build text of passage and queries
2. get tfidf of passage and queries
3. calculate cosine similarities between passage and queries
4. select the best by cosine similarity

In [53]:
# Point the shared split variables at the RuCoS train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "RuCoS/train.jsonl",
    data_dir / "RuCoS/val.jsonl",
    data_dir / "RuCoS/test.jsonl",
)

In [54]:
from tfidf_baseline.RuCoS import eval_RuCoS

In [55]:
_, RuCoS_scores = eval_RuCoS(train_path, val_path, test_path, vect)

In [56]:
RuCoS_scores["train"], RuCoS_scores["val"]

((0.20824733699943207, 0.2263773335525391),
 (0.25308924485125855, 0.25950419527078566))

In [57]:
# Add the RuCoS row to the summary: task name plus its train and val scores
# (each score is a pair of values for this task, as the display cell shows).
all_results["name"] += ["RuCoS"]
all_results["train"] += [RuCoS_scores["train"]]
all_results["val"] += [RuCoS_scores["val"]]

In [58]:
save_output(RuCoS_scores["test_pred"], output_dir / "RuCoS.jsonl")

### MuSeRC
1. build text of passage and queries
2. get tfidf of passage and queries
3. calculate cosine similarities between passage and queries
4. select the best 2 by cosine similarity

In [59]:
# Point the shared split variables at the MuSeRC train / validation / test files.
train_path, val_path, test_path = (
    data_dir / "MuSeRC/train.jsonl",
    data_dir / "MuSeRC/val.jsonl",
    data_dir / "MuSeRC/test.jsonl",
)

In [60]:
from tfidf_baseline.MuSeRC import eval_MuSeRC

In [61]:
_, MuSeRC_scores = eval_MuSeRC(train_path, val_path, test_path, vect)

In [62]:
MuSeRC_scores["train"], MuSeRC_scores["val"]

((0.2140077821011673, 0.5475732090384031),
 (0.20982986767485823, 0.5207215992198928))

In [63]:
# Add the MuSeRC row to the summary: task name plus its train and val scores
# (each score is a pair of values for this task, as the display cell shows).
all_results["name"] += ["MuSeRC"]
all_results["train"] += [MuSeRC_scores["train"]]
all_results["val"] += [MuSeRC_scores["val"]]

In [64]:
save_output(MuSeRC_scores["test_pred"], output_dir / "MuSeRC.jsonl")

### Overall

In [65]:
import pandas as pd

In [68]:
# Drop the "test" column, which no task section ever fills, so that all
# remaining columns have the same length when the DataFrame is built.
# The None default keeps the cell re-runnable: a second run no longer raises
# KeyError once the key is gone.
all_results.pop("test", None)

[]

In [69]:
results = pd.DataFrame(all_results)

In [70]:
results

Unnamed: 0,name,train,val
0,PARus,0.775,0.45
1,RCB,0.742009,0.522727
2,DaNetQA,0.77551,0.745763
3,TERRa,0.715214,0.465798
4,RWSD,0.511551,0.553922
5,RUSSE,0.710355,0.665373
6,LiDiRus,0.429472,-0.0683523
7,RuCoS,"(0.20824733699943207, 0.2263773335525391)","(0.25308924485125855, 0.25950419527078566)"
8,MuSeRC,"(0.2140077821011673, 0.5475732090384031)","(0.20982986767485823, 0.5207215992198928)"


In [71]:
# Persist the summary table. Note the file is tab-separated even though it is
# named .csv, so readers must pass sep="\t" when loading it back.
results.to_csv("results.csv", sep="\t")

#### Make submission file

In [77]:
!zip "baseline_submission.zip" $output_dir

  adding: baseline_submission/ (stored 0%)


Submit at https://russiansuperglue.com/login/start_submit/

This submission should receive a total score of 0.372.