In [1]:
import numpy as np
from typing import List
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer

import re

from scipy.spatial.distance import cosine

from matplotlib import pyplot as plt
import seaborn as sns

In [3]:
def preprocessor(text: str) -> str:
    """Normalize raw text before tokenization.

    The text is lowercased, every run of one or more ASCII punctuation
    characters is replaced by a single space, and the letter 'ё' is folded
    into 'е'.

    Args:
        text: Raw input string.

    Returns:
        The normalized string. Whitespace is not collapsed, so a punctuation
        run next to an existing space leaves two consecutive spaces.
    """
    lowered = text.lower()
    # Inside the character class `,-.` is a range; it spans exactly ',', '-' and '.'.
    without_punct = re.sub(r'[!\"#$%&\'()*\+,-./:;<=>?@\[\\\]^_`{|}~]+', ' ', lowered)
    return without_punct.replace('ё', 'е')


def stemm_tokenizer(text: str, stemmer, tknzr) -> str:
    """Clean, tokenize and stem ``text``, returning the stems as one string.

    Args:
        text: Raw input string; it is passed through ``preprocessor`` first.
        stemmer: Object exposing ``stem(word)``.
        tknzr: Object exposing ``tokenize(text)`` that returns an iterable of tokens.

    Returns:
        The stems of all purely alphabetic tokens, joined by single spaces.
        Tokens for which ``str.isalpha()`` is false are dropped.
    """
    stems = []
    for token in tknzr.tokenize(preprocessor(text)):
        # Skip numbers, emoji, mixed alphanumerics and anything else non-alphabetic.
        if token.isalpha():
            stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [4]:

def f1_score(precision_list, recall_list):
    """Compute element-wise F1 scores from paired precision and recall values.

    F1 is the harmonic mean ``2 * p * r / (p + r)``. Where ``p + r == 0`` the
    score is defined as 0.0 instead of performing a 0/0 division, which would
    produce NaN and a RuntimeWarning.

    Args:
        precision_list: Array-like of precision values.
        recall_list: Array-like of recall values, broadcastable against
            ``precision_list``.

    Returns:
        numpy.ndarray of float F1 scores with the broadcast shape of the inputs.
    """
    prec = np.asarray(precision_list, dtype=float)
    rec = np.asarray(recall_list, dtype=float)
    denom = prec + rec
    # Pre-fill with zeros; positions where denom == 0 are skipped by `where`
    # and therefore keep the 0.0 default.
    f1 = np.zeros_like(denom, dtype=float)
    np.divide(2 * prec * rec, denom, out=f1, where=denom != 0)
    return f1

In [29]:
# Execute the helper scripts so that the names they define are loaded into the
# notebook namespace. MyExperiment, used in later cells, is not defined or
# imported anywhere else in this notebook, so it presumably comes from one of
# these files — TODO confirm.
# NOTE(review): this cell's execution count (29) is out of sequence with the
# neighbouring cells (4 and 6); verify the notebook with Restart & Run All.
%run experiment_template.py
%run loss.py

In [6]:
# Russian Snowball stemmer and a tweet-aware tokenizer; both are captured by
# the vectorizer's tokenizer callback below.
stemmer = SnowballStemmer('russian')
tokenizer = TweetTokenizer()

# TF-IDF vectorizer over stemmed tokens. A term is kept only if it occurs in
# at least 3 documents (min_df) and in at most 30% of the documents (max_df).
# stemm_tokenizer returns one space-joined string, so it is split back into
# the list of tokens that the `tokenizer` callback has to return.
tfidf = TfidfVectorizer(
    tokenizer=lambda text: stemm_tokenizer(text, stemmer, tokenizer).split(),
    min_df=3,
    max_df=0.3,
)

In [7]:
# Load the news dataset and keep only the rows whose `text` is not missing.
data = pd.read_csv('../data/test_data.csv').dropna(subset=['text'])

In [8]:
# Learn the vocabulary and IDF weights from the `text` column of the filtered dataset.
tfidf.fit(data.text)



TfidfVectorizer(max_df=0.3, min_df=3,
                tokenizer=<function <lambda> at 0x0000017A198681F0>)

In [9]:
# Annotated pairs (Toloka results, judging by the file name); the `new_1` and
# `new_2` columns identify the two members of each pair.
comm_res = pd.read_csv('../data/toloka_result_pairs.csv')

# Sorted, de-duplicated set of every value occurring on either side of a pair.
# Must be computed before the groupby below turns both columns into the index.
valid_inds = np.union1d(comm_res['new_1'].values, comm_res['new_2'].values)

# Collapse repeated (new_1, new_2) pairs into one row, keeping the column-wise maximum.
comm_res = comm_res.groupby(['new_1', 'new_2']).max()

# Reproducible 50/50 split of the pairs into train and test parts.
comm_res_train, comm_res_test = train_test_split(comm_res, train_size=0.5, random_state=42)

In [30]:
# Build the experiment on the news rows selected by valid_inds, together with
# the TF-IDF vectorizer, the 'Cos_Last' specification and 0.9 — presumably the
# threshold (info() reports 'Specification' and 'Threshold' fields); confirm
# against MyExperiment.__init__.
# NOTE(review): data.loc[valid_inds] assumes every value in valid_inds is an
# index label of `data`; it raises KeyError if one of them belonged to a row
# that was dropped for missing `text`.
myexp = MyExperiment(data.loc[valid_inds], tfidf, 'Cos_Last',0.9)

In [37]:
# Run the experiment without arguments — presumably using the threshold given
# at construction; confirm against MyExperiment.run.
myexp.run()

Succesful run


In [44]:
# Show the experiment summary (specification, threshold, counts and chains).
# NOTE(review): this cell's execution count (44) is later than that of the
# `myexp.run(0.1)` cell (42) placed further down, which presumably explains
# why the output reports 'Threshold': 0.1 rather than the 0.9 passed to the
# constructor. The output also contains the complete 'Chains' list and is
# very long; consider showing only the summary fields.
myexp.info()

{'Specification': 'Cos_Last', 'Threshold': 0.1, 'Num of news': 2992, 'Num of chains': 2991, 'Max chain length': 2, 'Min chain length': 1, 'Chains': [array([2], dtype=int64), array([3], dtype=int64), array([9], dtype=int64), array([10], dtype=int64), array([11], dtype=int64), array([19], dtype=int64), array([22], dtype=int64), array([24], dtype=int64), array([26], dtype=int64), array([34], dtype=int64), array([47], dtype=int64), array([48], dtype=int64), array([49], dtype=int64), array([51], dtype=int64), array([53], dtype=int64), array([57], dtype=int64), array([58], dtype=int64), array([68], dtype=int64), array([80], dtype=int64), array([81], dtype=int64), array([82], dtype=int64), array([88], dtype=int64), array([89], dtype=int64), array([90], dtype=int64), array([97], dtype=int64), array([98], dtype=int64), array([107], dtype=int64), array([112], dtype=int64), array([113], dtype=int64), array([120], dtype=int64), array([125], dtype=int64), array([127], dtype=int64), array([130], dty

In [32]:
# Report precision and recall on the train and test parts.
# NOTE(review): executed as In [32], i.e. before the `myexp.run()` cell above
# was last executed (In [37]); re-run the notebook top to bottom to make sure
# the numbers below belong to that run.
myexp.evaluate()

Result of evaluation: 
Train precision: 0.4371973796639134
Train Recall:1.0
Test Precision:0.4347949886104784
TestRecall:1.0


In [42]:
# Re-run the experiment passing 0.1 — presumably a threshold that overrides
# the 0.9 given at construction; confirm against MyExperiment.run.
myexp.run(0.1)

Succesful run


In [43]:
# Report precision and recall again, this time for the run(0.1) results
# (this cell was executed as In [43], directly after that run as In [42]).
myexp.evaluate()

Result of evaluation: 
Train precision: 1.0
Train Recall:0.004560260586319218
Test Precision:0.5
TestRecall:0.0006548788474132286
