In [2]:
import requests
from bs4 import BeautifulSoup
import pymorphy2
from tqdm import tqdm
import time
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (no-op when it is already present locally).
nltk.download('stopwords')
# Russian and English stop words merged into one set for O(1) membership tests.
stop_words = set(stopwords.words('russian')).union(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Shared pymorphy2 analyzer instance; used when loading queries to replace
# each word with the normal form of its first parse.
morph = pymorphy2.MorphAnalyzer()

In [4]:
# Query id -> query text in which every space-separated word has been replaced
# by the normal form of its first pymorphy2 parse.
num2q_dict = dict()
with open('Data/queries_b.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tmp = line.strip().split('\t')
        # Blank lines, or rows whose text column is empty (strip() removes the
        # trailing tab), have fewer than two fields; skip them instead of
        # failing with IndexError on tmp[1].
        if len(tmp) < 2:
            continue
        # Only the first occurrence of a query id is kept.
        if tmp[0] not in num2q_dict:
            num2q_dict[tmp[0]] = ' '.join(
                morph.parse(w)[0].normal_form for w in tmp[1].split(' ')
            )

# Title id -> title text; the second column is stored exactly as read.
num2title_dict = dict()
with open('Data/titles_norm.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tmp = line.strip().split('\t')
        # Same guard as above: rows without a text column are ignored, so the
        # id is simply absent from the mapping.
        if len(tmp) < 2:
            continue
        # Unlike queries, a repeated id overwrites the earlier entry.
        num2title_dict[tmp[0]] = tmp[1]

In [5]:
# Flatten both id -> text mappings into plain lists, in dict insertion order.
title = list(num2title_dict.values())

query = list(num2q_dict.values())

In [6]:
# For every first-column id (query id), collect the second-column ids found in
# the tab-separated marks file and in the comma-separated sample file.
# Duplicates are kept and order of appearance is preserved.
q2url_dict = dict()

with open('Data/train.marks.tsv','r') as marks_file:
    for row in marks_file.readlines():
        fields = row.strip().split('\t')
        query_id, doc_id = fields[0], fields[1]
        q2url_dict.setdefault(query_id, []).append(doc_id)

with open('Data/sample.csv','r') as sample_file:
    sample_file.readline()  # the first line is discarded (presumably a CSV header)
    for row in sample_file.readlines():
        fields = row.strip().split(',')
        query_id, doc_id = fields[0], fields[1]
        q2url_dict.setdefault(query_id, []).append(doc_id)

In [7]:
# Feature: cosine similarity between query and title TF-IDF vectors built on
# single characters. The vectorizer is fitted on all titles.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,1), encoding='utf-8')
vectorizer.fit(title)
# These two matrices are not read in this cell; they are still computed so the
# notebook-level names keep existing for any other cell that relies on them.
title_transformed = vectorizer.transform(title)
query_transformed = vectorizer.transform(query)

# NOTE(review): the file name appears to start with a Cyrillic 'с' rather than
# a Latin 'c'; it is left untouched because readers of this file must use the
# exact same name -- confirm.
with open('feat_title/сosine_tfidf_char1_1.txt', 'w') as f:
    for q in tqdm(q2url_dict):
        if q not in num2q_dict:
            continue
        # Keep only documents with a known title, filtered once per query.
        doc_ids = [d for d in q2url_dict[q] if d in num2title_dict]
        if not doc_ids:
            continue
        b = vectorizer.transform([num2q_dict[q]]).toarray().ravel()
        # Vectorize all titles of this query in one call instead of one
        # transform() per (query, title) pair; rows are densified one at a
        # time to keep memory usage flat.
        doc_matrix = vectorizer.transform([num2title_dict[d] for d in doc_ids])
        for i, d in enumerate(doc_ids):
            a = doc_matrix[i].toarray().ravel()
            # NOTE(review): for an all-zero vector (text with no known
            # characters) the cosine is undefined -- confirm what scipy
            # returns in that case and whether downstream code accepts it.
            cosin_sim = 1-spatial.distance.cosine(a, b)
            f.write(q+'\t'+d+'\t'+str(cosin_sim)+'\n')

100%|██████████████████████████████████████████████████████████████████████████████| 6311/6311 [05:52<00:00, 17.88it/s]


In [None]:
# Feature: cosine similarity between query and title TF-IDF vectors built on
# character 2- and 3-grams. The vectorizer is fitted on all titles.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,3), encoding='utf-8')
vectorizer.fit(title)
# These two matrices are not read in this cell; they are still computed so the
# notebook-level names keep existing for any other cell that relies on them.
title_transformed = vectorizer.transform(title)
query_transformed = vectorizer.transform(query)

# NOTE(review): the file name appears to start with a Cyrillic 'с' rather than
# a Latin 'c'; it is left untouched because readers of this file must use the
# exact same name -- confirm.
with open('feat_title/сosine_tfidf_char2_3.txt', 'w') as f:
    for q in tqdm(q2url_dict):
        if q not in num2q_dict:
            continue
        # Keep only documents with a known title, filtered once per query.
        doc_ids = [d for d in q2url_dict[q] if d in num2title_dict]
        if not doc_ids:
            continue
        b = vectorizer.transform([num2q_dict[q]]).toarray().ravel()
        # Vectorize all titles of this query in one call instead of one
        # transform() per (query, title) pair; rows are densified one at a
        # time because the n-gram vocabulary can be large.
        doc_matrix = vectorizer.transform([num2title_dict[d] for d in doc_ids])
        for i, d in enumerate(doc_ids):
            a = doc_matrix[i].toarray().ravel()
            # NOTE(review): for an all-zero vector (text with no known
            # n-grams) the cosine is undefined -- confirm what scipy returns
            # in that case and whether downstream code accepts it.
            cosin_sim = 1-spatial.distance.cosine(a, b)
            f.write(q+'\t'+d+'\t'+str(cosin_sim)+'\n')

In [None]:
# Feature: cosine similarity on character 3..13-gram TF-IDF vectors, where the
# vectorizer is re-fitted for every query on that query's own titles only.
# NOTE(review): the file name appears to start with a Cyrillic 'с' rather than
# a Latin 'c'; it is left untouched because readers of this file must use the
# exact same name -- confirm.
with open('feat_title/сosine_tfidf_char3_13.txt', 'w') as f:
    for q in tqdm(q2url_dict):
        if q not in num2q_dict:
            continue
        # Keep only documents with a known title, filtered once per query.
        doc_ids = [d for d in q2url_dict[q] if d in num2title_dict]
        if not doc_ids:
            # Nothing to fit on: TfidfVectorizer.fit([]) raises ValueError, and
            # there would be no rows to write for this query anyway.
            continue
        docs = [num2title_dict[d] for d in doc_ids]
        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3,13), encoding='utf-8')
        # fit() still raises ValueError if none of the titles yields a single
        # 3-gram (all shorter than three characters); that case is not masked.
        vectorizer.fit(docs)
        b = vectorizer.transform([num2q_dict[q]]).toarray().ravel()
        # One batched transform per query; rows are densified one at a time.
        doc_matrix = vectorizer.transform(docs)
        for i, d in enumerate(doc_ids):
            a = doc_matrix[i].toarray().ravel()
            # NOTE(review): a query sharing no n-gram with its titles gives an
            # all-zero vector, for which the cosine is undefined -- confirm
            # what scipy returns and whether downstream code accepts it.
            cosin_sim = 1-spatial.distance.cosine(a, b)
            f.write(q+'\t'+d+'\t'+str(cosin_sim)+'\n')

In [1]:
from rank_bm25 import  BM25L #to create bm25Plus and bm25 features just change BM25L to what you need

In [48]:
def remove_stopwords(input_string):
    """Return ``input_string`` without the tokens listed in ``stop_words``.

    The text is split on single spaces only and the surviving tokens are
    joined back with single spaces, so empty tokens produced by repeated
    spaces are preserved. Matching against the module-level ``stop_words``
    collection is exact (case-sensitive).
    """
    return ' '.join(
        token for token in input_string.split(' ') if token not in stop_words
    )

In [49]:
def convert_to_ngramms(input_string):
    """Tokenize a string into character 1-, 2- and 3-grams.

    Spaces are removed first, so n-grams run across word boundaries. The
    result lists all single characters, then all bigrams, then all trigrams,
    each group in left-to-right order. Strings too short for a given size
    simply contribute nothing for that size.
    """
    compact = input_string.replace(' ', '')
    ngrams = []
    for size in (1, 2, 3):
        last_start = len(compact) - size
        ngrams.extend(compact[start:start + size] for start in range(last_start + 1))
    return ngrams

In [53]:
# Feature: BM25L score of each title against its query. Both texts have stop
# words removed and are tokenized into character 1/2/3-grams; the BM25 corpus
# is rebuilt for every query from that query's own titles only.
# NOTE(review): the file name appears to start with a Cyrillic 'с' and contains
# a space; it is left untouched because readers of this file must use the exact
# same name -- confirm.
with open('feat_title/сosine_ bm25L.txt', 'w') as f:
    for q in tqdm(q2url_dict):
        if q not in num2q_dict:
            continue
        # Filter once so ids and scores stay aligned by construction instead
        # of through a hand-maintained counter over a repeated filter.
        doc_ids = [d for d in q2url_dict[q] if d in num2title_dict]
        if not doc_ids:
            # No titled documents: nothing to score or write. An empty corpus
            # is presumably not accepted by rank_bm25 (it averages document
            # length over the corpus size) -- skip it explicitly.
            continue
        tokenized_corpus = [
            convert_to_ngramms(remove_stopwords(num2title_dict[d])) for d in doc_ids
        ]
        bm25 =  BM25L(tokenized_corpus)
        tokenized_query = convert_to_ngramms(remove_stopwords(num2q_dict[q]))
        doc_scores = bm25.get_scores(tokenized_query)
        for d, score in zip(doc_ids, doc_scores):
            f.write(q+'\t'+d+'\t'+str(score)+'\n')

100%|██████████████████████████████████████████████████████████████████████████████| 6311/6311 [01:12<00:00, 87.18it/s]
