In [28]:
import requests
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

def fetch_moex_tickers():
    url = 'https://iss.moex.com/iss/engines/stock/markets/shares/securities.json'
    response = requests.get(url)
    data = response.json()
    securities = data['securities']['data']
    columns = data['securities']['columns']
    df = pd.DataFrame(securities, columns=columns)
    df_filtered = df[['SECID', 'SHORTNAME', 'SECNAME', 'LATNAME']].dropna()
    df_filtered = df_filtered[df_filtered['SECID'].apply(lambda x: len(str(x)) <= 6)]
    company_to_ticker = {}
    for _, row in df_filtered.iterrows():
        names = [row['SECNAME'], row['SHORTNAME'], row['LATNAME']]
        for name in names:
            company_to_ticker[name.lower()] = row['SECID']
    return company_to_ticker

model_name = "sberbank-ai/ruBert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].numpy()
    return embedding

company_to_ticker = fetch_moex_tickers()
company_names = list(company_to_ticker.keys())
embeddings = [get_embedding(name)[0] for name in company_names]

target_name = "–ì–∞–∑–ø—Ä–æ–º"
target_embedding = get_embedding(target_name)[0]
similarities = cosine_similarity([target_embedding], embeddings)[0]
best_idx = similarities.argmax()
best_match_name = company_names[best_idx]
best_match_ticker = company_to_ticker[best_match_name]

print(f"Best match: {best_match_name}")
print(f"Ticker: {best_match_ticker}")
print(f"Cosine similarity: {similarities[best_idx]:.4f}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Best match: —Ä–æ—Å–Ω–µ—Ñ—Ç—å
Ticker: ROSN
Cosine similarity: 0.9045


In [236]:
df_filtered = pd.read_excel(r"C:\Users\–û–ª—å–≥–∞\–í–ö–† –ö–∞—Ä–ø–µ–Ω–∫–æ\data markup\filtered_with_tickers_2.xlsx")
df_filtered

Unnamed: 0,–¢–µ–∫—Å—Ç,Ticker,Correct
0,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...",BELU,1
1,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...",SVAV,1
2,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...",NVTK,1
3,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...",TRNFP,1
4,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,,0
...,...,...,...
374,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,,1
375,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...",BELU,1
376,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...",CNRU,1
377,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...",CNRU,1


## testing re + rapidfuzz

In [246]:
import re

df_filtered['company'] = df_filtered['–¢–µ–∫—Å—Ç'].apply(lambda text: re.findall(r'¬´(.*?)¬ª|"(.*?)"|‚Äú(.*?)‚Äù|\((.*?)\)', text))

df_filtered['company'] = df_filtered['company'].apply(lambda matches: [match for group in matches for match in group if match])

df_filtered =  df_filtered[['–¢–µ–∫—Å—Ç', 'company', 'Ticker']]
df_filtered

Unnamed: 0,–¢–µ–∫—Å—Ç,company,Ticker
0,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...","[–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø, –ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø]",BELU
1,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...","[–°–æ–ª–ª–µ—Ä—Å, —á–∏—Å—Ç—ã–π –¥–æ–ª–≥/EBITDA, –°–æ–ª–ª–µ—Ä—Å, —á–∏—Å—Ç—ã–π ...",SVAV
2,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...","[–ù–û–í–ê–¢–≠–ö, –ù–û–í–ê–¢–≠–ö, –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É, –ù—å—é—Ç–æ–Ω –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏]",NVTK
3,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...","[–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏, –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏, –Ø –Ω–µ —Å–ª—ã—à–∞–ª –æ —Ç–∞–∫–æ–º, ...",TRNFP
4,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,"[–±–æ–ª–µ–µ $15 –º–ª—Ä–¥, –ü–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –∏–¥—É—Ç, –∏ —Å—É–º–º–∞, –∫–æ—Ç...",
...,...,...,...
374,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,"[–†–æ—Å–Ω–∞–Ω–æ, —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–≥–æ –º–µ—Ö–∞–Ω–∏–∑–º–∞ –≤—ã...",
375,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...","[–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø, –†–∞—Å—Å–º–æ—Ç—Ä–µ–Ω–∏–µ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç–∏ –ø—Ä–æ–≤–µ...",BELU
376,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...","[–¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω]",CNRU
377,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...","[–¶–∏–∞–Ω, –¶–∏–∞–Ω, –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–∞, –¶–∏–∞–Ω]",CNRU


In [247]:
import requests
import pandas as pd
from rapidfuzz import fuzz

def fetch_moex_tickers():
    url = 'https://iss.moex.com/iss/engines/stock/markets/shares/securities.json'
    response = requests.get(url)
    data = response.json()
    securities = data['securities']['data']
    columns = data['securities']['columns']
    df = pd.DataFrame(securities, columns=columns)
    df_filtered = df[['SECID', 'SHORTNAME', 'SECNAME', 'LATNAME']].dropna()
    df_filtered = df_filtered[df_filtered['SECID'].apply(lambda x: len(str(x)) <= 6)]
    company_to_ticker = {}
    for _, row in df_filtered.iterrows():
        names = [row['SECNAME'], row['SHORTNAME'], row['LATNAME']]
        for name in names:
            company_to_ticker[name.lower()] = row['SECID']
    return company_to_ticker

company_to_ticker = fetch_moex_tickers()
company_names = list(company_to_ticker.keys())

def find_ticker_by_top3(companies):
    for company in companies[:3]:  
        best_match_name = None
        best_score = 0
        for name in company_names:
            score = fuzz.partial_ratio(company.lower(), name.lower())
            if score > best_score:
                best_score = score
                best_match_name = name
        if best_score >= 85:  
            return company_to_ticker[best_match_name]
    return None

df_filtered['Predicted'] = df_filtered['company'].apply(find_ticker_by_top3)

df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Predicted'] = df_filtered['company'].apply(find_ticker_by_top3)


Unnamed: 0,–¢–µ–∫—Å—Ç,company,Ticker,Predicted
0,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...","[–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø, –ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø]",BELU,BELU
1,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...","[–°–æ–ª–ª–µ—Ä—Å, —á–∏—Å—Ç—ã–π –¥–æ–ª–≥/EBITDA, –°–æ–ª–ª–µ—Ä—Å, —á–∏—Å—Ç—ã–π ...",SVAV,SVAV
2,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...","[–ù–û–í–ê–¢–≠–ö, –ù–û–í–ê–¢–≠–ö, –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É, –ù—å—é—Ç–æ–Ω –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏]",NVTK,NVTK
3,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...","[–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏, –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏, –Ø –Ω–µ —Å–ª—ã—à–∞–ª –æ —Ç–∞–∫–æ–º, ...",TRNFP,TRNFP
4,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,"[–±–æ–ª–µ–µ $15 –º–ª—Ä–¥, –ü–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –∏–¥—É—Ç, –∏ —Å—É–º–º–∞, –∫–æ—Ç...",,
...,...,...,...,...
374,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,"[–†–æ—Å–Ω–∞–Ω–æ, —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–≥–æ –º–µ—Ö–∞–Ω–∏–∑–º–∞ –≤—ã...",,
375,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...","[–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø, –†–∞—Å—Å–º–æ—Ç—Ä–µ–Ω–∏–µ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç–∏ –ø—Ä–æ–≤–µ...",BELU,BELU
376,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...","[–¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω, –¶–∏–∞–Ω]",CNRU,CNRU
377,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...","[–¶–∏–∞–Ω, –¶–∏–∞–Ω, –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–∞, –¶–∏–∞–Ω]",CNRU,CNRU


In [59]:
df_filtered.to_excel('df_filtered.xlsx', index=False)

In [248]:
if 'Ticker' in df_filtered.columns and 'Predicted' in df_filtered.columns:
    df_filtered['is_correct'] = df_filtered.apply(
        lambda row: (row['Ticker'] == row['Predicted']) or 
                    (pd.isna(row['Ticker']) and pd.isna(row['Predicted'])), axis=1)
    total = len(df_filtered)
    correct = df_filtered['is_correct'].sum()
    accuracy = correct / total if total > 0 else 0
    print(f"Accuracy of ticker matching: {accuracy:.4f}")
    print(f"Out of {total} records, {correct} were predicted correctly.")
else:
    print("Columns 'Ticker' and 'Predicted' were not found in the DataFrame.")

Accuracy of ticker matching: 0.7414
Out of 379 records, 281 were predicted correctly.


## Step 2 NER + TF IDF

In [237]:
from natasha import (
    Segmenter,
    NewsEmbedding,
    NewsNERTagger,
    Doc
)

segmenter = Segmenter()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)

def extract_companies_natasha(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    companies = [span.text for span in doc.spans if span.type == 'ORG']
    
    return companies

df_filtered['company_natasha'] = df_filtered['–¢–µ–∫—Å—Ç'].apply(extract_companies_natasha)


df_companies_natasha = df_filtered[['–¢–µ–∫—Å—Ç', 'company_natasha', 'Ticker']].copy()

df_companies_natasha['company_natasha'] = df_companies_natasha['company_natasha'].apply(
    lambda companies: ['–¢-—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–∏' if ('–†–æ—Å–±–∞–Ω–∫' in c or 'T-–±–∞–Ω–∫' in c) else c for c in companies]
)

df_companies_natasha

Unnamed: 0,–¢–µ–∫—Å—Ç,company_natasha,Ticker
0,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...","[–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø, –ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø, LTM, LTM]",BELU
1,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...","[–°–æ–ª–ª–µ—Ä—Å, –ü–ê–û ""–°–æ–ª–ª–µ—Ä—Å"", –°–æ–ª–ª–µ—Ä—Å–∞, –°–æ–ª–ª–µ—Ä—Å]",SVAV
2,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...","[–ù–û–í–ê–¢–≠–ö, –ü–ê–û ""–ù–û–í–ê–¢–≠–ö"", –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É, –ë–∞–Ω–∫–∞ –†–æ—Å...",NVTK
3,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...","[–ú–∏–Ω—Ñ–∏–Ω, –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏, –ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ —Ñ–∏–Ω–∞–Ω—Å–æ–≤, –¢—Ä...",TRNFP
4,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,"[SoftBank Group, Financial Times, OpenAI, Soft...",
...,...,...,...
374,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,"[–†–æ—Å–Ω–∞–Ω–æ, –ê–û ""–†–æ—Å–Ω–∞–Ω–æ"", –†–æ—Å–Ω–∞–Ω–æ, –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å, –†–æ...",
375,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...","[–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø, Winelab**, –í–∏–Ω–õ–∞–±]",BELU
376,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...","[–¶–∏–∞–Ω, –ú–ö–ü–ê–û, –ú–ö–ü–ê–û ""–¶–∏–∞–Ω, –¶–∏–∞–Ω, –ú–ö–ü–ê–û]",CNRU
377,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...","[–ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–∞, –¶–∏–∞–Ω]",CNRU


In [241]:
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def fetch_moex_tickers():
    url = 'https://iss.moex.com/iss/engines/stock/markets/shares/securities.json'
    response = requests.get(url)
    data = response.json()
    securities = data['securities']['data']
    columns = data['securities']['columns']
    df = pd.DataFrame(securities, columns=columns)
    df_filtered = df[['SECID', 'SHORTNAME', 'SECNAME', 'LATNAME']].dropna()
    df_filtered = df_filtered[df_filtered['SECID'].apply(lambda x: len(str(x)) <= 6)]
    company_to_ticker = {}
    for _, row in df_filtered.iterrows():
        names = [row['SECNAME'], row['SHORTNAME'], row['LATNAME']]
        for name in names:
            company_to_ticker[name.lower()] = row['SECID']
    
    company_to_ticker.update({
        '–∏–≤–∞—Ç': 'IVAT',
        'iva technologies': 'IVAT',
        '–Ω–æ—Ä–Ω–∏–∫–µ–ª—å': 'GMKN',
        '–Ω–æ—Ä–∏–ª—å—Å–∫–∏–π –Ω–∏–∫–µ–ª—å': 'GMKN',
        'fix price': 'FIXP',
        '—Ñ–∏–∫—Å –ø—Ä–∞–π—Å': 'FIXP',
        'c–º–∞—Ä—Ç—Ç–µ—Ö–≥—Ä—É–ø–ø': 'CARM',
        "–æ'–∫–µ–π": 'OKEY'
    })
    
    remove_words = ['—Ö–æ–ª–¥–∏–Ω–≥', 'holding', '–∏–Ω—Ç–µ—Ä–Ω–µ—à–∏–æ–Ω–∞–ª', 'international']
    cleaned_company_to_ticker = {}
    for key, value in company_to_ticker.items():
        cleaned_key = key
        for word in remove_words:
            cleaned_key = cleaned_key.replace(word, '').strip()
        cleaned_company_to_ticker[cleaned_key] = value
    
    return cleaned_company_to_ticker


company_to_ticker = fetch_moex_tickers()
company_names = list(company_to_ticker.keys())
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5))
tfidf_matrix = vectorizer.fit_transform(company_names)

def find_ticker_by_top3(companies):
    for company in companies[:3]:
        org_norm = company.lower()
        
        target_vector = vectorizer.transform([org_norm])
        similarities = cosine_similarity(target_vector, tfidf_matrix)[0]
        best_idx = similarities.argmax()
        best_similarity = similarities[best_idx]
        if best_similarity >= 0.5:
            best_match_name = company_names[best_idx]
            return company_to_ticker[best_match_name]
    return None


df_companies_natasha['Predicted'] = df_companies_natasha['company_natasha'].apply(find_ticker_by_top3)

df_companies_natasha

Unnamed: 0,–¢–µ–∫—Å—Ç,company_natasha,Ticker,Predicted,is_correct
0,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...","[–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø, –ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø, LTM, LTM]",BELU,BELU,True
1,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...","[–°–æ–ª–ª–µ—Ä—Å, –ü–ê–û ""–°–æ–ª–ª–µ—Ä—Å"", –°–æ–ª–ª–µ—Ä—Å–∞, –°–æ–ª–ª–µ—Ä—Å]",SVAV,SVAV,True
2,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...","[–ù–û–í–ê–¢–≠–ö, –ü–ê–û ""–ù–û–í–ê–¢–≠–ö"", –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É, –ë–∞–Ω–∫–∞ –†–æ—Å...",NVTK,NVTK,True
3,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...","[–ú–∏–Ω—Ñ–∏–Ω, –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏, –ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ —Ñ–∏–Ω–∞–Ω—Å–æ–≤, –¢—Ä...",TRNFP,TRNFP,True
4,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,"[SoftBank Group, Financial Times, OpenAI, Soft...",,,True
...,...,...,...,...,...
374,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,"[–†–æ—Å–Ω–∞–Ω–æ, –ê–û ""–†–æ—Å–Ω–∞–Ω–æ"", –†–æ—Å–Ω–∞–Ω–æ, –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å, –†–æ...",,,True
375,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...","[–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø, Winelab**, –í–∏–Ω–õ–∞–±]",BELU,BELU,True
376,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...","[–¶–∏–∞–Ω, –ú–ö–ü–ê–û, –ú–ö–ü–ê–û ""–¶–∏–∞–Ω, –¶–∏–∞–Ω, –ú–ö–ü–ê–û]",CNRU,CNRU,True
377,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...","[–ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–∞, –¶–∏–∞–Ω]",CNRU,CNRU,True


In [242]:
if 'Ticker' in df_companies_natasha.columns and 'Predicted' in df_companies_natasha.columns:
    df_companies_natasha['is_correct'] = df_companies_natasha.apply(
        lambda row: (row['Ticker'] == row['Predicted']) or 
                    (pd.isna(row['Ticker']) and pd.isna(row['Predicted'])), axis=1)
    total = len(df_companies_natasha)
    correct = df_companies_natasha['is_correct'].sum()
    accuracy = correct / total if total > 0 else 0
    print(f"Accuracy of ticker matching: {accuracy:.4f}")
    print(f"Out of {total} records, {correct} were predicted correctly.")
else:
    print("Columns 'Ticker' and 'Predicted' were not found in the DataFrame.")


Accuracy of ticker matching: 0.9472
Out of 379 records, 359 were predicted correctly.


In [243]:
if 'Ticker' in df_companies_natasha.columns and 'Predicted' in df_companies_natasha.columns:
    df_error = df_companies_natasha[~(
        (df_companies_natasha['Ticker'] == df_companies_natasha['Predicted']) |
        ((df_companies_natasha['Ticker'].fillna(0) == 0) & (df_companies_natasha['Predicted'].fillna(0) == 0))
    )]
df_error

Unnamed: 0,–¢–µ–∫—Å—Ç,company_natasha,Ticker,Predicted,is_correct
22,**Skillbox Holding —Ä–µ–¥–æ–º–∏—Ü–∏–ª–∏—Ä–æ–≤–∞–Ω –≤ –†–§**\n\nS...,"[Skillbox Holding Limited, –ï–ì–†–Æ–õ, –ú–ö–ê–û ""–°–∫–∏–ª–±–æ...",T,,False
33,**üè¶BNY Mellon –≤ 2024 –≥–æ–¥—É –ø–æ–ª—É—á–∏–ª —Ä–µ–∫–æ—Ä–¥–Ω—É—é —á–∏...,[Bank of New York Mellon],,MBNK,False
34,"**–¶–ë –†–§ –∑–∞—Ä–µ–≥–∏—Å—Ç—Ä–∏—Ä–æ–≤–∞–ª –≤—ã–ø—É—Å–∫ –∞–∫—Ü–∏–π –ú–ö–ê–û ""–°–∫–∏...","[–¶–ë, –ú–ö–ê–û ""–°–∫–∏–ª–±–æ–∫—Å –•–æ–ª–¥–∏–Ω–≥, –ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏, –ú–ö–ê–û...",WUSH,SBER,False
45,**–°–±–µ—Ä –≤–æ—Å—Å—Ç–∞–Ω–æ–≤–∏–ª –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ—Å—Ç—å –∫–∞–ø–∏—Ç–∞–ª–∞ –¥–æ –Ω...,[],SBER,,False
47,**‚öñÔ∏è–°—É–¥ –æ—Ç–∫–∞–∑–∞–ª –≤ —á–∞—Å—Ç–∏—á–Ω–æ–π –æ—Ç–º–µ–Ω–µ –æ–±–µ—Å–ø–µ—á–∏—Ç–µ–ª...,"[–ê—Ä–±–∏—Ç—Ä–∞–∂–Ω—ã–π —Å—É–¥, –ë–æ—Ä–µ—Ü, –û–û–û "" –ë–æ—Ä–µ—Ü –ö–∞–ø–∏—Ç–∞–ª, ...",0,AKGD,False
62,"**–•–ö ""–ú–µ—Ç–∞–ª–ª–æ–∏–Ω–≤–µ—Å—Ç"" –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–ª–∞ –æ–±—ä–µ–º —Ä–∞–∑–º–µ...","[–•–ö ""–ú–µ—Ç–∞–ª–ª–æ–∏–Ω–≤–µ—Å—Ç"", –•–ö ""–ú–µ—Ç–∞–ª–ª–æ–∏–Ω–≤–µ—Å—Ç"", –ò–Ω—Ç–µ—Ä...",,IRAO,False
67,**–ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫ –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–ª –æ–±—ä–µ–º —Ä–∞–∑–º–µ—â–µ–Ω–∏—è ...,"[–ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫, –ü–ê–û ""–ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫"", –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å...",PSGM,IRAO,False
109,**JetLend –≤ —Ä–∞–º–∫–∞—Ö IPO –ø—Ä–∏–≤–ª–µ–∫ —á—É—Ç—å –º–µ–Ω—å—à–µ 500...,"[–ü–ê–û ""–î–∂–µ—Ç–õ–µ–Ω–¥ –•–æ–ª–¥–∏–Ω–≥, –î–∂–µ—Ç–õ–µ–Ω–¥ –•–æ–ª–¥–∏–Ω–≥–∞, –°–ü–ë...",,SPBE,False
118,"**–ê–∫—Ü–∏–æ–Ω–µ—Ä—ã –ì–ö ""–û'–ö–µ–π"" —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Ä–µ—à–µ–Ω–∏–µ –æ —Ä–µ–¥...","[–ì–ö ""–û'–ö–µ–π, –û'–ö–µ–π, –û'–∫–µ–π –ì—Ä—É–ø, –ú–ö–ü–ê–û]",,OKEY,False
122,"**–§–ü–ö ""–ì–∞—Ä–∞–Ω—Ç-–∏–Ω–≤–µ—Å—Ç"" –¥–æ–ø—É—Å—Ç–∏–ª–∞ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –≤...","[–§–ü–ö ""–ì–∞—Ä–∞–Ω—Ç-–∏–Ω–≤–µ—Å—Ç"", –¶–ë, –ì–∞—Ä–∞–Ω—Ç-–∏–Ω–≤–µ—Å—Ç]",,RUSI,False
