In [1]:
pip install natasha

Collecting natasha
  Obtaining dependency information for natasha from https://files.pythonhosted.org/packages/32/9c/bb9d33c13564bcc939bb727087ef51b16ed3b49cc3b8fdec07c87b02f1de/natasha-1.6.0-py3-none-any.whl.metadata
  Using cached natasha-1.6.0-py3-none-any.whl.metadata (23 kB)
Collecting pymorphy2 (from natasha)
  Obtaining dependency information for pymorphy2 from https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl.metadata
  Using cached pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting razdel>=0.5.0 (from natasha)
  Obtaining dependency information for razdel>=0.5.0 from https://files.pythonhosted.org/packages/15/2c/664223a3924aa6e70479f7d37220b3a658765b9cfe760b4af7ffdc50d38f/razdel-0.5.0-py3-none-any.whl.metadata
  Using cached razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)
Collecting navec>=0.9.0 (from natasha)
  Obtaining dependency information for navec>=0.9.0 from https://fi

In [3]:
pip install rapidfuzz

Collecting rapidfuzz
  Obtaining dependency information for rapidfuzz from https://files.pythonhosted.org/packages/c9/5a/d00e1f63564050a20279015acb29ecaf41646adfacc6ce2e1e450f7f2633/rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl.metadata
  Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
    --------------------------------------- 0.0/1.6 MB 660.6 kB/s eta 0:00:03
   -- ------------------------------------- 0.1/1.6 MB 1.3 MB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.6 MB 2.2 MB/s eta 0:00:01
   --------- ------------------------------ 0.4/1.6 MB 2.2 MB/s eta 0:00:01
   -------------- ------------------------- 0.6/1.6 MB 3.1 MB/s eta 0:00:01
   --------------- ------------------------ 0.6/1.6 MB 2.7 MB/s eta 0:00:01
   ------------------------ --------------- 1.0/1.6 MB 3.3 MB/s eta 0:00:01
   ----------------------

In [117]:
import requests
import pandas as pd
from pymorphy2 import MorphAnalyzer
from rapidfuzz import fuzz

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)
morph = MorphAnalyzer()

def fetch_moex_tickers():
    url = 'https://iss.moex.com/iss/engines/stock/markets/shares/securities.json'
    response = requests.get(url)
    data = response.json()
    securities = data['securities']['data']
    columns = data['securities']['columns']
    df = pd.DataFrame(securities, columns=columns)
    df_filtered = df[['SECID', 'SHORTNAME', 'SECNAME']].dropna()
    company_to_ticker = {}
    for _, row in df_filtered.iterrows():
        names = [row['SECNAME'], row['SHORTNAME']]
        for name in names:
            company_to_ticker[name.lower()] = row['SECID']
    return company_to_ticker

def normalize_text(text):
    return ' '.join([morph.parse(word)[0].normal_form for word in text.split()])

def extract_company_names(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    orgs_normalized = [span.normal for span in doc.spans if span.type == 'ORG']
    orgs_original = [span.text for span in doc.spans if span.type == 'ORG']
    
    # –£–¥–∞–ª—è–µ–º —à—É–º–æ–≤—ã–µ —Ñ—Ä–∞–∑—ã
    stopwords = {"—Å–¥", "—Å–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤", "–∞–∫—Ü–∏–æ–Ω–µ—Ä–æ–≤", "–º—Å—Ñ–æ"} 
    result = []
    for original, normalized in zip(orgs_original, orgs_normalized):
        norm_text = normalized.lower()
        if norm_text not in stopwords:
            print(f"Original: {original} -> Normalized: {normalized}")
            result.append(norm_text)
    return result

def get_ticker_from_text(text, company_to_ticker):
    orgs = extract_company_names(text)
    for org in orgs:
        org_norm = normalize_text(org)
        best_match = None
        best_score = 0
        for key, ticker in company_to_ticker.items():
            score = fuzz.partial_ratio(org_norm, key)
            if score > best_score and score > 80: 
                best_match = (key, ticker)
                best_score = score
        if best_match:
            return best_match
    return None, None

# Example usage
if __name__ == "__main__":
    sample_text = """
–ì–∞–∑–ø—Ä–æ–º —á—Ç–æ-—Ç–æ —Ç–∞–º –≤–æ–æ–±—â–µ —Ç–∞–∫–æ–µ
"""
    company_to_ticker = fetch_moex_tickers()
    company_name, ticker = get_ticker_from_text(sample_text, company_to_ticker)
    if ticker:
        print(f"‚úÖ Extracted Company Name: {company_name}")
        print(f"‚úÖ Extracted Ticker: {ticker}")
    else:
        print("‚ùå No matching company or ticker found.")


Original: –ì–∞–∑–ø—Ä–æ–º -> Normalized: –ì–∞–∑–ø—Ä–æ–º
‚úÖ Extracted Company Name: "–≥–∞–∑–ø—Ä–æ–º" (–ø–∞–æ) –∞–æ
‚úÖ Extracted Ticker: GAZP


In [106]:
file_path = r"C:\Users\–ö–∞—Ä–ø–µ–Ω–∫–æ\–í–ö–† –ö–∞—Ä–ø–µ–Ω–∫–æ\data makeup\filtered_interfax_data.xlsx"

# –ó–∞–≥—Ä—É–∂–∞–µ–º Excel-—Ñ–∞–π–ª
df = pd.read_excel(file_path)

# –§–∏–ª—å—Ç—Ä—É–µ–º —Å—Ç—Ä–æ–∫–∏, –≥–¥–µ Label=1
filtered_df = df[df['Label'] == 1]

# –í—ã–≤–æ–¥–∏–º –ø–µ—Ä–≤—ã–µ —Å—Ç—Ä–æ–∫–∏ –¥–ª—è –ø—Ä–æ–≤–µ—Ä–∫–∏
filtered_df

Unnamed: 0,–î–∞—Ç–∞,–¢–µ–∫—Å—Ç,Label
6,2025-01-30 13:09:25,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...",1.0
8,2025-01-30 11:39:26,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...",1.0
9,2025-01-30 11:27:51,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...",1.0
10,2025-01-30 11:02:33,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...",1.0
13,2025-01-30 08:02:19,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,1.0
...,...,...,...
1504,2025-04-01 08:21:01,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,1.0
1508,2025-04-01 07:14:17,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...",1.0
1512,2025-04-01 07:03:16,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...",1.0
1513,2025-04-01 07:00:48,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...",1.0


In [None]:
correctness = []

for index, row in filtered_df.iterrows():
    print(f"\nüì∞ News:\n{row['–¢–µ–∫—Å—Ç']}")
    print(f"üîé Extracted Ticker: {row['Ticker']}")
    
    user_input = input("0 or 1")
    
    if user_input not in ['0', '1']:
        print("Error")
        user_input = '0'
    
    correctness.append(int(user_input))

df_filtered['Correct'] = correctness

# –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç
output_path = r"C:\Users\–ö–∞—Ä–ø–µ–Ω–∫–æ\–í–ö–† –ö–∞—Ä–ø–µ–Ω–∫–æ\model\filtered_with_tickers_corrected.xlsx"
df_filtered.to_excel(output_path, index=False)

##  error correction

In [10]:
# –ü–æ–¥—Å—á–µ—Ç —Ç–æ—á–Ω–æ—Å—Ç–∏
total = len(df_filtered)
correct = df_filtered['Correct'].sum()
accuracy = correct / total

print(f"üìä Accuracy (—Ç–æ—á–Ω–æ—Å—Ç—å): {accuracy:.4f} ({correct}/{total})")

üìä Accuracy (—Ç–æ—á–Ω–æ—Å—Ç—å): 0.7018 (266/379)


In [57]:
df_filtered = pd.read_excel(r"C:\Users\–ö–∞—Ä–ø–µ–Ω–∫–æ\–í–ö–† –ö–∞—Ä–ø–µ–Ω–∫–æ\model\filtered_with_tickers_corrected.xlsx")
df_filtered

Unnamed: 0,–î–∞—Ç–∞,–¢–µ–∫—Å—Ç,Label,Ticker,Correct
0,2025-01-30 13:09:25,"üç∑**EBITDA LTM ""–ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø"" –Ω–∞ 30 –∏—é–Ω—è 2024 ...",1,BELU,1
1,2025-01-30 11:39:26,"üî∫**""–°–æ–ª–ª–µ—Ä—Å"" –ø–æ–¥–Ω—è–ª –ø–æ—Ä–æ–≥–æ–≤–æ–µ –∑–Ω–∞—á–µ–Ω–∏–µ —É—á–∏—Ç—ã–≤–∞...",1,SVAV,1
2,2025-01-30 11:27:51,"üóì**""–ù–û–í–ê–¢–≠–ö"" 6 —Ñ–µ–≤—Ä–∞–ª—è –ø—Ä–æ–≤–µ–¥–µ—Ç —Å–±–æ—Ä –∑–∞—è–≤–æ–∫ –Ω–∞...",1,NVTK,1
3,2025-01-30 11:02:33,"**–ú–∏–Ω—Ñ–∏–Ω –Ω–µ —Å–ª—ã—à–∞–ª –æ –ø–ª–∞–Ω–∞—Ö ""–¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏"" —Å–Ω–∏–∑–∏...",1,TRNFP,1
4,2025-01-30 08:02:19,**üí∞SoftBank Group –≤–µ–¥–µ—Ç –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã –æ–± –∏–Ω–≤–µ—Å—Ç–∏—Ü...,1,BSPB,0
...,...,...,...,...,...
374,2025-04-01 08:21:01,**–†–æ—Å–Ω–∞–Ω–æ –¥–æ–ø—É—Å—Ç–∏–ª–æ —Ç–µ—Ö–¥–µ—Ñ–æ–ª—Ç –ø–æ –∫—É–ø–æ–Ω—É –æ–±–ª–∏–≥–∞...,1,Not found,0
375,2025-04-01 07:14:17,"üóì**–°–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤ ""–ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø"" 2 –∞–ø—Ä–µ–ª—è —Ä...",1,BELU,1
376,2025-04-01 07:03:16,"**""–¶–∏–∞–Ω"" –æ–¥–Ω–æ–≤—Ä–µ–º–µ–Ω–Ω–æ —Å–æ —Å—Ç–∞—Ä—Ç–æ–º —Ç–æ—Ä–≥–æ–≤ –Ω–∞ –ú–æ—Å...",1,CNRU,1
377,2025-04-01 07:00:48,"**""–¶–∏–∞–Ω"" –≤ IV –∫–≤–∞—Ä—Ç–∞–ª–µ –Ω–∞—Ä–∞—Å—Ç–∏–ª –≤—ã—Ä—É—á–∫—É –Ω–∞ 5%,...",1,CNRU,1


## error correction

In [171]:
import re
import requests
import pandas as pd
from natasha import Segmenter, NewsNERTagger, NewsEmbedding, MorphVocab, Doc
from pymorphy2 import MorphAnalyzer
from rapidfuzz import fuzz

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
ner_tagger = NewsNERTagger(emb)
morph = MorphAnalyzer()

def clean_company_name(name):
    keywords_to_remove = ['plc', 'ipjsc', '–æ–∞–æ', '–∑–∞–æ', '–ø–∏—Ñ', 'international public js com', '–º–∫–ø–∞–æ',
                         '–∞–¥—Ä', 'corporate center', '–≥—Ä—É–ø–ø–∞', '–ø–∞–æ']
    words = name.lower().split()
    cleaned = ' '.join([w for w in words if w not in keywords_to_remove])
    return cleaned


def fetch_moex_tickers():
    url = 'https://iss.moex.com/iss/engines/stock/markets/shares/securities.json'
    response = requests.get(url)
    data = response.json()
    securities = data['securities']['data']
    columns = data['securities']['columns']
    df = pd.DataFrame(securities, columns=columns)
    
    df_filtered = df[['SECID', 'SHORTNAME', 'SECNAME', 'LATNAME']].dropna()
    df_filtered = df_filtered[df_filtered['SECID'].apply(lambda x: len(str(x)) <= 6)]
    
    company_to_ticker = {}
    for _, row in df_filtered.iterrows():
        names = [row['SECNAME'], row['SHORTNAME'], row['LATNAME']]
        for name in names:
            company_to_ticker[name.lower()] = row['SECID']
    return company_to_ticker


def normalize_text(text):
    return ' '.join([morph.parse(word)[0].normal_form for word in text.split()])

def extract_company_names(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    orgs_normalized = [span.normal for span in doc.spans if span.type == 'ORG']
    orgs_original = [span.text for span in doc.spans if span.type == 'ORG']
    
    stopwords = {"—Å–¥", "—Å–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤", "–∞–∫—Ü–∏–æ–Ω–µ—Ä–æ–≤", "–º—Å—Ñ–æ", '—Ü–±',
                "–º–æ—Å–±–∏—Ä–∂–∞", '–±–∞–Ω–∫ —Ä–æ—Å—Å–∏–∏', '—Ä–±–∫', '–º–æ—Å–∫–æ–≤—Å–∫–∞—è –±–∏—Ä–∂–∞', '—É–∫'}
    result = []
    for original, normalized in zip(orgs_original, orgs_normalized):
        norm_text = normalized.lower()
        if norm_text not in stopwords:
            print(f"Original: {original} -> Normalized: {normalized}")
            result.append(norm_text)

    if "–ª–µ–Ω—Ç–∞" in text.lower() and not any("–ª–µ–Ω—Ç–∞" in org for org in result):
        result.append("–ª–µ–Ω—Ç–∞")
        
    if "—Å–∞–º–æ–ª–µ—Ç" in text.lower() and not any("—Å–∞–º–æ–ª–µ—Ç" in org for org in result):
        result.append("—Å–∞–º–æ–ª–µ—Ç")

    return result


def replace_sber_with_sberbank(text):
    text = re.sub(r'\b–°–±–µ—Ä\b(?=\s|$)', '–°–±–µ—Ä–±–∞–Ω–∫', text)
    text = re.sub(r'\b–¢[ -]–ë–∞–Ω–∫\b', '–¢-–¢–µ—Ö–Ω–æ–ª–æ–≥–∏–∏', text, flags=re.IGNORECASE)
    text = re.sub(r'\b–¢[ -]–ë–∞–Ω–∫–∞\b', '–¢-–¢–µ—Ö–Ω–æ–ª–æ–≥–∏–∏', text, flags=re.IGNORECASE)
    text = re.sub(r'["¬´]?\s*–õ–µ–Ω—Ç–∞\s*["¬ª]?', '–õ–µ–Ω—Ç–∞ –ê–û', text, flags=re.IGNORECASE)
    return text

def get_ticker_from_text(text, company_to_ticker):
    text = replace_sber_with_sberbank(text)
    orgs = extract_company_names(text)
    
    for org in orgs:
        org_norm = normalize_text(org)
        if org_norm.lower() in ["vk", "–≤–∫", "–≤–∫–æ–Ω—Ç–∞–∫—Ç–µ", "vk tech"]:
            return ("VK", "VKCO")
        if org_norm.lower() in ["x5", "x5 group", "—Ö5", "–∏–∫—Å5", "–∏–∫—Å 5"]:
            return ("X5", "X5")
        if org_norm.lower() in ["–º–∞–≥–Ω–∏—Ç"]:
            return ('–ü–ê–û "–ú–∞–≥–Ω–∏—Ç"' , "MGNT")
        if org_norm.lower() in ["–º–∞—Ç—å –∏ –¥–∏—Ç—è", "–º–∞—Ç–µ—Ä–∏ –∏ –¥–µ—Ç–∏", "md medical group"]:
            return ('–ü–ê–û "–ú–∞—Ç—å –∏ –¥–∏—Ç—è"', "MDMG")
        if org_norm.lower() in ["c–∞–º–æ–ª–µ—Ç"]:
            return ('–°–∞–º–æ–ª–µ—Ç', "SMLT")

        
        org_cleaned = clean_company_name(org_norm)
        best_match = None
        best_score = 0
        for key, ticker in company_to_ticker.items():
            key_cleaned = clean_company_name(key)
            score = fuzz.partial_ratio(org_cleaned, key_cleaned)
            if score > best_score and score > 85:
                best_match = (key, ticker)
                best_score = score
        if best_match:
            return best_match
    
    stopwords = {"—Å–¥", "—Å–æ–≤–µ—Ç –¥–∏—Ä–µ–∫—Ç–æ—Ä–æ–≤", "–∞–∫—Ü–∏–æ–Ω–µ—Ä–æ–≤", "–º—Å—Ñ–æ", '—Ü–±',
                 '–º–æ—Å–±–∏—Ä–∂–∞', '–±–∞–Ω–∫ —Ä–æ—Å—Å–∏–∏', '—Ä–±–∫', '–º–æ—Å–∫–æ–≤—Å–∫–∞—è –±–∏—Ä–∂–∞'}
    for org in orgs:
        if org.lower() not in stopwords:
            return (org, "N/A")
    
    if "–æ–æ–æ" in text.lower():
        return ("–û–û–û", "N/A")
    elif any(keyword in text.lower() for keyword in ["plc", "ltd", "limited", "inc", "corp", "company"]):
        return ("–ò–Ω–æ—Å—Ç—Ä–∞–Ω–Ω–∞—è –∫–æ–º–ø–∞–Ω–∏—è", "N/A")
    else:
        return (None, None)



if __name__ == "__main__":
    sample_text = """
**"–°–∞–º–æ–ª–µ—Ç" –≤ 2024 –≥–æ–¥—É —É–≤–µ–ª–∏—á–∏–ª EBITDA –Ω–∞ 16%**

–î–µ–≤–µ–ª–æ–ø–µ—Ä "–°–∞–º–æ–ª–µ—Ç" –≤ 2024 –≥–æ–¥—É —É–≤–µ–ª–∏—á–∏–ª EBITDA –ø–æ –ú–°–§–û –Ω–∞ 16%, –¥–æ 83,6 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π.

–°–æ–≥–ª–∞—Å–Ω–æ –æ—Ç—á–µ—Ç–Ω–æ—Å—Ç–∏, —á–∏—Å—Ç–∞—è –ø—Ä–∏–±—ã–ª—å —É–ø–∞–ª–∞ –≤ 3 —Ä–∞–∑–∞, –¥–æ 8,2 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π.

–ù–∞–∫–∞–Ω—É–Ω–µ –∞–Ω–∞–ª–∏—Ç–∏–∫–∏ –æ–∂–∏–¥–∞–ª–∏ EBITDA "–°–∞–º–æ–ª–µ—Ç–∞" –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ 74-78,3 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π, —á–∏—Å—Ç—É—é –ø—Ä–∏–±—ã–ª—å - 5-7,1 –º–ª—Ä–¥ —Ä—É–±–ª–µ–π.

@ifax_go
    """
    company_to_ticker = fetch_moex_tickers()
    company_name, ticker = get_ticker_from_text(sample_text, company_to_ticker)
    if ticker:
        print(f"‚úÖ Extracted Company Name: {company_name}")
        print(f"‚úÖ Extracted Ticker: {ticker}")
    else:
        print("‚ùå No matching company or ticker found.")

‚úÖ Extracted Company Name: –≥–∫ —Å–∞–º–æ–ª–µ—Ç –∞–æ
‚úÖ Extracted Ticker: SMLT


In [125]:
df = df_filtered[['–¢–µ–∫—Å—Ç']].copy()

company_to_ticker = fetch_moex_tickers()

df['Ticker'] = df['–¢–µ–∫—Å—Ç'].apply(lambda x: get_ticker_from_text(x, company_to_ticker)[1])

output_path = r"C:\Users\–ö–∞—Ä–ø–µ–Ω–∫–æ\–í–ö–† –ö–∞—Ä–ø–µ–Ω–∫–æ\data markup\filtered_with_tickers_2.xlsx"
df.to_excel(output_path, index=False)

print(f" {output_path}")

Original: –ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø -> Normalized: –ù–æ–≤–∞–±–µ–≤ –≥—Ä—É–ø–ø
Original: –ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø -> Normalized: –ù–æ–≤–∞–±–µ–≤ –ì—Ä—É–ø–ø
Original: LTM -> Normalized: LTM
Original: LTM -> Normalized: LTM
Original: –°–æ–ª–ª–µ—Ä—Å -> Normalized: –°–æ–ª–ª–µ—Ä—Å
Original: –ü–ê–û "–°–æ–ª–ª–µ—Ä—Å" -> Normalized: –ü–ê–û "–°–æ–ª–ª–µ—Ä—Å"
Original: –°–æ–ª–ª–µ—Ä—Å–∞ -> Normalized: –°–æ–ª–ª–µ—Ä—Å–∞
Original: –°–æ–ª–ª–µ—Ä—Å -> Normalized: –°–æ–ª–ª–µ—Ä—Å
Original: –ù–û–í–ê–¢–≠–ö -> Normalized: –ù–û–í–ê–¢–≠–ö
Original: –ü–ê–û "–ù–û–í–ê–¢–≠–ö" -> Normalized: –ü–ê–û "–ù–û–í–ê–¢–≠–ö"
Original: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É -> Normalized: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É
Original: –ë–∞–Ω–∫–∞ –†–æ—Å—Å–∏–∏ -> Normalized: –ë–∞–Ω–∫–∞ –†–æ—Å—Å–∏–∏
Original: –ú–∏–Ω—Ñ–∏–Ω -> Normalized: –ú–∏–Ω—Ñ–∏–Ω
Original: –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏ -> Normalized: –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏
Original: –ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ —Ñ–∏–Ω–∞–Ω—Å–æ–≤ -> Normalized: –ú–∏–Ω–∏—Å—Ç–µ—Ä—Å—Ç–≤–æ —Ñ–∏–Ω–∞–Ω—Å–æ–≤
Original: –¢—Ä–∞–Ω—Å–Ω–µ—Ñ—Ç–∏ -> Normalize

Original: Taiwan Semiconductor Manufacturing Company -> Normalized: Taiwan Semiconductor Manufacturing Company
Original: CNBC -> Normalized: CNBC
Original: TSMC -> Normalized: TSMC
Original: Taiex -> Normalized: Taiex
Original: Bank of New York Mellon -> Normalized: Bank of New York Mellon
Original: –ú–ö–ê–û "–°–∫–∏–ª–±–æ–∫—Å –•–æ–ª–¥–∏–Ω–≥ -> Normalized: –ú–ö–ê–û "–°–∫–∏–ª–±–æ–∫—Å –•–æ–ª–¥–∏–Ω–≥
Original: –ú–ö–ê–û -> Normalized: –ú–ö–ê–û
Original: –°–∫–∏–ª–±–æ–∫—Å –•–æ–ª–¥–∏–Ω–≥ -> Normalized: –°–∫–∏–ª–±–æ–∫—Å –•–æ–ª–¥–∏–Ω–≥
Original: VK -> Normalized: VK
Original: Mail.ru Group -> Normalized: Mail.ru Group
Original: Skillbox -> Normalized: Skillbox
Original: GeekBrains -> Normalized: GeekBrains
Original: Skillbox Limited -> Normalized: Skillbox Limited
Original: Skillbox -> Normalized: Skillbox
Original: IT-–ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—è -> Normalized: IT-–ø—Ä–µ–¥–ø—Ä–∏–Ω–∏–º–∞—Ç–µ–ª—è
Original: Almaz Capital -> Normalized: Almaz Capital
Original: –ï–ì–†–Æ–õ -> Normalized: –ï–ì–†–Æ–õ

Original: –†—É—Å—Å–Ω–µ—Ñ—Ç—å -> Normalized: –†—É—Å—Å–Ω–µ—Ñ—Ç—å
Original: –ü–ê–û "–†—É—Å—Å–Ω–µ—Ñ—Ç—å" -> Normalized: –ü–ê–û "–†—É—Å—Å–Ω–µ—Ñ—Ç—å"
Original: –ù–ö -> Normalized: –ù–ö
Original: –§–æ—Å–ê–≥—Ä–æ -> Normalized: –§–æ—Å–ê–≥—Ä–æ
Original: –§–æ—Å–ê–≥—Ä–æ -> Normalized: –§–æ—Å–ê–≥—Ä–æ
Original: –§–æ—Å–ê–≥—Ä–æ -> Normalized: –§–æ—Å–ê–≥—Ä–æ
Original: –ß–µ—Ä–∫–∏–∑–æ–≤–æ -> Normalized: –ß–µ—Ä–∫–∏–∑–æ–≤–æ
Original: –ß–µ—Ä–∫–∏–∑–æ–≤–æ -> Normalized: –ß–µ—Ä–∫–∏–∑–æ–≤–æ
Original: –ß–µ—Ä–∫–∏–∑–æ–≤–æ -> Normalized: –ß–µ—Ä–∫–∏–∑–æ–≤–æ
Original: –ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫ -> Normalized: –ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫
Original: –ü–ê–û "–ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫" -> Normalized: –ü–ê–û "–ü—Ä–æ–º—Å–≤—è–∑—å–±–∞–Ω–∫"
Original: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É -> Normalized: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É
Original: –ü–°–ë -> Normalized: –ü–°–ë
Original: –ê–ö–†–ê -> Normalized: –ê–ö–†–ê
Original: –ù–û–í–ê–¢–≠–ö–∞ -> Normalized: –ù–û–í–ê–¢–≠–ö–∞
Original: –ù–û–í–ê–¢–≠–ö–∞ -> Normalized: –ù–û–í–ê–¢–≠–ö–∞
Original: –ù–û–í–ê–¢–≠–ö

Original: –ü–ê–û "–î–∂–µ—Ç–õ–µ–Ω–¥ –•–æ–ª–¥–∏–Ω–≥ -> Normalized: –ü–ê–û "–î–∂–µ—Ç–õ–µ–Ω–¥ –•–æ–ª–¥–∏–Ω–≥
Original: –°–ü–ë –±–∏—Ä–∂–µ -> Normalized: –°–ü–ë –±–∏—Ä–∂–µ
Original: –ß–µ—Ä–∫–∏–∑–æ–≤–æ -> Normalized: –ß–µ—Ä–∫–∏–∑–æ–≤–æ
Original: –ü–ê–û "–ì—Ä—É–ø–ø–∞ –ß–µ—Ä–∫–∏–∑–æ–≤–æ -> Normalized: –ü–ê–û "–ì—Ä—É–ø–ø–∞ –ß–µ—Ä–∫–∏–∑–æ–≤–æ
Original: –ë–ö "–†–µ–≥–∏–æ–Ω" -> Normalized: –ë–ö "–†–µ–≥–∏–æ–Ω"
Original: –†–æ—Å—Å–µ–ª—å—Ö–æ–∑–±–∞–Ω–∫ -> Normalized: –†–æ—Å—Å–µ–ª—å—Ö–æ–∑–±–∞–Ω–∫
Original: –ß–µ—Ä–∫–∏–∑–æ–≤–æ -> Normalized: –ß–µ—Ä–∫–∏–∑–æ–≤–æ
Original: –ö–° -> Normalized: –ö–°
Original: –ì–ö "–û'–ö–µ–π -> Normalized: –ì–ö "–û'–ö–µ–π
Original: –û'–ö–µ–π -> Normalized: –û'–ö–µ–π
Original: –û'–∫–µ–π –ì—Ä—É–ø -> Normalized: –û'–∫–µ–π –ì—Ä—É–ø
Original: –ú–ö–ü–ê–û -> Normalized: –ú–ö–ü–ê–û
Original: –ü–ê–û "–≠–ª–µ–º–µ–Ω—Ç" -> Normalized: –ü–ê–û "–≠–ª–µ–º–µ–Ω—Ç"
Original: –ü–ê–û "–≠–ª–µ–º–µ–Ω—Ç" -> Normalized: –ü–ê–û "–≠–ª–µ–º–µ–Ω—Ç"
Original: –≠–ª–µ–º–µ–Ω—Ç–∞ -> Normalized: –≠–ª–µ–º–µ–Ω—Ç–∞
Ori

Original: –î–∏–∞—Å–æ—Ñ—Ç -> Normalized: –î–∏–∞—Å–æ—Ñ—Ç
Original: –ü–ê–û "–î–∏–∞—Å–æ—Ñ—Ç" -> Normalized: –ü–ê–û "–î–∏–∞—Å–æ—Ñ—Ç"
Original: –Æ–Ω–∏–ø—Ä–æ -> Normalized: –Æ–Ω–∏–ø—Ä–æ
Original: –ü–ê–û "–Æ–Ω–∏–ø—Ä–æ" -> Normalized: –ü–ê–û "–Æ–Ω–∏–ø—Ä–æ"
Original: –î–∏–∞—Å–æ—Ñ—Ç -> Normalized: –î–∏–∞—Å–æ—Ñ—Ç
Original: –ü–ê–û "–î–∏–∞—Å–æ—Ñ—Ç" -> Normalized: –ü–ê–û "–î–∏–∞—Å–æ—Ñ—Ç"
Original: –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥ -> Normalized: –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥
Original: –ë–æ—Ä–µ—Ü -> Normalized: –ë–æ—Ä–µ—Ü
Original: –ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω—ã–π —Ä–∞—Å—á–µ—Ç–Ω—ã–π –¥–µ–ø–æ–∑–∏—Ç–∞—Ä–∏–π (–ù–†–î) -> Normalized: –ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω—ã–π —Ä–∞—Å—á–µ—Ç–Ω—ã–π –¥–µ–ø–æ–∑–∏—Ç–∞—Ä–∏–π (–ù–†–î)
Original: –ù–†–î -> Normalized: –ù–†–î
Original: –ì–µ–Ω–ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä—ã -> Normalized: –ì–µ–Ω–ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä—ã
Original: HeadHunter -> Normalized: HeadHunter
Original: CEO -> Normalized: CEO
Original: HeadHunter (HH) -> Normalized: HeadHunter (HH)
Original: HH -> Normalized: HH
Original: –ú–¢–° -> Norma

Original: –ê–±—Ä–∞—É-–î—é—Ä—Å–æ -> Normalized: –ê–±—Ä–∞—É-–î—é—Ä—Å–æ
Original: –ì–ö "–ê–±—Ä–∞—É-–î—é—Ä—Å–æ" -> Normalized: –ì–ö "–ê–±—Ä–∞—É-–î—é—Ä—Å–æ"
Original: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–∞ -> Normalized: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å–∞
Original: –°–æ—Ñ—Ç–ª–∞–π–Ω–∞ -> Normalized: –°–æ—Ñ—Ç–ª–∞–π–Ω–∞
Original: –ü–ê–û "–°–æ—Ñ—Ç–ª–∞–π–Ω" -> Normalized: –ü–ê–û "–°–æ—Ñ—Ç–ª–∞–π–Ω"
Original: –ü–ê–û "–°–æ—Ñ—Ç–ª–∞–π–Ω" -> Normalized: –ü–ê–û "–°–æ—Ñ—Ç–ª–∞–π–Ω"
Original: –°–æ–≤–∫–æ–º–±–∞–Ω–∫ -> Normalized: –°–æ–≤–∫–æ–º–±–∞–Ω–∫
Original: –°–æ–≤–∫–æ–º–±–∞–Ω–∫ -> Normalized: –°–æ–≤–∫–æ–º–±–∞–Ω–∫
Original: –°–æ–≤–∫–æ–º–±–∞–Ω–∫ -> Normalized: –°–æ–≤–∫–æ–º–±–∞–Ω–∫
Original: –ù–∞–±—Å–æ–≤–µ—Ç -> Normalized: –ù–∞–±—Å–æ–≤–µ—Ç
Original: VK -> Normalized: VK
Original: –ú–ö–ü–ê–û "–í–ö" -> Normalized: –ú–ö–ü–ê–û "–í–ö"
Original: VK -> Normalized: VK
Original: VK -> Normalized: VK
Original: –†—É—Å–∞–ª -> Normalized: –†—É—Å–∞–ª
Original: –†—É—Å–∞–ª -> Normalized: –†—É—Å–∞–ª
Original: –ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫ -> Normalized: –ì–∞–∑–ø

Original: –ü—Ä–æ–º–æ–º–µ–¥–∞ -> Normalized: –ü—Ä–æ–º–æ–º–µ–¥–∞
Original: –ü—Ä–æ–º–æ–º–µ–¥ -> Normalized: –ü—Ä–æ–º–æ–º–µ–¥
Original: –ü–ê–û "–ü—Ä–æ–º–æ–º–µ–¥" -> Normalized: –ü–ê–û "–ü—Ä–æ–º–æ–º–µ–¥"
Original: –ü—Ä–æ–º–æ–º–µ–¥–∞ -> Normalized: –ü—Ä–æ–º–æ–º–µ–¥–∞
Original: –ì–ö "–°–∞–º–æ–ª–µ—Ç" -> Normalized: –ì–ö "–°–∞–º–æ–ª–µ—Ç"
Original: –ì–ö "–°–∞–º–æ–ª–µ—Ç" -> Normalized: –ì–ö "–°–∞–º–æ–ª–µ—Ç"
Original: –°–æ–≤–∫–æ–º–±–∞–Ω–∫ -> Normalized: –°–æ–≤–∫–æ–º–±–∞–Ω–∫
Original: –ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫ -> Normalized: –ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫
Original: –í–¢–ë –ö–∞–ø–∏—Ç–∞–ª —Ç—Ä–µ–π–¥–∏–Ω–≥ -> Normalized: –í–¢–ë –ö–∞–ø–∏—Ç–∞–ª —Ç—Ä–µ–π–¥–∏–Ω–≥
Original: –ê–ö–†–ê -> Normalized: –ê–ö–†–ê
Original: –°–±–µ—Ä–±–∞–Ω–∫–∞ -> Normalized: –°–±–µ—Ä–±–∞–Ω–∫–∞
Original: –°–±–µ—Ä–±–∞–Ω–∫–∞ -> Normalized: –°–±–µ—Ä–±–∞–Ω–∫–∞
Original: –ù–∞–±—Å–æ–≤–µ—Ç -> Normalized: –ù–∞–±—Å–æ–≤–µ—Ç
Original: –°–±–µ—Ä–±–∞–Ω–∫–∞ -> Normalized: –°–±–µ—Ä–±–∞–Ω–∫–∞
Original: –°–µ–≤–µ—Ä—Å—Ç–∞–ª—å -> Normalized: –°–µ–≤–µ—Ä—Å—Ç–∞–ª—å
O

Original: –°–±–µ—Ä–±–∞–Ω–∫ -> Normalized: –°–±–µ—Ä–±–∞–Ω–∫
Original: –°–±–µ—Ä–±–∞–Ω–∫ -> Normalized: –°–±–µ—Ä–±–∞–Ω–∫
Original: –°–µ–ª–∏–≥–¥–∞—Ä -> Normalized: –°–µ–ª–∏–≥–¥–∞—Ä
Original: –ê–õ–†–û–°–ê -> Normalized: –ê–õ–†–û–°–ê
Original: –Ø–∫—É—Ç–∏—è 24 -> Normalized: –Ø–∫—É—Ç–∏—è 24
Original: –ê–õ–†–û–°–ê -> Normalized: –ê–õ–†–û–°–ê
Original: –°–µ–ª–∏–≥–¥–∞—Ä -> Normalized: –°–µ–ª–∏–≥–¥–∞—Ä
Original: –ü–ê–û "–°–µ–ª–∏–≥–¥–∞—Ä" -> Normalized: –ü–ê–û "–°–µ–ª–∏–≥–¥–∞—Ä"
Original: –ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫ -> Normalized: –ì–∞–∑–ø—Ä–æ–º–±–∞–Ω–∫
Original: –í–¢–ë –ö–∞–ø–∏—Ç–∞–ª —Ç—Ä–µ–π–¥–∏–Ω–≥ -> Normalized: –í–¢–ë –ö–∞–ø–∏—Ç–∞–ª —Ç—Ä–µ–π–¥–∏–Ω–≥
Original: –°–æ–≤–∫–æ–º–±–∞–Ω–∫ -> Normalized: –°–æ–≤–∫–æ–º–±–∞–Ω–∫
Original: –Ø–Ω–¥–µ–∫—Å -> Normalized: –Ø–Ω–¥–µ–∫—Å
Original: –Ø–Ω–¥–µ–∫—Å -> Normalized: –Ø–Ω–¥–µ–∫—Å
Original: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É -> Normalized: –ò–Ω—Ç–µ—Ä—Ñ–∞–∫—Å—É
Original: –ë–∞–Ω–∫–∞ –†–æ—Å—Å–∏–∏ -> Normalized: –ë–∞–Ω–∫–∞ –†–æ—Å—Å–∏–∏
Original: –ê–û "–°—Ç–∞—Ä—Ç –∫–∞–ø–∏—Ç–∞–ª -> 

## Manual verification of results

In [176]:
df_filtered = pd.read_excel(r"C:\Users\–ö–∞—Ä–ø–µ–Ω–∫–æ\–í–ö–† –ö–∞—Ä–ø–µ–Ω–∫–æ\data markup\filtered_with_tickers_2.xlsx")


total = len(df_filtered)
correct = df_filtered['Correct'].sum()
accuracy = correct / total

print(f"üìä Accuracy (—Ç–æ—á–Ω–æ—Å—Ç—å): {accuracy:.4f} ({correct}/{total})")

üìä Accuracy (—Ç–æ—á–Ω–æ—Å—Ç—å): 0.9208 (349/379)
