In [31]:
import pandas as pd
import re
import numpy as np
import pymorphy2
import nltk
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
 
# Russian resources: pymorphy2 analyzer (lemma + POS) and the NLTK stop-word list.
lemmatizer = pymorphy2.MorphAnalyzer()
rus_stop_words = stopwords.words('russian')

# English resources: WordNet lemmatizer and the NLTK stop-word list.
eng_lemmatizer = WordNetLemmatizer()
eng_stop_words = stopwords.words('english')

In [20]:
# Tags: lookup table from NLTK POS tags to pymorphy2-style POS labels.
# Tags that are absent from the table have no counterpart here.

nltk_to_pymorphy_mapping = {
    # Nouns: singular/mass, plural, proper singular, proper plural
    **dict.fromkeys(('NN', 'NNS', 'NNP', 'NNPS'), 'NOUN'),
    # Verbs: base form, past tense, gerund/present participle, past participle,
    # non-3rd person singular present, 3rd person singular present
    **dict.fromkeys(('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'VERB'),
    # Adjectives: plain, comparative, superlative
    **dict.fromkeys(('JJ', 'JJR', 'JJS'), 'ADJF'),
    # Adverbs: plain, comparative, superlative
    **dict.fromkeys(('RB', 'RBR', 'RBS'), 'ADVB'),
    # Pronouns: personal, possessive
    **dict.fromkeys(('PRP', 'PRP$'), 'NPRO'),
    'DT': 'ADJF',  # Determiner
    'IN': 'PREP',  # Preposition
    'CC': 'CONJ',  # Coordinating conjunction
    'UH': 'INTJ',  # Interjection
    'CD': 'NUMR',  # Cardinal number
    # Symbols and other unclassified tags: dollar sign, pound sign, foreign word,
    # symbol, list item marker, possessive ending, particle
    **dict.fromkeys(('$', '#', 'FW', 'SYM', 'LS', 'POS', 'RP'), 'UNKN'),
    'TO': 'INFN',  # "to" (infinitive marker)
    'EX': 'UNKN',  # Existential "there"
    # Wh-pronouns: plain, possessive
    **dict.fromkeys(('WP', 'WP$'), 'NPRO'),
}

def map_nltk_to_pymorphy(nltk_pos_tag):
    """Translate an NLTK POS tag into its pymorphy2-style label.

    Returns the label stored in ``nltk_to_pymorphy_mapping``, or ``None``
    when the tag has no entry in that table.
    """
    try:
        return nltk_to_pymorphy_mapping[nltk_pos_tag]
    except KeyError:
        return None

In [88]:
# Lemmatization and punctuation removal (Russian)

def lemmatize_text(text):
    """Lemmatize Russian text into space-separated ``lemma_POS`` tokens.

    The text is lower-cased, everything except Cyrillic letters is treated as
    a separator, stop words are dropped, and each remaining word is replaced
    by the normal form and POS of its first pymorphy2 parse. Words whose
    parse carries no POS are skipped.

    Args:
        text: Source string, or ``None``.

    Returns:
        The joined ``lemma_POS`` tokens (possibly an empty string), or the
        literal string ``'None'`` when ``text`` is ``None``.
    """
    if text is None:
        return 'None'

    # "ё" lies outside the а-я range, so it has to be listed explicitly;
    # otherwise words containing it would be cut in two.
    text = re.sub('[^а-яё]+', ' ', text.lower())

    word_pos_pairs = []
    for word in text.split():
        if word in rus_stop_words:
            continue
        # Parse once and reuse the result for both the lemma and the POS.
        best_parse = lemmatizer.parse(word)[0]
        pos = best_parse.tag.POS
        if pos is not None:
            word_pos_pairs.append(best_parse.normal_form + '_' + pos)

    return ' '.join(word_pos_pairs)

In [112]:
# Lemmatization and punctuation removal (English)

def _wordnet_pos(nltk_pos_tag):
    """Return the WordNet POS letter for an NLTK POS tag (noun by default)."""
    if nltk_pos_tag.startswith('VB'):
        return 'v'
    if nltk_pos_tag.startswith('JJ'):
        return 'a'
    if nltk_pos_tag.startswith('RB'):
        return 'r'
    return 'n'


def eng_lemmatize_text(text):
    """Lemmatize English text into space-separated ``lemma_POS`` tokens.

    The text is lower-cased, stripped of punctuation, tokenized and POS-tagged.
    Stop words are dropped and every remaining token is lemmatized using its
    own part of speech, then labelled via ``map_nltk_to_pymorphy``.

    Args:
        text: Source string, or ``None``.

    Returns:
        The joined ``lemma_POS`` tokens (possibly an empty string), or the
        literal string ``'None'`` when ``text`` is ``None``.
    """
    if text is None:
        return 'None'

    text = re.sub(r'[^\w\s]', '', text.lower())

    lemma_pos_pairs = []
    # Tag the surface forms first: tagging already-lemmatized tokens is less
    # reliable, and noun-only lemmatization yields artifacts such as
    # "was" -> "wa" that then need manual filtering.
    for token, tag in nltk.pos_tag(word_tokenize(text)):
        if token in eng_stop_words:
            continue
        lemma = eng_lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
        # A token may reduce to a stop word only after lemmatization.
        if lemma in eng_stop_words:
            continue
        # Tags missing from the mapping get 'UNKN' instead of being rendered
        # as the literal suffix "None".
        pos = map_nltk_to_pymorphy(tag) or 'UNKN'
        lemma_pos_pairs.append('{}_{}'.format(lemma, pos))

    return ' '.join(lemma_pos_pairs)

In [119]:
# Split the parallel file into the fast_align format:
# "<english lemmas> ||| <russian lemmas>", one sentence pair per line.

with open('/Users/karpovapolina/xml_to_txt/output-eng-rus_2.txt', 'r', encoding='utf-8') as input_file, \
        open('output-eng-rus.txt', 'w', encoding='utf-8') as output_file:
    for line in input_file:
        parts = line.strip().split('|||')
        # Only lines with exactly one separator form a sentence pair; lines
        # with none are skipped, and lines with several would otherwise fail
        # on unpacking.
        if len(parts) != 2:
            continue
        english_text, russian_text = parts
        english_lemmatized = eng_lemmatize_text(english_text.strip())
        russian_lemmatized = lemmatize_text(russian_text.strip())
        output_file.write(f"{english_lemmatized} ||| {russian_lemmatized}\n")

In [117]:
def remove_lines_with_pattern(input_file, output_file, pattern, encoding='utf-8'):
    """Copy a text file, leaving out every line that contains ``pattern``.

    Args:
        input_file: Path of the file to read.
        output_file: Path of the file to write (overwritten if it exists).
        pattern: Plain substring; lines containing it are dropped.
        encoding: Text encoding used for both files. Set explicitly so the
            result does not depend on the platform's locale.
    """
    with open(input_file, 'r', encoding=encoding) as infile, \
            open(output_file, 'w', encoding=encoding) as outfile:
        outfile.writelines(line for line in infile if pattern not in line)

# Drop every line containing the '###' marker from the downloaded corpus.
pattern = '###'
output_file = 'output-eng-rus_2.txt'
input_file = '/Users/karpovapolina/Downloads/output-eng-rus.txt'

remove_lines_with_pattern(
    input_file=input_file,
    output_file=output_file,
    pattern=pattern,
)

In [87]:
# Load the word-pair table as a list of raw lines.
# The encoding is explicit because the file holds Cyrillic text and the
# default would otherwise depend on the platform's locale.
with open('/Users/karpovapolina/Downloads/output-rus-eng-lem-words.txt', 'r', encoding='utf-8') as words_file:
    file1 = words_file.readlines()

In [88]:
# Remove additional punctuation marks

import string
additional_punctuation = "@#$%^&*()_-+=[]{};:'\"\\|<>,./''’”“‘"
all_punctuation = set(string.punctuation + additional_punctuation)
# The straight apostrophe is kept so that forms like "don't" stay intact.
all_punctuation_except_apostrophe = all_punctuation - {"'"}

def remove_punctuation(input_str):
    """Strip punctuation from the two word columns of a tab-separated row.

    Args:
        input_str: A line with at least four tab-separated fields; fields 0
            and 1 are passed through untouched, fields 2 and 3 are the words
            to clean. A trailing newline is optional.

    Returns:
        The first four fields re-joined with tabs and terminated by a
        newline, with every character of ``all_punctuation_except_apostrophe``
        removed from fields 2 and 3.
    """
    # Strip the line terminator explicitly instead of slicing off the last
    # character, which would eat a letter when the final line has no newline.
    fields = input_str.rstrip('\n').split('\t')
    word_rus = ''.join(char for char in fields[2] if char not in all_punctuation_except_apostrophe)
    word_eng = ''.join(char for char in fields[3] if char not in all_punctuation_except_apostrophe)
    return fields[0] + '\t' + fields[1] + '\t' + word_rus + '\t' + word_eng + '\n'

In [90]:
# Clean the punctuation out of every loaded row.
new_file = [remove_punctuation(raw_line) for raw_line in file1]

In [91]:
# Drop word pairs that occur fewer than three times, and pairs where either
# word is empty or a single space.

final_file = []
for row in new_file:
    columns = row.split('\t')
    if int(columns[0]) < 3:
        continue
    if columns[2] in ('', ' '):
        continue
    # The last column still carries the line terminator; ignore it when testing.
    if columns[3][:-1] in ('', ' '):
        continue
    final_file.append('\t'.join(columns))

In [92]:
# Write the filtered rows; each row already ends with a newline.
# The encoding is explicit so the file is readable as UTF-8 regardless of
# the platform's locale.
with open('output_rus_eng_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(final_file)

In [94]:
# Load the cleaned tab-separated table.
# keep_default_na=False stops pandas from turning ordinary words such as
# "nan", "null" or "None" into missing values.
df = pd.read_csv(
    'output_rus_eng_cleaned.txt',
    sep='\t',
    header=None,
    names=['count', 'probability', 'russian', 'english'],
    keep_default_na=False,
)

In [95]:
# Save the table as CSV with pandas' default settings.
df.to_csv(path_or_buf='output_rus_eng.csv')

In [None]:
# Lemmatize both word columns and keep each resulting row once.
# NOTE(review): elsewhere in this notebook (remove_punctuation, the DataFrame
# column names) column 2 is treated as Russian and column 3 as English, while
# this cell feeds column 2 to the English lemmatizer — confirm which file
# `file1` is expected to hold here.
final_file = []
seen_rows = set()  # constant-time duplicate check instead of scanning the list
for row in file1:
    columns = row.rstrip('\n').split('\t')
    eng_word = eng_lemmatize_text(columns[2])
    rus_word = lemmatize_text(columns[3])
    # The lemmatizers return an empty string (never a single space) when
    # nothing is left, so test for blank output rather than for ' '.
    if not eng_word.strip() or not rus_word.strip():
        continue
    new_i = '\t'.join([columns[0], columns[1], eng_word, rus_word])
    if new_i not in seen_rows:
        seen_rows.add(new_i)
        final_file.append(new_i)

In [None]:
# Remove half-empty lines: lines where one side of the "|||" separator is missing.

with open('output-rus-eng-final.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()

c = 0  # number of lines dropped
with open('output-rus-eng-final-lem.txt', 'w', encoding='utf-8') as new_file:
    for line in text:
        tokens = line.split()
        # A blank line has no tokens at all; treat it as dropped instead of
        # failing on tokens[0].
        if not tokens or tokens[0] == '|||' or tokens[-1] == '|||':
            c += 1
        else:
            new_file.write(line)