In [1]:
import os

import nltk
import spacy
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# NLTK data packages; uncomment and run once if they are not installed yet.
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('omw-1.4')

# French spaCy pipeline called by lemmatize_sentence_fr. It must be loaded
# here so the notebook works after "Restart Kernel -> Run All"; leaving this
# line commented out makes every French file fail with a NameError.
# The model has to be installed first:
#     python -m spacy download fr_core_news_sm
nlp_fr = spacy.load('fr_core_news_sm')

# English lemmatizer used by lemmatize_sentence_en.
lemmatizer_en = WordNetLemmatizer()
# NOTE(review): lemmatizer_fr is not referenced by the functions visible in
# this notebook (French goes through nlp_fr); kept in case other code uses it.
lemmatizer_fr = FrenchLefffLemmatizer()


In [2]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter only:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        treebank_tag: POS tag string (for example ``'NNS'`` or ``'VBD'``).

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of the four letters above.
    """
    # Attribute names are resolved lazily so `wordnet` is only touched when a
    # prefix actually matches, exactly like a chain of if/elif branches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

def lemmatize_sentence_en(sentence):
    """Return `sentence` with every token replaced by its English lemma.

    The sentence is tokenized with ``nltk.word_tokenize``, tagged with
    ``pos_tag``, and each token is lemmatized with the module-level
    ``lemmatizer_en``. The POS hint comes from ``get_wordnet_pos``.

    Args:
        sentence: Text to lemmatize.

    Returns:
        The lemmas joined by single spaces.
    """
    def _lemma_of(token, treebank_tag):
        pos = get_wordnet_pos(treebank_tag)
        if not pos:
            # No WordNet equivalent for this tag: let the lemmatizer use its
            # own default part of speech.
            return lemmatizer_en.lemmatize(token)
        return lemmatizer_en.lemmatize(token, pos=pos)

    tagged_tokens = pos_tag(nltk.word_tokenize(sentence))
    return ' '.join(_lemma_of(token, tag) for token, tag in tagged_tokens)

def lemmatize_sentence_fr(sentence):
    """Return `sentence` with every token replaced by its ``lemma_``.

    Relies on a module-level ``nlp_fr`` callable (presumably the spaCy French
    pipeline) being defined before this function is called.

    Args:
        sentence: Text to lemmatize.

    Returns:
        The ``lemma_`` of each token produced by ``nlp_fr``, joined by single
        spaces.
    """
    lemmas = (token.lemma_ for token in nlp_fr(sentence))
    return ' '.join(lemmas)


In [3]:
def lemmatize_file(file_path, language):
    """Lemmatize a UTF-8 text file line by line and write the result beside it.

    The output path inserts ``.lemma`` before the final extension
    (``corpus.clean.en`` -> ``corpus.clean.lemma.en``). A path without an
    extension simply gets ``.lemma`` appended.

    Args:
        file_path: Path of the input file.
        language: ``'english'`` (uses ``lemmatize_sentence_en``) or
            ``'french'`` (uses ``lemmatize_sentence_fr``).

    Returns:
        None. Success and failure are both reported with ``print``. Errors
        raised while reading, lemmatizing or writing are caught and reported
        rather than propagated, so a batch loop keeps going after one bad
        file.
    """
    # splitext only inspects the last path component, so a dot in a directory
    # name (e.g. "../data/file") is never mistaken for the extension, and a
    # path with no dot at all no longer raises IndexError.
    base_name, extension = os.path.splitext(file_path)
    output_path = f"{base_name}.lemma{extension}"

    # Reject unknown languages before touching the filesystem; otherwise an
    # empty output file is created and the failure surfaces as an opaque
    # "referenced before assignment" error.
    if language not in ('english', 'french'):
        print(f"Failed to process {file_path}: unsupported language '{language}'")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file, \
             open(output_path, 'w', encoding='utf-8') as outfile:
            for line in file:
                clean_line = line.strip()
                # NOTE(review): blank lines are dropped, so the output can have
                # fewer lines than the input. Confirm this is acceptable if the
                # files must stay line-aligned with a parallel file.
                if not clean_line:
                    continue
                if language == 'english':
                    lemmatized_line = lemmatize_sentence_en(clean_line)
                else:
                    lemmatized_line = lemmatize_sentence_fr(clean_line)
                outfile.write(lemmatized_line + '\n')
        print(f"Lemmatized file saved as: {output_path}")
    except Exception as e:
        # Deliberate best-effort: report the problem and return normally.
        print(f"Failed to process {file_path}: {e}")

In [4]:
# Every corpus file to lemmatize, paired with the language that selects the
# lemmatizer. Each entry becomes a {'path': ..., 'language': ...} dict.
files_to_process = [
    {'path': path, 'language': language}
    for path, language in (
        ('../data/clean/Emea/Emea_test_500.tok.true.clean.en', 'english'),
        ('../data/clean/Emea/Emea_test_500.tok.true.clean.fr', 'french'),
        ('../data/clean/Emea/Emea_train_10k.tok.true.clean.en', 'english'),
        ('../data/clean/Emea/Emea_train_10k.tok.true.clean.fr', 'french'),
        ('../data/clean/Europarl/Europarl_dev_3750.tok.true.clean.en', 'english'),
        ('../data/clean/Europarl/Europarl_dev_3750.tok.true.clean.fr', 'french'),
        ('../data/clean/Europarl/Europarl_test_500.tok.true.clean.en', 'english'),
        ('../data/clean/Europarl/Europarl_test_500.tok.true.clean.fr', 'french'),
        ('../data/clean/Europarl/Europarl_train_100k.tok.true.clean.en', 'english'),
        ('../data/clean/Europarl/Europarl_train_100k.tok.true.clean.fr', 'french'),
    )
]

# Process the files in the order listed; lemmatize_file prints one status
# line per file.
for file_info in files_to_process:
    lemmatize_file(file_info['path'], file_info['language'])


Lemmatized file saved as: ../data/clean/Emea/Emea_test_500.tok.true.clean.lemma.en
Lemmatized file saved as: ../data/clean/Emea/Emea_test_500.tok.true.clean.lemma.fr
Lemmatized file saved as: ../data/clean/Emea/Emea_train_10k.tok.true.clean.lemma.en
Lemmatized file saved as: ../data/clean/Emea/Emea_train_10k.tok.true.clean.lemma.fr
Lemmatized file saved as: ../data/clean/Europarl/Europarl_dev_3750.tok.true.clean.lemma.en
Lemmatized file saved as: ../data/clean/Europarl/Europarl_dev_3750.tok.true.clean.lemma.fr
Lemmatized file saved as: ../data/clean/Europarl/Europarl_test_500.tok.true.clean.lemma.en
Lemmatized file saved as: ../data/clean/Europarl/Europarl_test_500.tok.true.clean.lemma.fr
Lemmatized file saved as: ../data/clean/Europarl/Europarl_train_100k.tok.true.clean.lemma.en
Lemmatized file saved as: ../data/clean/Europarl/Europarl_train_100k.tok.true.clean.lemma.fr
