In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance applied to the English (.en) files in process_folder.
lemmatizer_en = WordNetLemmatizer()

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
# Lemmatizer instance applied to the French (.fr) files in process_folder.
lemmatizer_fr = FrenchLefffLemmatizer()


In [9]:
# Fetch the NLTK data packages 'wordnet' and 'averaged_perceptron_tagger'.
# NOTE(review): process_folder also calls word_tokenize, which presumably needs
# the 'punkt' tokenizer data as well -- confirm it is installed on a fresh machine.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [4]:
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS code.

    The returned values are the single-character codes that NLTK's
    ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV``
    constants stand for. They are written as literals so the function does
    not depend on ``wordnet`` having been imported by another notebook cell
    before it is called.

    Args:
        treebank_tag: Tag string such as ``'JJ'``, ``'VBD'``, ``'NNS'`` or ``'RB'``.

    Returns:
        ``'a'``, ``'v'``, ``'n'`` or ``'r'`` when the tag starts with ``J``,
        ``V``, ``N`` or ``R`` respectively; ``None`` for any other tag.
    """
    prefix_to_pos = (
        ('J', 'a'),  # same value as wordnet.ADJ
        ('V', 'v'),  # same value as wordnet.VERB
        ('N', 'n'),  # same value as wordnet.NOUN
        ('R', 'r'),  # same value as wordnet.ADV
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return None
    
def get_char_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into a one-letter POS code.

    Args:
        treebank_tag: Tag string such as ``'JJ'``, ``'VBD'``, ``'NNS'`` or ``'RB'``.

    Returns:
        ``'a'``, ``'v'``, ``'n'`` or ``'r'`` when the tag starts with ``J``,
        ``V``, ``N`` or ``R`` respectively; ``None`` for any other tag.
    """
    prefix_codes = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    matching_codes = (
        code for prefix, code in prefix_codes if treebank_tag.startswith(prefix)
    )
    # First matching prefix wins; no match yields None.
    return next(matching_codes, None)

In [6]:
import os
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet

def _lemmatize_tagged_words(tagged_words, lemmatizer, to_lemmatizer_pos):
    """Lemmatize ``(word, tag)`` pairs with the given lemmatizer.

    Args:
        tagged_words: Iterable of ``(word, tag)`` pairs as returned by ``pos_tag``.
        lemmatizer: Object exposing ``lemmatize(word, pos)``.
        to_lemmatizer_pos: Callable mapping a tag to the POS code passed to
            the lemmatizer, or to a falsy value when the tag has no mapping.

    Returns:
        List of strings, one per input pair. A word whose tag has no mapping
        is kept unchanged.
    """
    lemmas = []
    for word, tag in tagged_words:
        pos = to_lemmatizer_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos) if pos else word
        # NOTE(review): defensive fallback -- if a lemmatizer hands back a
        # non-string or empty result for a word it does not know, keep the
        # original word so that ' '.join() below cannot fail. Confirm whether
        # FrenchLefffLemmatizer can actually return such values.
        if not isinstance(lemma, str) or not lemma:
            lemma = word
        lemmas.append(lemma)
    return lemmas


def process_folder(folder_path):
    """Lemmatize every ``.en`` / ``.fr`` file directly inside ``folder_path``.

    Each file is read as UTF-8, lower-cased, tokenized, POS-tagged and
    lemmatized (``lemmatizer_en`` for ``.en`` files, ``lemmatizer_fr`` for
    ``.fr`` files). The lemmas are joined with single spaces and written to
    ``<folder_path>/lemma/<stem>.lemma<ext>``. The whole file is processed as
    one token sequence, so the original line breaks are not preserved.

    Args:
        folder_path: Directory containing the files to process. The ``lemma``
            output directory is created inside it if missing.

    Returns:
        None. Results are written to disk.
    """
    lemma_folder = os.path.join(folder_path, 'lemma')
    os.makedirs(lemma_folder, exist_ok=True)

    for file_name in os.listdir(folder_path):
        is_english = file_name.endswith('.en')
        is_french = file_name.endswith('.fr')
        if not (is_english or is_french):
            continue

        input_path = os.path.join(folder_path, file_name)
        # A directory whose name happens to end in .en/.fr must not be opened.
        if not os.path.isfile(input_path):
            continue

        with open(input_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()

        # The language is decided once per file rather than once per token.
        if is_english:
            words = word_tokenize(text)
            lemmatizer, to_lemmatizer_pos = lemmatizer_en, get_wordnet_pos
        else:
            # Split French text with the French sentence model instead of
            # the default English one.
            words = word_tokenize(text, language='french')
            lemmatizer, to_lemmatizer_pos = lemmatizer_fr, get_char_pos

        # NOTE(review): pos_tag is called without a language argument, so its
        # default tagger is applied to the French files too -- the tags (and
        # therefore the lemmas) for .fr files are presumably unreliable.
        tagged_words = pos_tag(words)
        lemmatized_words = _lemmatize_tagged_words(
            tagged_words, lemmatizer, to_lemmatizer_pos
        )

        stem, extension = os.path.splitext(file_name)
        output_file = os.path.join(lemma_folder, f"{stem}.lemma{extension}")
        with open(output_file, 'w', encoding='utf-8') as out_file:
            out_file.write(' '.join(lemmatized_words))

In [7]:
def process_subfolders(root_folder):
    """Run ``process_folder`` on each immediate subdirectory of ``root_folder``.

    Entries of ``root_folder`` that are not directories are ignored; nested
    levels below the immediate subdirectories are not visited here.

    Args:
        root_folder: Directory whose direct subdirectories are processed.

    Returns:
        None.
    """
    entry_paths = (
        os.path.join(root_folder, entry) for entry in os.listdir(root_folder)
    )
    for directory in filter(os.path.isdir, entry_paths):
        process_folder(directory)

In [10]:
# Lemmatize the .en/.fr files of every immediate subfolder of ../data/clean;
# each subfolder gets its results in a 'lemma' directory created inside it.
process_subfolders('../data/clean')