# Lemmatisierung mit spaCy

In [1]:
import warnings
# Installs a blanket 'ignore' filter: no warning category or module is
# excluded, so deprecation and runtime warnings are hidden as well.
# NOTE(review): consider narrowing this to specific categories when debugging.
warnings.filterwarnings('ignore')

from cophi_toolbox import preprocessing
import metadata_toolbox.utils as metadata
import pandas as pd
from pathlib import Path

## Korpus laden und Metadaten auslesen

In [8]:
# Root folder of the project data and the corpus folder inside it.
# CAUTION: the text files in the corpus folder are overwritten by the
# lemmatisation step of this notebook -- back the folder up first.
data = 'Z:/TM_Schuchardt'
path_to_corpus = Path(data) / 'hsa_ita_lemma'

# File-name template from which the metadata fields are extracted, e.g.
#   1010_Graziadio-Isaia-Ascoli_an_Hugo-Schuchardt_1873-09-02_ita
# The field names become the column names of the metadata table, so the
# spelling 'reciever' (sic) is kept as it is.
pattern = '{id}_{sender}_an_{reciever}_{date}_{language}'

# Parse the metadata of every .txt file name, then stack the per-file
# results into a single table.
metadata_per_file = [
    metadata.fname2metadata(str(path), pattern=pattern)
    for path in path_to_corpus.glob('*.txt')
]
meta = pd.concat(metadata_per_file)

In [9]:
# Preview the first five rows of the metadata table.
meta.head()

Unnamed: 0,id,sender,reciever,date,language
Z:\TM_Schuchardt\hsa_ita_lemma\1010_Graziadio-Isaia-Ascoli_an_Hugo-Schuchardt_1873-09-02_ita.txt,1010,Graziadio-Isaia-Ascoli,Hugo-Schuchardt,1873-09-02,ita
Z:\TM_Schuchardt\hsa_ita_lemma\1057_Graziadio-Isaia-Ascoli_an_Hugo-Schuchardt_1879-02-17_ita.txt,1057,Graziadio-Isaia-Ascoli,Hugo-Schuchardt,1879-02-17,ita
Z:\TM_Schuchardt\hsa_ita_lemma\1058_Graziadio-Isaia-Ascoli_an_Hugo-Schuchardt_1879-06-24_ita.txt,1058,Graziadio-Isaia-Ascoli,Hugo-Schuchardt,1879-06-24,ita
Z:\TM_Schuchardt\hsa_ita_lemma\1083_Hugo-Schuchardt_an_Graziadio-Isaia-Ascoli_1869-01-13_ita.txt,1083,Hugo-Schuchardt,Graziadio-Isaia-Ascoli,1869-01-13,ita
Z:\TM_Schuchardt\hsa_ita_lemma\1084_Hugo-Schuchardt_an_Graziadio-Isaia-Ascoli_1869-02-11_ita.txt,1084,Hugo-Schuchardt,Graziadio-Isaia-Ascoli,1869-02-11,ita


In [10]:
# Number of rows in the metadata table.
meta.shape[0]

250

## Lemmatisierung mit spaCy

In [11]:
import spacy

# Name of the spaCy model to load. Model names noted for the languages of
# interest:
#   German:  de_core_news_sm
#   Italian: it_core_news_sm
#   Spanish: es_core_news_md
#   French:  fr_core_news_md
SPACY_MODEL = 'it_core_news_sm'

# Load the pipeline once; the lemmatisation loop in this notebook calls
# `nlp` on the text of every corpus file.
nlp = spacy.load(SPACY_MODEL)

### Dateien lemmatisieren und speichern

In [12]:
# Lemmatise every .txt file of the corpus in place.
# CAUTION: each file is overwritten with its lemmatised text.
for file in path_to_corpus.glob('*.txt'):
    original = file.read_text(encoding='utf-8')
    # One lemma per spaCy token, joined by single spaces.
    lemma_doc = ' '.join(token.lemma_ for token in nlp(original))
    file.write_text(lemma_doc, encoding='utf-8')