## Import all the libraries 

In [1]:
import stanza
from csv import DictWriter
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Read the raw CSV

In [2]:
# Load the scraped news corpus from the working directory and preview the
# first five rows as the cell's rich output.
raw_df = pd.read_csv(filepath_or_buffer="raw_data_corpus.csv")
raw_df.head(n=5)

Unnamed: 0,Source,Title,Content,Section,URL,Date
0,La Jornada,Una plata y dos bronces para mexicanos en la C...,A 146 días de que comiencen los Juegos Olímpic...,Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
1,La Jornada,Sergio Pérez arranca la temporada con podio e...,El mexicano Sergio Pérez constató que se ha co...,Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
2,La Jornada,El torneo dejó claro que Acapulco está de pie:...,"Acapulco, Gro., El Abierto Mexicano de Tenis “...",Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
3,La Jornada,"Lista para votarse, ley de salario base en el ...",Las reformas a la Ley Federal del Trabajo (LFT...,Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
4,La Jornada,América humilla 5-1 a domicilio al Atlas,"Pese a fallar un penal, el América logró remon...",Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024


## Normalize the dataset (lemmatize and drop function words)

In [3]:
# Download the Spanish models and build the default Spanish pipeline.
stanza.download('es')
nlp = stanza.Pipeline('es')

# Part-of-speech tags whose tokens are discarded during normalization:
# determiners, adpositions, coordinating/subordinating conjunctions, pronouns.
stop_words = ['DET', 'ADP', 'CCONJ', 'SCONJ', 'PRON']

# Normalize every headline: keep the lemma of each token whose POS tag is not
# in `stop_words`, capitalize the first kept lemma, and join with single spaces.
titles = []
for t in raw_df['Title']:
    if not isinstance(t, str):
        # Missing values are read by pandas as NaN (a float), not text.
        # Emit an empty title so `titles` stays aligned with raw_df's rows.
        titles.append('')
        continue

    doc = nlp(t)
    lemmas = [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.pos not in stop_words
    ]
    if lemmas:
        # str.capitalize() also lower-cases the rest of the first lemma.
        lemmas[0] = lemmas[0].capitalize()

    # str methods return a new string; the result must be used, not discarded.
    titles.append(' '.join(lemmas).lstrip())

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 10.4MB/s]                    
2024-03-08 22:27:26 INFO: Downloaded file to /Users/k-dot/stanza_resources/resources.json
2024-03-08 22:27:26 INFO: Downloading default packages for language: es (Spanish) ...
2024-03-08 22:27:27 INFO: File exists: /Users/k-dot/stanza_resources/es/default.zip
2024-03-08 22:27:29 INFO: Finished downloading models and saved to /Users/k-dot/stanza_resources
2024-03-08 22:27:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 6.87MB/s]                    
2024-03-08 22:27:30 INFO: Downloaded file to /Users/k-dot/stanza_resources/resources.json
2024-03-08 22:27:31 INFO: Loading these models 

In [None]:
# Normalize every article body: keep the lemma of each token whose POS tag is
# not in `stop_words`, capitalize the first kept lemma, and join with spaces.
contents = []
for c in raw_df['Content']:
    if not isinstance(c, str):
        # Non-text cells (e.g. NaN for a missing body) become an empty string
        # so `contents` stays aligned with raw_df's rows.
        contents.append('')
        continue

    doc = nlp(c)
    lemmas = [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.pos not in stop_words
    ]
    if lemmas:
        # str.capitalize() also lower-cases the rest of the first lemma.
        lemmas[0] = lemmas[0].capitalize()

    # Joining once avoids repeated string concatenation on long articles, and
    # the stripped result is actually used (str methods do not mutate in place).
    contents.append(' '.join(lemmas).lstrip())

In [None]:
# Write the normalized corpus: same rows as raw_df, with Title and Content
# replaced by their normalized versions. Every other field is copied through.
fields = ["Source", "Title", "Content", "Section", "URL", "Date"]

# Check alignment before opening the output file, so a stale or partially
# re-run kernel cannot truncate an existing CSV and then fail midway.
assert len(titles) == len(raw_df), "titles is out of sync with raw_df; re-run the title cell"
assert len(contents) == len(raw_df), "contents is out of sync with raw_df; re-run the content cell"

with open('normalized_data_corpus.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = DictWriter(csv_file, fieldnames=fields)
    writer.writeheader()
    for position in range(len(raw_df)):
        record = raw_df.iloc[position]
        # Look columns up by name rather than by position, so an extra or
        # reordered column in the raw CSV cannot shift values into the wrong field.
        writer.writerow({
            "Source": record["Source"],
            "Title": titles[position],
            "Content": contents[position],
            "Section": record["Section"],
            "URL": record["URL"],
            "Date": record["Date"],
        })

In [65]:
# Read the normalized corpus back from disk and preview the first five rows
# to check the result of the export.
normalized_df = pd.read_csv(filepath_or_buffer="normalized_data_corpus.csv")
normalized_df.head(n=5)

Unnamed: 0,Source,Title,Content,Section,URL,Date
0,La Jornada,Plata dos bronce mexicano Copa Mundial Clavados,"146 día comencar Juegos Olímpicos , clavadista...",Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
1,La Jornada,Sergio Pérez arrancar temporada podio Baréin ;...,Mexicano Sergio Pérez constatar haber converti...,Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
2,La Jornada,Torneo dejar claro Acapulco estar pie : Falla,"Acapulco , Gro. , Abierto Mexicano Tenis "" val...",Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
3,La Jornada,"Lista votar , ley salario base deporte","Reforma Ley Federal Trabajo ( LFT ) , fin gara...",Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
4,La Jornada,América humillar 5-1 domicilio Atlas,"Pese fallar penal , América lograr remontar ma...",Sports,https://www.jornada.com.mx/2024/03/03/deportes...,3/3/2024
