In [112]:
import pandas as pd
import zipfile
import os
import numpy as np

In [88]:
def getDataFrameFromZip(filename, folder=""):
    with zipfile.ZipFile(f"{folder}{"/" if folder != "" else ""}{filename}.zip", "r") as zipfolder:
        with zipfolder.open(f"{filename}.jsonl", "r") as jsonlfile:
            return pd.read_json(jsonlfile, lines=True)

In [89]:
folder = "scrapesbytopic"

In [90]:
sources = ["elconfidencial", "publico", "lavanguardia", "eldiario", "lanuevaespana", "lavozdeasturias", "elpais", "elespanol", "elmundo", "larazon", "elcomercio", "abc"]

categories = ["aborto", "bildu", "cambio-climatico", "casa-real", "ciudadanos", "covid", "cristianismo", "drogas", "erc", "franquismo", "guerra-ucrania", "impuestos", "independentismo", "inmigracion", "iu", "junts", "pnv", "podemos", "pp", "psoe", "sindicatos", "terrorismo", "toros", "vox", "yihadismo"]

In [91]:
def getFilename(folder, source, category):
    filenames = os.listdir(folder)
    for filename in filenames:
        if f"{source}-{category}" in filename:
            return f"{folder}/{filename}"
    return None

def getDataFrame(folder, source, category):
    filename = getFilename(folder, source, category)
    with open(filename, "r") as file:
        return pd.read_json(file, lines=True)

In [92]:
df = None

for source in sources:
    for category in categories:
        if df is None:
            df = getDataFrame(folder, source, category)
        else:
            newdf = getDataFrame(folder, source, category)
            if len(newdf) > 0:
                df = pd.concat([df, newdf], ignore_index=True)

In [93]:
df["source"].value_counts()

source
El País               183759
El Mundo               57090
El Español             54457
La Voz de Asturias     52627
La Razón               42514
Público                28421
La Nueva España        24453
La Vanguardia           7608
El Confidencial          498
Name: count, dtype: int64

In [94]:
df.to_json("scrapedbytopic.jsonl", orient="records", lines=True)

In [95]:
def cleanDataFrame(df):
    output = df.copy()
    output = output.loc[output["categories"].notna()]
    output["title"] = output["title"].apply(lambda x: x.replace("\xa0", " "))
    return output

In [96]:
df_p = cleanDataFrame(df)

In [97]:
with open("1000temas.txt", "r", encoding="utf-8") as reducedCategoryFile:
    reducedCategoryList = [line.replace("\n", "").split("->") for line in reducedCategoryFile.readlines()]
    reducedCategoryDict = {}
    for category in reducedCategoryList:
        if len(category) == 1:
            reducedCategoryDict[category[0]] = category[0]
        else:
            reducedCategoryDict[category[0]] = category[1]

def updateCategories(categoryList, dict):
    newCategories = []
    for category in categoryList:
        category = category.upper()
        if category in dict.keys() and dict[category] not in newCategories:
            newCategories.append(dict[category])
    if newCategories == []:
        newCategories = None
    return newCategories

df_p["categories"] = df_p["categories"].apply(lambda x: updateCategories(x, reducedCategoryDict))
df_p = df_p.loc[df_p["categories"].notna()]

In [98]:
def updateCategoriesByNewspaper(row, dict):
    for category in row["categories"]:
        if category not in dict.keys():
            dict[category] = {"El Confidencial": 0,
                              "El Diario": 0,
                              "El Español": 0,
                              "El Mundo": 0,
                              "El País": 0,
                              "La Nueva España": 0,
                              "La Razón": 0,
                              "La Vanguardia": 0,
                              "La Voz de Asturias": 0,
                              "Público": 0,
                              "TOTAL": 0}
        dict[category][row["source"]] = dict[category][row["source"]] + 1
        dict[category]["TOTAL"] = dict[category]["TOTAL"] + 1

In [99]:
categoriesByNewspaper = {}

df_p.apply(lambda x: updateCategoriesByNewspaper(x, categoriesByNewspaper), axis=1)

categoryByNewspaperMatrix = pd.DataFrame.from_dict(categoriesByNewspaper, orient="index").sort_index()

In [100]:
mainCategoryList = ["ABORTO", "DROGAS", "PSOE", "PP", "VOX", "CIUDADANOS", "JUNTS", "IU", "EH BILDU", "TERRORISMO", "INDEPENDENTISMO", "SINDICATOS", "COVID-19", "GUERRA EN UCRANIA", "IMPUESTOS", "CAMBIO CLIMÁTICO", "INDEPENDENTISMO", "YIHADISMO", "TERRORISMO", "FRANQUISMO", "CRISTIANISMO", "CASA REAL", "TOROS", "INMIGRACIÓN", "PODEMOS", "ERC", "PNV", "SINDICATOS"]

def extractMainCategories(categories, mainCategoryList):
    mainCategories = []

    for category in categories:
        if category in mainCategoryList:
            mainCategories.append(category)
    
    if len(mainCategories) == 0:
        mainCategories = None
    
    return mainCategories

df_p_maincategories = df_p.copy()
df_p_maincategories["categories"] = df_p_maincategories["categories"].apply(lambda x: extractMainCategories(x, mainCategoryList))
df_p_maincategories = df_p_maincategories.loc[df_p_maincategories["categories"].notna()]

In [101]:
df_p_maincategories["source"].value_counts()

source
El País               179591
El Mundo               56098
El Español             51267
La Voz de Asturias     50524
La Razón               41488
Público                26743
La Nueva España        24020
La Vanguardia           5991
El Confidencial          132
Name: count, dtype: int64

In [102]:
mainCategoriesByNewspaper = {}

df_p_maincategories.apply(lambda x: updateCategoriesByNewspaper(x, mainCategoriesByNewspaper), axis=1)

mainCategoriesByNewspaperMatrix = pd.DataFrame.from_dict(mainCategoriesByNewspaper, orient="index").sort_index()

In [103]:
mainCategoriesByNewspaperMatrix

Unnamed: 0,El Confidencial,El Diario,El Español,El Mundo,El País,La Nueva España,La Razón,La Vanguardia,La Voz de Asturias,Público,TOTAL
ABORTO,1,0,457,292,7473,334,1712,0,229,542,11040
CAMBIO CLIMÁTICO,1,0,1984,629,9958,1143,32,0,1338,186,15271
CASA REAL,4,0,2074,1529,10755,338,2181,89,1314,378,18662
CIUDADANOS,5,0,8141,13706,14917,2172,2490,1499,10917,1330,55177
COVID-19,1,0,2761,8630,9967,1975,2757,0,7210,1065,34366
CRISTIANISMO,4,0,2306,300,17138,43,2138,4,603,136,22672
DROGAS,4,0,1674,354,2489,763,1961,0,0,192,7437
EH BILDU,1,0,841,913,1825,679,2179,577,271,1044,8330
ERC,1,0,733,6918,16299,1915,3472,2732,4164,196,36430
FRANQUISMO,12,0,848,0,10303,224,236,259,349,1172,13403


In [104]:
df_p_maincategories.to_json("scrapedbytopic_maincategories_all.jsonl", orient="records", lines=True)

In [105]:
df_p_maincategories_original = getDataFrameFromZip("processed_maincategories_all")

In [106]:
df_p_maincategories_combined = pd.concat([df_p_maincategories, df_p_maincategories_original], ignore_index=True).drop_duplicates(subset=["title", "source", "datetime"], keep="last").reset_index(drop=True)

In [107]:
df_p_maincategories_combined

Unnamed: 0,responsedatetime,source,url,title,author,categories,datetime,text
0,2024-06-06 00:07:26,El Confidencial,https://www.elconfidencial.com/mundo/2023-11-2...,¿Podrá ilegalizar el aborto Javier Milei tras ...,P. Díaz,[ABORTO],2023-11-20 12:06:00,El nuevo presidente de Argentina para los próx...
1,2024-06-06 01:01:58,El Confidencial,https://www.elconfidencial.com/espana/pais-vas...,El CIS vasco da un empate entre el PNV y Bildu...,EFE,[PNV],2024-02-09 11:23:00,El PNV ganaría las elecciones al Parlamento Va...
2,2024-06-06 01:01:58,El Confidencial,https://www.elconfidencial.com/espana/pais-vas...,El impacto de las elecciones catalanas en las ...,Inés P. Chávarri,[PNV],2024-03-16 05:00:00,El PNV ha puesto tierra de por medido ante el ...
3,2024-06-06 01:01:58,El Confidencial,https://www.elconfidencial.com/espana/pais-vas...,Un Aberri Eguna en busca del desempate elector...,Joseba Arruti. Bilbao,[PNV],2024-03-31 17:22:00,Las encuestas dadas a conocer desde la convoca...
4,2024-06-06 01:01:58,El Confidencial,https://www.elconfidencial.com/espana/pais-vas...,El tetris de los pactos vascos tras el 21-A: l...,Inés P. Chávarri,"[PNV, PSOE]",2024-04-12 05:00:00,"Salvo cataclismo el 21-A, todo hace indicar qu..."
...,...,...,...,...,...,...,...,...
505136,2024-05-02 07:48:18,Público,https://www.publico.es/sociedad/reproduccion-a...,,,[COVID-19],2020-11-30 07:08:11,Los efectos de la pandemia se han dejado senti...
505137,2024-05-02 09:21:52,Público,https://www.publico.es/politica/cs-supera-al-p...,,,"[PODEMOS, IU, PP]",2018-02-26 08:43:26,"De celebrarse hoy las elecciones andaluzas, el..."
505138,2024-05-02 09:22:18,Público,https://www.publico.es/politica/marlaska-conde...,Marlaska condecorará a los policías que interv...,,[PSOE],2020-01-24 18:35:08,"Fuego, humo y piedras. Durante varias noches d..."
505139,2024-05-01 12:22:40,Público,https://www.publico.es/politica/gobierno-aprue...,El Gobierno aprueba este viernes los Presupue...,,"[CIUDADANOS, PP]",2017-03-30 20:26:52,El Gobierno aprobará este viernes los Presupue...


In [108]:
mainCategoriesByNewspaperCombined = {}

df_p_maincategories_combined.apply(lambda x: updateCategoriesByNewspaper(x, mainCategoriesByNewspaper), axis=1)

mainCategoriesByNewspaperCombinedMatrix = pd.DataFrame.from_dict(mainCategoriesByNewspaper, orient="index").sort_index()

In [117]:
mainCategoriesByNewspaperCombinedMatrix

Unnamed: 0,El Confidencial,El Diario,El Español,El Mundo,El País,La Nueva España,La Razón,La Vanguardia,La Voz de Asturias,Público,TOTAL
ABORTO,8,125,861,505,13965,622,3375,0,411,991,20863
CAMBIO CLIMÁTICO,215,375,3938,1237,20017,2309,1851,0,2526,372,32840
CASA REAL,42,161,5946,3491,19802,963,8161,294,2685,727,42272
CIUDADANOS,52,152,13174,21770,21773,3842,6459,3232,15584,2149,88187
COVID-19,18,2207,13782,22133,19286,4215,21969,0,15595,2529,101734
CRISTIANISMO,103,210,5293,963,31430,263,6096,32,1307,350,46047
DROGAS,147,71,3406,706,5221,1580,4612,0,0,375,16118
EH BILDU,7,451,1542,1460,2872,1016,3687,933,411,1543,13922
ERC,6,93,1233,10545,24335,3230,6069,5292,5872,327,57002
FRANQUISMO,86,494,1944,0,19373,438,962,508,589,2245,26639


In [118]:
# mainCategoriesByNewspaperCombinedMatrix.to_csv("processed_maincategories_combined_all_matrix.csv")

In [114]:
df_p_maincategories_combined.to_json("processed_maincategories_combined_all.jsonl", orient="records", lines=True)

In [116]:
df_p_maincategories_combined["source"].value_counts()

source
El País               156572
La Razón              108823
El Español             67961
El Mundo               59457
La Voz de Asturias     39729
La Nueva España        28008
Público                22470
El Diario              12387
La Vanguardia           7917
El Confidencial         1817
Name: count, dtype: int64