# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import json

# De contenido deprecado
#    # Usado en ronda 1, 2 de relacionar animes.
#    import re
#    # Usado en ronda 3 de relacionar.
#    from difflib import SequenceMatcher

# Combinar data de Web-Scraping y Api en archivos unicos.

Las series de archivos que se combinarán son las siguientes:

Con i entre 1 y 20:

1. listas_usuarios_i.pickle
2. detalles_anime_i.pickle
3. tags_anime_i.csv

Los archivos Usuarios_i.json no se combinarán, pues solo sirven para generar listas_usuarios_i.pickle.

In [48]:
# Merge listas_usuarios_i.pickle (i = 1..20) into one dictionary and save it.

listas_usuarios = {}
for numero in range(1, 21):
    ruta_parcial = os.path.join("demo Victor", "data", f"listas_usuarios_{numero}.pickle")
    with open(ruta_parcial, "rb") as parcial:
        # Keys already present are overwritten by the file loaded later.
        listas_usuarios.update(pickle.load(parcial))

ruta_salida = os.path.join("data", "listas_usuarios.pickle")
with open(ruta_salida, "wb") as salida:
    pickle.dump(listas_usuarios, salida)

In [49]:
# Merge detalles_anime_i.pickle (i = 1..20) into one dictionary and save it.

rutas_parciales = [
    os.path.join("demo Victor", "data", f"detalles_anime_{numero}.pickle")
    for numero in range(1, 21)
]

detalles_anime = {}
for ruta_parcial in rutas_parciales:
    with open(ruta_parcial, "rb") as parcial:
        # In-place dictionary union; later files win on duplicate keys.
        detalles_anime |= pickle.load(parcial)

with open(os.path.join("data", "detalles_anime.pickle"), "wb") as salida:
    pickle.dump(detalles_anime, salida)

In [50]:
# Merge tags_anime_i.csv (i = 1..20) into a single CSV.

# Read every partial file first and concatenate once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration.
partes_tags = [
    pd.read_csv(os.path.join("demo Victor", "data", f"tags_anime_{i}.csv"))
    for i in range(1, 21)
]
tags_anime = pd.concat(partes_tags)

# The index is not written, so the repeated per-file row labels are irrelevant on disk.
tags_anime.to_csv(os.path.join("data", "tags_anime.csv"), index=False)

# Crear dataframe de ratings de usuarios

In [41]:
# Reload the combined user lists from disk.
with open(os.path.join("data", "listas_usuarios.pickle"), "rb") as archivo:
    listas_usuarios = pickle.load(archivo)

lista_limpia = {}
for user_id, entradas in listas_usuarios.items():
    # "nombre" and "tipo" are removed from the entry so that only the
    # per-anime items remain in `entradas`.
    nombre = entradas.pop("nombre")
    tipo = entradas.pop("tipo")
    if tipo == "publica":
        # Per the original author's note this keeps only anime the user has watched
        # and scored: second field equal to 2 and first field (the score) above 0.
        calificados = {}
        for anime_id, datos in entradas.items():
            if (datos[1] == 2) and (datos[0] > 0):
                calificados[anime_id] = datos[0]
        # Users with nothing left after filtering are skipped.
        if calificados:
            lista_limpia[f"{user_id}:{nombre}"] = calificados

# One column per "<user_id>:<name>", one row per anime id.
gran_df = pd.DataFrame.from_dict(lista_limpia, orient="columns")
gran_df

Unnamed: 0,39110:xxbladexx20,216949:haijuta,246587:d3athzero,107198:JayJay1401,45036:Seleare,4328:Joseph_,110895:Rudenick,93256:Moltke,160129:dander,256598:melandrea,...,16805797:iambabymango,16809902:Flopoflop,16791488:RhaenyraFR,16838607:SasOnator,17063891:Dezwhite05,17197026:dumb_zoro_,16617450:kotyboh,16459755:Tiberjuice,16965619:Watcher794_2,16656371:Aoto_uchiha10
24,10.0,,10.0,,,,,,8.0,,...,,,,,,,,,,
519,10.0,,,,,,,,,,...,,,,,,,,,,
846,10.0,,8.0,,,,,,7.0,,...,,,,,,,,,,
849,10.0,,,7.0,,9.0,,,10.0,,...,,,,,,,,,,
1530,10.0,,,,,9.0,9.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40179,,,,,,,,,,,...,,,,,,,,,,
40508,,,,,,,,,,,...,,,,,,,,,,
41179,,,,,,,,,,,...,,,,,,,,,,
41188,,,,,,,,,,,...,,,,,,,,,,


In [40]:
# Print a summary of the dense ratings frame (index range, column count, dtypes, memory usage).
gran_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8978 entries, 24 to 33183
Columns: 7793 entries, 39110:xxbladexx20 to 16656371:Aoto_uchiha10
dtypes: float64(7793)
memory usage: 533.9 MB


**Como se puede ver, el peso de este dataframe es masivo, principalmente debido a la gran cantidad de valores faltantes (NaN): cada usuario solo ha calificado una pequeña fracción de los animes.**

In [45]:
# Convert the ratings to a sparse representation with int8 values. NaN is declared as
# the fill value, i.e. the value that is not physically stored and takes no space.
# NOTE(review): an integer subtype combined with a NaN fill_value may be warned about or
# rejected by newer pandas releases — confirm against the pandas version in use.
gran_df_sparse = gran_df.astype(pd.SparseDtype(np.int8, fill_value=np.nan))
gran_df_sparse.dtypes  # not the last expression of the cell, so its value is not displayed
gran_df_sparse.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8978 entries, 24 to 33183
Columns: 7793 entries, 39110:xxbladexx20 to 16656371:Aoto_uchiha10
dtypes: Sparse[int8, nan](7793)
memory usage: 2.7 MB


In [43]:
# Display the sparse ratings frame (the notebook renders a truncated preview).
gran_df_sparse

Unnamed: 0,39110:xxbladexx20,216949:haijuta,246587:d3athzero,107198:JayJay1401,45036:Seleare,4328:Joseph_,110895:Rudenick,93256:Moltke,160129:dander,256598:melandrea,...,16805797:iambabymango,16809902:Flopoflop,16791488:RhaenyraFR,16838607:SasOnator,17063891:Dezwhite05,17197026:dumb_zoro_,16617450:kotyboh,16459755:Tiberjuice,16965619:Watcher794_2,16656371:Aoto_uchiha10
24,10.0,,10.0,,,,,,8.0,,...,,,,,,,,,,
519,10.0,,,,,,,,,,...,,,,,,,,,,
846,10.0,,8.0,,,,,,7.0,,...,,,,,,,,,,
849,10.0,,,7.0,,9.0,,,10.0,,...,,,,,,,,,,
1530,10.0,,,,,9.0,9.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40179,,,,,,,,,,,...,,,,,,,,,,
40508,,,,,,,,,,,...,,,,,,,,,,
41179,,,,,,,,,,,...,,,,,,,,,,
41188,,,,,,,,,,,...,,,,,,,,,,


In [46]:
# Saved as pickle: per the original author's note, parquet does not support SparseDtype.
ruta_usuarios = os.path.join("data", "users_df.pickle")
with open(ruta_usuarios, "wb") as salida:
    pickle.dump(gran_df_sparse, salida)

# Drop the dense frame to free memory.
del gran_df

# Crear dataframe de detalles de los anime de MyAnimeList.

In [55]:
# Reload the combined MyAnimeList details.
with open(os.path.join("data", "detalles_anime.pickle"), "rb") as archivo:
    detalles_anime = pickle.load(archivo)

# One row per anime, built from the "detalles" sub-dictionary of each entry.
detalles_por_id = {anime_id: info["detalles"] for anime_id, info in detalles_anime.items()}
MAL_df = pd.DataFrame.from_dict(detalles_por_id, orient="index").reset_index(drop=True)

MAL_df.to_csv(os.path.join("data", "MAL_dataframe.csv"), index=False)

In [56]:
# Read the CSV back to inspect what was written to disk.
ruta_mal = os.path.join("data", "MAL_dataframe.csv")
MAL_df_open = pd.read_csv(ruta_mal)
MAL_df_open

Unnamed: 0,id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,media_type,status,num_episodes,start_season,studios
0,1,Cowboy Bebop,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': [], 'en': 'Cowboy Bebop', 'ja': '...",1998-04-03,1999-04-24,"Crime is timeless. By the year 2071, humanity ...",8.75,44.0,43,1818165,939118,tv,finished_airing,26,"{'year': 1998, 'season': 'spring'}","[{'id': 14, 'name': 'Sunrise'}]"
1,32772,Puzzle & Dragons Cross,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': ['PazuDora Cross'], 'en': 'Puzzle...",2016-07-04,2018-03-26,"Dorogoza Island is rich in ""Drop Energy"" that ...",6.49,6728.0,6665,8409,2300,tv,finished_airing,89,"{'year': 2016, 'season': 'summer'}","[{'id': 1, 'name': 'Pierrot'}]"
2,5,Cowboy Bebop: Tengoku no Tobira,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': [""Cowboy Bebop: Knockin' on Heave...",2001-09-01,2001-09-01,"Another day, another bounty—such is the life o...",8.38,194.0,610,369342,210529,movie,finished_airing,1,"{'year': 2001, 'season': 'summer'}","[{'id': 4, 'name': 'Bones'}]"
3,6,Trigun,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': [], 'en': 'Trigun', 'ja': 'トライガン'}",1998-04-01,1998-09-30,"Vash the Stampede is the man with a $$60,000,0...",8.22,334.0,248,745104,364884,tv,finished_airing,26,"{'year': 1998, 'season': 'spring'}","[{'id': 11, 'name': 'Madhouse'}]"
4,7,Witch Hunter Robin,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': ['WHR'], 'en': 'Witch Hunter Robi...",2002-07-03,2002-12-25,Robin Sena is a powerful craft user drafted in...,7.24,2942.0,1825,114294,43323,tv,finished_airing,26,"{'year': 2002, 'season': 'summer'}","[{'id': 14, 'name': 'Sunrise'}]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8973,30705,Makura no Danshi,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': ['Pillow Boys'], 'en': 'makuranod...",2015-07-14,2015-09-29,Whispering sweet lullabies into the ears of th...,4.46,13045.0,3316,42035,21599,tv,finished_airing,12,"{'year': 2015, 'season': 'summer'}","[{'id': 91, 'name': 'feel.'}, {'id': 1271, 'na..."
8974,30709,Kamisama Hajimemashita: Kako-hen,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': [], 'en': '', 'ja': '神様はじめました～過去編～'}",2015-08-20,2016-08-19,While playing in the snow one day at her shrin...,8.39,185.0,1385,163580,80581,ova,finished_airing,4,"{'year': 2015, 'season': 'summer'}","[{'id': 73, 'name': 'TMS Entertainment'}]"
8975,30711,Code Geass: Boukoku no Akito 5 - Itoshiki Mono...,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': [], 'en': 'Code Geass: Akito the ...",2016-02-06,2016-02-06,"The Ark Fleet has been destroyed, and a signif...",7.13,3545.0,2178,87905,43134,movie,finished_airing,1,"{'year': 2016, 'season': 'winter'}","[{'id': 14, 'name': 'Sunrise'}]"
8976,32761,Gekkou Shokudou,{'medium': 'https://cdn.myanimelist.net/images...,"{'synonyms': [], 'en': '', 'ja': '月光食堂'}",2012-11-02,2012-11-02,"A music video for Fullkawa Honpo's song ""Gekko...",5.64,,16308,417,227,music,finished_airing,1,"{'year': 2012, 'season': 'fall'}",[]


# **DEPRECADO: NO USAR.** Relacionar MyAnimeList con AnimePlanet

El proposito de este proceso es (era) combinar las id's de MyAnimeList con los nombres de AnimePlanet.

El proceso sigue 3 rondas (pasos 1 a 3), seguidas de una revisión manual (pasos 4 y 5).

1. Busca todos los titulos en los detalles del anime de MyAnimeList, luego busca si encuentra alguno de ellos en Animeplanet

2. Compara todos los titulos de MyAnimeList que no se encontraron en la ronda 1 con los titulos alternativos (Alts) de Animeplanet.

3. Si eso falla, se hara a fuerza bruta encontrar el titulo mas similar, este proceso puede tardar mas de una hora. Y si el titulo es muy disimilar, entonces no se considerara.

4. Al final, el restante deberia ser suficientemente chico como para revisar a mano.

5. Despues de eso, deberia ser necesario revisar que no hayan quedado duplicados, en cuyo caso hay que arreglarlos

**Al final no se usó este método, debido a la gran cantidad de duplicados y a las inexactitudes que pueden surgir en la ronda 3, que solo produce aproximaciones. Además, consume mucho poder de cómputo: tarda más de 2 horas en ejecutarse.**

In [18]:
# Open the AnimePlanet dataframe (with tags).
all_anime_df = pd.read_csv(os.path.join("data", "tags_anime.csv"))

# Open the MyAnimeList anime details.
with open(os.path.join("data", "detalles_anime.pickle"), "rb") as archivo:
    detalles_anime = pickle.load(archivo)

# Sort ascending by the "popularity" field. In the MAL data shown in this notebook a
# lower value means a more popular title, so popular anime are matched first.
detalles_anime = dict(sorted(detalles_anime.items(), key=lambda x: x[1]["detalles"]["popularity"]))

# Convert the stringified Tags list back into a list of strings.
all_anime_df["Tags"] = all_anime_df["Tags"].apply(
    lambda texto: [tag.strip("'") for tag in texto.lstrip("[").rstrip("]").split(", ")]
)

# Convert Alts into a list of alternative titles.
# The missing values are filled by assigning the result back to the column: the previous
# `all_anime_df["Alts"].fillna("", inplace=True)` acted on a column selection, which is
# not guaranteed to update the frame (it does not under pandas Copy-on-Write), and the
# `.strip` below would then fail on NaN.
all_anime_df["Alts"] = (
    all_anime_df["Alts"]
    .fillna("")
    .apply(lambda texto: texto.strip("\r\n"))
    .apply(lambda texto: texto.replace("Alt title: ", "").replace("Alt titles: ", "").split(", "))
)

all_anime_df.head(3)

Unnamed: 0,Anime,URL,Tags,Alts
0,Attack on Titan The Final Season: The Final Ch...,https://www.anime-planet.com/anime/attack-on-t...,"[Action, Drama, Fantasy, Shounen, Dark Fantasy...","[Attack on Titan The Final Season: Part III, S..."
1,Fullmetal Alchemist: Brotherhood,https://www.anime-planet.com/anime/fullmetal-a...,"[Action, Adventure, Drama, Fantasy, Mystery, S...",[Hagane no Renkinjutsushi: Full Metal Alchemist]
2,Fruits Basket the Final Season,https://www.anime-planet.com/anime/fruits-bask...,"[Drama, Fantasy, Romance, Shoujo, Animal Trans...",[Fruits Basket the Final]


## Ronda 1

In [19]:
# Round 1: match MyAnimeList titles against AnimePlanet main titles after reducing both
# to lowercase alphanumeric characters, which makes exact comparison more forgiving.

# Imported in this cell because the notebook-level import of `re` is commented out;
# without it this cell raises NameError on a fresh kernel.
import re

NO_ALFANUMERICO = re.compile(r"[^a-zA-Z0-9]")

# dummy_df has the same shape and index as all_anime_df, but with normalized titles.
# Sharing the index lets a boolean mask computed on dummy_df select the original rows.
dummy_df = all_anime_df.copy()
dummy_df["Anime"] = dummy_df["Anime"].str.lower()
dummy_df["Anime"] = dummy_df["Anime"].str.replace(r"[^a-zA-Z0-9]", "", regex=True)

# Kernel-wide setting, left in place because other cells may rely on it.
pd.options.mode.chained_assignment = None  # default='warn'

partes_1 = []  # Matched rows, one small frame per MyAnimeList id.
not_found_1 = {}  # Ids (with their candidate titles) that could not be matched in this round.

for anime_id, value in detalles_anime.items():
    details = value["detalles"]
    lista_titulos = [details["title"]] + (details["alternative_titles"]["synonyms"]) + [(details["alternative_titles"]["en"])]
    for titulo in lista_titulos:
        titulo_plano = NO_ALFANUMERICO.sub("", titulo.lower())
        if not titulo_plano:  # The title consists only of special characters: skip it.
            continue
        mini = all_anime_df[dummy_df["Anime"] == titulo_plano]
        if not mini.empty:  # Found a match.
            break
    else:  # No candidate title matched.
        not_found_1[anime_id] = lista_titulos
        continue
    # assign() returns a new frame, so nothing is written onto a slice of all_anime_df.
    partes_1.append(mini.assign(MAL_ID=anime_id))

# Concatenate once instead of growing a DataFrame inside the loop.
titulos_1 = pd.concat(partes_1) if partes_1 else pd.DataFrame()

path = os.path.join("data", "titulos_1.csv")
titulos_1.to_csv(path, index=False)

titulos_1

Unnamed: 0,Anime,URL,Tags,Alts,MAL_ID
88,Attack on Titan,https://www.anime-planet.com/anime/attack-on-t...,"[Action, Fantasy, Horror, Shounen, Dark Fantas...",[Shingeki no Kyojin],16498
154,Death Note,https://www.anime-planet.com/anime/death-note,"[Mystery, Shounen, Contemporary Fantasy, Crime...",[],1535
1,Fullmetal Alchemist: Brotherhood,https://www.anime-planet.com/anime/fullmetal-a...,"[Action, Adventure, Drama, Fantasy, Mystery, S...",[Hagane no Renkinjutsushi: Full Metal Alchemist],5114
56,One-Punch Man,https://www.anime-planet.com/anime/one-punch-man,"[Action, Comedy, Sci Fi, Seinen, Cyborgs, Mons...",[],30276
1060,Sword Art Online,https://www.anime-planet.com/anime/sword-art-o...,"[Action, Adventure, Fantasy, Sci Fi, Dungeon, ...",[],11757
...,...,...,...,...,...
20812,Ai ga Oshigoto,https://www.anime-planet.com/anime/ai-ga-oshigoto,"[Family Friendly, Minna no Uta]",[],53765
19167,Chiisana Yume,https://www.anime-planet.com/anime/chiisana-yume,"[Family Friendly, Minna no Uta]",[],53753
17701,Jidousha ni Natta Kame no Uta,https://www.anime-planet.com/anime/jidousha-ni...,"[Family Friendly, Minna no Uta]",[],53791
5963,The Haunted House: The Secret of the Cave,https://www.anime-planet.com/anime/the-haunted...,"[Adventure, Horror, Family Friendly, Korean An...",[Sinbi Apartment: Geumbit Dokkaebiwa Bimirui D...,48270


## Ronda 2

In [20]:
# Round 2: match the titles left over from round 1 against AnimePlanet alternative
# titles, again after reducing everything to lowercase alphanumeric characters.

# Imported in this cell because the notebook-level import of `re` is commented out;
# without it this cell raises NameError on a fresh kernel.
import re

NO_ALFANUMERICO = re.compile(r"[^a-zA-Z0-9]")

# some_anime_df is the part of all_anime_df that excludes rows already matched in round 1.
some_anime_df = all_anime_df[~all_anime_df.index.isin(titulos_1.index)]

dummy_df_2 = some_anime_df.copy()
dummy_df_2["Anime"] = dummy_df_2["Anime"].str.lower()
dummy_df_2["Anime"] = dummy_df_2["Anime"].str.replace(r"[^a-zA-Z0-9]", "", regex=True)

# This time the search runs over the normalized alternative titles.
dummy_df_2["Alts"] = dummy_df_2["Alts"].apply(
    lambda alts: [NO_ALFANUMERICO.sub("", alt.lower()) for alt in alts]
)

not_found_2 = {}
partes_2 = []  # Matched rows, one small frame per MyAnimeList id.

for anime_id, lista_titulos in not_found_1.items():  # Only ids not matched in the previous round.
    for titulo in lista_titulos:
        titulo_plano = NO_ALFANUMERICO.sub("", titulo.lower())
        if not titulo_plano:
            continue
        mini = some_anime_df[dummy_df_2["Alts"].apply(lambda alts: titulo_plano in alts)]
        if not mini.empty:
            break
    else:  # No candidate title matched.
        not_found_2[anime_id] = lista_titulos
        continue
    # assign() returns a new frame, so nothing is written onto a slice of some_anime_df.
    partes_2.append(mini.assign(MAL_ID=anime_id))

# Concatenate once instead of growing a DataFrame inside the loop.
titulos_2 = pd.concat(partes_2) if partes_2 else pd.DataFrame()

path = os.path.join("data", "titulos_2.csv")
titulos_2.to_csv(path, index=False)

titulos_2

Unnamed: 0,Anime,URL,Tags,Alts,MAL_ID
1237,Future Diary,https://www.anime-planet.com/anime/future-diary,"[Action, Shounen, Battle Royale, High Stakes G...",[Mirai Nikki],10620
17,Attack on Titan The Final Season,https://www.anime-planet.com/anime/attack-on-t...,"[Action, Drama, Fantasy, Horror, Shounen, Dark...",[Shingeki no Kyojin The Final Season],40028
256,Food Wars! Shokugeki no Souma,https://www.anime-planet.com/anime/food-wars-s...,"[Comedy, Ecchi, Shounen, Animeism, Boarding Ho...",[Shokugeki no Souma],28171
15,Demon Slayer: Kimetsu no Yaiba Movie - Mugen T...,https://www.anime-planet.com/anime/demon-slaye...,"[Action, Drama, Shounen, Demons, Historical, M...",[Kimetsu no Yaiba Movie: Mugen Ressha-hen],40456
71,Assassination Classroom 2nd Season,https://www.anime-planet.com/anime/assassinati...,"[Action, Comedy, Sci Fi, Shounen, Assassins, N...",[Ansatsu Kyoushitsu 2nd Season],30654
...,...,...,...,...,...
10568,dalbitgunggwol,https://www.anime-planet.com/anime/dalbit-gung...,"[Adventure, Fantasy, Korean Animation]",[Lost in the Moonlight],43387
15407,Akai Boushi,https://www.anime-planet.com/anime/akai-boushi,[Minna no Uta],[Minna no Uta],43521
22430,Kotatsu Musume de Teketekete,https://www.anime-planet.com/anime/kotatsu-mus...,"[Family Friendly, Minna no Uta]",[Minna no Uta:],43521
15407,Akai Boushi,https://www.anime-planet.com/anime/akai-boushi,[Minna no Uta],[Minna no Uta],42853


## Ronda 3

In [24]:
# Round 3: fuzzy matching. For every MyAnimeList id still unmatched, find the most
# similar AnimePlanet title (main or alternative) and accept it only above a threshold.

# Imported in this cell because the notebook-level import of SequenceMatcher is
# commented out; without it this cell raises NameError on a fresh kernel.
from difflib import SequenceMatcher

UMBRAL_SIMILITUD = 0.8  # Minimum similarity ratio for a match to be accepted.

# all_anime_df without the rows matched in rounds 1 and 2.
last_anime_df = all_anime_df[~((all_anime_df.index.isin(titulos_1.index)) | (all_anime_df.index.isin(titulos_2.index)))]

dummy_df_3 = last_anime_df.copy()
dummy_df_3["Anime"] = dummy_df_3["Anime"].str.lower()
dummy_df_3["Alts"] = dummy_df_3["Alts"].apply(lambda alts: [alt.lower() for alt in alts])

# Main title and alternative titles joined into one candidate list per row.
dummy_df_3["Combinado"] = dummy_df_3.apply(lambda fila: [fila["Anime"]] + fila["Alts"], axis=1)

# A single matcher is reused: SequenceMatcher caches its analysis of the second
# sequence, so the MyAnimeList title is set once with set_seq2 and only the first
# sequence changes per candidate. Argument order (candidate, title) is unchanged.
comparador = SequenceMatcher(None)


def _mejor_ratio(candidatos):
    """Return the highest ratio between any candidate and the title loaded as seq2."""
    ratios_candidatos = []
    for candidato in candidatos:
        comparador.set_seq1(candidato)
        ratios_candidatos.append(comparador.ratio())
    return max(ratios_candidatos)


found_3 = {}
not_found_3 = {}
partes_3 = []  # Accepted rows, one small frame per MyAnimeList id.

for anime_id, lista_titulos in not_found_2.items():
    max_score = 0
    max_score_id = None
    for titulo in lista_titulos:
        if not titulo:
            continue
        # Find the most similar row for this title.
        comparador.set_seq2(titulo.lower())
        ratios = dummy_df_3["Combinado"].apply(_mejor_ratio)

        score_id = ratios.idxmax()
        score = ratios[score_id]

        if score > max_score:
            max_score = score
            max_score_id = score_id

    resultado = {"titulos": lista_titulos, "score": max_score, "max_score_id": max_score_id}
    if (max_score < UMBRAL_SIMILITUD) or (max_score_id is None):
        not_found_3[anime_id] = resultado
        continue
    found_3[anime_id] = resultado
    # assign() returns a new frame, so nothing is written onto a slice of last_anime_df.
    partes_3.append(last_anime_df.loc[[max_score_id]].assign(MAL_ID=anime_id))

# Concatenate once instead of growing a DataFrame inside the loop.
titulos_3 = pd.concat(partes_3) if partes_3 else pd.DataFrame()

path = os.path.join("data", "titulos_3.csv")
titulos_3.to_csv(path, index=False)
titulos_3

OSError: Cannot save file into a non-existent directory: 'research'

In [25]:
# Persist the round-3 results: the matched rows as CSV, the bookkeeping dicts as pickle.
titulos_3.to_csv(os.path.join("data", "titulos_3.csv"), index=False)

for nombre_archivo, contenido in (("found_3.pickle", found_3), ("not_found_3.pickle", not_found_3)):
    with open(os.path.join("data", nombre_archivo), "wb") as salida:
        pickle.dump(contenido, salida)

# Relacionar MyAnimeList con AnimePlanet con una herramienta de terceros.

Como el método anterior no dio buenos resultados, se usó una herramienta de terceros que relaciona distintos sitios de anime.

- https://github.com/manami-project/anime-offline-database

Esta solo se uso para poder relacionar las bases de datos, nada mas.

In [None]:
# MyAnimeList details table.
MAL_df = pd.read_csv(os.path.join("data", "MAL_dataframe.csv"))

# AnimePlanet titles, URLs and tags.
AP_df = pd.read_csv(os.path.join("data", "tags_anime.csv"))

# Third-party cross-site database (JSON).
ruta_base_externa = os.path.join("Ayuda Externa", "anime-offline-database.json")
with open(ruta_base_externa, encoding="utf-8") as archivo:
    anime_offline_database = json.load(archivo)

In [None]:
# Link AnimePlanet rows to MyAnimeList ids using the cross-site database.

ids_usables = (MAL_df["id"])
# Set copy for constant-time membership tests inside the loop.
ids_usables_set = set(ids_usables.tolist())

partes_unificadas = []  # Matched AnimePlanet rows, one small frame per database entry.

for anime in anime_offline_database["data"]:
    sources = [x.replace("https://", "") for x in anime["sources"]]
    # Maps the site host to the third path segment of its URL: the numeric id for
    # MyAnimeList and the URL slug for AnimePlanet.
    MAL_AP = {x.split("/")[0]: x.split("/")[2] for x in sources if ("anime-planet.com" in x) or ("myanimelist.net" in x)}
    # -1 marks "no MyAnimeList source"; the original author notes it is never a usable id.
    mal_id = int(MAL_AP.get("myanimelist.net", "-1"))
    ap_get = MAL_AP.get("anime-planet.com", False)
    if (not ap_get) or (mal_id == -1):
        continue
    # Skip ids we have no MyAnimeList details for before doing the AnimePlanet lookup;
    # this gives the same rows as filtering the combined frame afterwards.
    if mal_id not in ids_usables_set:
        continue

    ap_url = AP_df[AP_df["URL"] == f"https://www.anime-planet.com/anime/{ap_get}"]
    if ap_url.empty:
        continue
    # assign() returns a new frame, so nothing is written onto a slice of AP_df and the
    # cell does not depend on the chained-assignment warning being disabled elsewhere.
    partes_unificadas.append(ap_url.assign(id=mal_id))

# Concatenate once instead of growing a DataFrame inside the loop. When nothing matched,
# fall back to an empty frame that still has the expected columns.
if partes_unificadas:
    unified_df = pd.concat(partes_unificadas, axis=0)
else:
    unified_df = pd.DataFrame(columns=[*AP_df.columns, "id"])

path = os.path.join("data", "unified_df.csv")
unified_df.to_csv(path, index=False)

unified_df