# Merge s5 url+text types -> s6 total json (& clean other already-merged types)
- **By:** Sofia Kobayashi
- **Start Date:** 04/03/2024
- **Description:** Need to merge fic + text files for data types:
    - (1) fic
    - (2) author
    - (3) series
    - (4) Clean: coffee, collections, look into, other


In [84]:
import pandas as pd
import re
import json
import numpy as np
from collections import Counter

import AO3

import os
from dotenv import load_dotenv
from pathlib import Path
# Load environment variables from a .env file in the current working directory
env_path = Path(".").joinpath(".env")
load_dotenv(dotenv_path=env_path)

# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [85]:
# Constants
# Sort rank per fic location code (lower sorts first). merge_fic() sorts a fic's
# locations by this rank and takes the first one as its primary location.
LOCATION_ORDER = {'ao3': 1,
                  'ffn_net': 2,
                  'tum': 3,
                  'drm': 4,
                  'lvj': 5,
                  'fcb': 6,
                  'wat': 7,
                  'pdf': 9,
                  'oth': 100,
                 }

# Sort rank per database type (lower sorts first). merge_fic() sorts a fic's
# dtb types by this rank and takes the first one as its primary dtb.
DTB_ORDER = {'coffee': 1,
            'read': 2,
            'cont_read': 3,
            'to_read': 4}

In [86]:
# Helper Functions
def df_to_dict(df):
    """Collapse a DataFrame into a dict of unique, non-null values per column.

    Returns a dict mapping each column name to a list of that column's distinct
    non-NaN values (the order within each list is not guaranteed).
    Raises Exception when the DataFrame has no rows.
    """
    # Make sure there are NOT empty dfs
    if not len(df):
        raise Exception("ERROR: given empty dict in df_to_dict()")

    # Per column: drop NaN's, then de-dup through a set
    return {
        column: list({value for value in values if not pd.isna(value)})
        for column, values in df.to_dict('list').items()
    }

## (1) Re-merge s5 fic text
- **Problem:** The initial s4 -> s5 merge was NOT THOROUGH, many duplicates still exist
- **Thus:** Will add a stripped title col called 'title_str', merge on that
- **Desired Result:** s5 fic text JSON with no duplicates

### Add title_str col & make s5 fic text -> JSON
- Matching on titles was difficult because of differing apostrophes, spacing, or parentheses, so strip all of that to make title_str

In [4]:
# Read in relevant files
with open("clean_data_5/fic_url.json", "r") as url_file:
    fic_url = json.load(url_file)

# Titles are forced to str so strip_title() can iterate over every value
# (note: missing titles therefore become the literal string 'nan')
fic_text_df = pd.read_csv(
    "clean_data_5/fic_text.csv", index_col=0, encoding="utf-8-sig"
).astype({'title': str})

In [5]:
# Function to make title string
def strip_title(title):
    """Return `title` lower-cased with every non-alphanumeric character removed."""
    # Whitespace is not alphanumeric, so the filter already drops all spaces
    return "".join(filter(str.isalnum, title)).lower()

# Testing
strip_title("he's the fun uncle. you know, the immortal one.")

'hesthefununcleyouknowtheimmortalone'

In [116]:
# Code to add cleaned title string col (element-wise strip_title over the titles)
fic_text_df['title_str'] = fic_text_df['title'].map(strip_title)

### Remerge s5 fic text on the new title_str
- Merges on title_str

In [7]:
def convert_to_dict_s5_text(df):
    """Flatten each column of an s5 fic-text DataFrame into a list of unique values.

    Every cell is normalised before de-duplication:
      * a string starting with '[' is treated as a stringified list; its
        elements are added individually;
      * the strings "true" / "false" (any casing) become booleans;
      * anything else is kept unchanged.
    Nulls are dropped. De-duplication goes through a set, so the order of each
    returned list is not guaranteed.

    Args:
        df: DataFrame whose cells are scalars or stringified lists.

    Returns:
        dict mapping each column name to the list of its unique non-null values.
    """
    cols = df.to_dict('list')

    for key, raw_values in cols.items():
        new_row = []

        for val in raw_values:
            # If list-wrapped string, break up values
            if isinstance(val, str) and val.startswith('['):
                # Turn the single-quoted Python-style list into JSON-style double quotes
                clean_val = (val.replace("['", '["')
                                .replace("']", '"]')
                                .replace("', '", '", "')
                                .replace("','", '","'))
                try:
                    parsed = json.loads(clean_val)
                except ValueError:
                    # json.JSONDecodeError is a ValueError. Keep the raw text, marked
                    # with a leading "." (presumably to flag it for manual clean-up).
                    new_row.append("." + val)
                else:
                    new_row.extend(parsed)
            # If stringified boolean
            elif isinstance(val, str) and val.lower() == "false":
                new_row.append(False)
            elif isinstance(val, str) and val.lower() == "true":
                new_row.append(True)
            else:
                new_row.append(val)

        # De-dup & drop all nulls
        cols[key] = list(set([ele for ele in new_row if not pd.isna(ele)]))

    return cols

In [8]:
# Function to merge fics
def merge_fic(parent_df, title_str):
    """Merge every row of `parent_df` that shares `title_str` into one fic dict.

    Args:
        parent_df: s5 fic-text DataFrame; must contain a 'title_str' column plus
            the columns read below (smk_sources, author, version_nums, ...).
        title_str: stripped title (see strip_title) identifying the fic.

    Returns:
        dict with one entry per output attribute. "primary_*" values are the
        first element of the corresponding sorted list ("" when empty); boolean
        flags are True if any merged row was True (None when no row had a value).

    Raises:
        Exception: if no row of `parent_df` has the given title_str.
        KeyError: if a location or dtb value is missing from LOCATION_ORDER /
            DTB_ORDER.
    """
    df = parent_df[parent_df['title_str'] == title_str]

    # Make sure there are NOT empty dfs
    if len(df) == 0:
        raise Exception(f"ERROR: no rows with title_str '{title_str}' in merge_fic()")

    # Get dict, remove NaN's & de-dup lists
    cols = convert_to_dict_s5_text(df)

    def split_csv(strings):
        # Split comma-joined strings into a set of stripped parts
        return {part.strip() for joined in strings for part in joined.split(',')}

    def any_or_none(values):
        # True if any value is truthy; None when there are no values at all
        return any(values) if values else None

    def first_or_blank(values):
        return values[0] if values else ""

    # Sort & clean various attributes
    # Sources look like "v8_local_files": order them by the number after the "v"
    sorted_sources = sorted(cols['smk_sources'], key=lambda source: float(source.split('_')[0][1:]))
    clean_authors = list({author.replace(",", "") for author in cols['author']})
    sorted_versions = sorted(cols['version_nums'])
    clean_locs = [loc for loc in cols["locations"] if loc != '???']
    sorted_locations = sorted(clean_locs, key=lambda loc: LOCATION_ORDER[loc])
    sorted_dtbs = sorted(split_csv(cols['dtb_types']), key=lambda dtb: DTB_ORDER[dtb])
    sorted_tags = sorted(split_csv(cols['all_tags']))

    ratings = cols['all_ratings']
    avg_rating = round(sum(ratings) / len(ratings), 2) if ratings else None

    # NOTE: the lists in `cols` come out of a set, so the [0] picks for 'title'
    # and 'primary_url' are arbitrary when the merged rows disagree.
    return {'num_appeared': len(df),
            'primary_versions': first_or_blank(sorted_versions),
            'version_num': sorted_versions,
            'primary_source': first_or_blank(sorted_sources),
            'smk_source': sorted_sources,
            'primary_dtb': first_or_blank(sorted_dtbs),
            'dtb_type': sorted_dtbs,
            'primary_loction': first_or_blank(sorted_locations),
            'location': sorted_locations,
            'primary_url': first_or_blank(cols['primary_link']),
            'url': sorted(cols['all_links']),
            'categories': cols['categories'],
            'is_bold': any_or_none(cols["is_bold"]),
            'fandom': cols['fandom'],
            'fic_status': None,  # this column is entirely empty
            'title': first_or_blank(cols['title']),
            # Fixed: this used to be filled with the raw title instead of the merge key
            'title_str': title_str,
            'is_coffee': any_or_none(cols["is_coffee"]),
            'fic_series': cols['fic_series'],
            'author': clean_authors,
            'length': max(cols["length"]) if cols["length"] else None,
            'is_complete': any_or_none(cols["is_complete"]),
            'is_subbed': any_or_none(cols["is_subbed"]),
            'is_backedup': any_or_none(cols["is_backedup"]),
            'is_bookmarked': any_or_none(cols["is_bookmarked"]),
            'in_category': None,  # this column is entirely empty
            'all_tags': sorted_tags,
            'current_chapter': max(cols["current_chapter"]) if cols["current_chapter"] else "",
            'fic_rating': avg_rating,
            'early_ratings': cols['early_ratings'],
            'to_read_description': cols['tr_descriptions'],
            # NOTE(review): derived from 'is_complete', exactly as before - looks like
            # a copy-paste; confirm whether a dedicated source column should be used.
            'is_finished_inputting_data': any_or_none(cols["is_complete"]),
            }

# Testing
# merge_fic(fic_text_df, 'allhallowsevenewyork')

In [9]:
# Code to merge s5 fic text: one merged dict per distinct title_str, in sorted order
title_strs = fic_text_df['title_str'].drop_duplicates().sort_values().to_list()
merged_rows = [merge_fic(fic_text_df, title_str) for title_str in title_strs]

merged_df = pd.DataFrame(merged_rows)

### Clean s5 fic text fandoms


In [10]:
# Load the fandom alias reference file
with open('reference_info/fandom_aliases.json', 'r') as alias_file:
    FANDOM_NAMES = json.load(alias_file)

In [11]:
# Cleaned fic fandoms
# Function to uncompact (split, dedup, order) fandom strings
def split_and_sort_fandoms(dict):
    """Split comma-joined fandom strings of every fic, in place.

    For each fic in `dict` (an iterable of fic dicts - the parameter name
    shadows the builtin but is kept for existing callers), fic['fandom'] is
    replaced with a sorted list of the unique, stripped fandom names.

    Returns None; the fic dicts are modified in place.
    """
    for fic in dict:
        clean_fandoms = {
            fandom.strip()
            for fandom_str in fic['fandom']
            for fandom in fandom_str.split(",")
        }
        # Drop blanks AFTER stripping, so "a, " or "a, ,b" cannot leave "" behind
        clean_fandoms.discard("")
        fic['fandom'] = sorted(clean_fandoms)

# Clean dict
split_and_sort_fandoms(merged_rows)

# Check for compacted fandoms: print a fic's fandom list once per entry that
# still contains a comma
for fic in merged_rows:
    for _ in (name for name in fic['fandom'] if ',' in name):
        print(fic['fandom'])

# Printing nothing means it worked

In [14]:
# Maps my own fandom aliases to fandom display names (AO3-style tag names where
# one exists; a few aliases simply map to themselves).
FANDOM_CONVERSION_2 = {
    "1/2_prince": "1/2 Wangzi | 1/2 Prince",
    "2ha": "二哈和他的白猫师尊 - 肉包不吃肉 | The Husky and His White Cat Shizun - Meatbun Doesn't Eat Meat",
    "arrow": "Arrow (TV 2012)",
    "artemis_fowl": "Artemis Fowl - Eoin Colfer",
    "assassins_creed": "Assassin's Creed - All Media Types",
    "ateez_band": "ATEEZ (Band)",
    "atla": "Avatar: The Last Airbender",
    "attack_on_titan": "Shingeki no Kyojin | Attack on Titan",
    "avatar": "Avatar (Cameron Movies)",
    "avengers": "Avengers (Marvel) - All Media Types",
    "batman": "Batman - All Media Types",
    "big_hero_6": "Big Hero 6 (2014)",
    "black_butler": "Kuroshitsuji | Black Butler",
    "black_panther": "Black Panther (2018)",
    "bleach": "Bleach",
    "bnha": "僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia",
    "books_of_the_raksura": "Books of the Raksura - Martha Wells",
    "brooklyn_99": "brooklyn_99",
    "bts": "방탄소년단 | Bangtan Boys | BTS",
    "captain_america": "Captain America - All Media Types",
    "captive_prince": "Captive Prince - C. S. Pacat",
    "castlevania": "Castlevania (Cartoon 2017-2021)",
    "chronicles_of_narnia": "Chronicles of Narnia - C. S. Lewis",
    "code_geass": "Code Geass",
    "criminal_minds": "Criminal Minds (US TV)",
    "damien": "Damien (TV)",
    "danny_phantom": "Danny Phantom",
    "daredevil": "Daredevil (TV)",
    "dark_angel": "Dark Angel (TV)",
    "dcu": "DCU",
    "deadpool": "Deadpool - All Media Types",
    "death_note": "Death Note (Anime & Manga)",
    "descendants": "Descendants (Disney Movies)",
    "detroit_become_human": "Detroit: Become Human (Video Game)",
    "downton_abbey": "Downton Abbey",
    "eyeshield_21": "Eyeshield 21",
    "fairy_tail": "Fairy Tail",
    "fantastic_four": "Fantastic Four",
    "fast&furious": "Fast & Furious (Movies)",
    "fbawtft": "Fantastic Beasts and Where to Find Them (Movies)",
    "final_fantasy_vii": "Final Fantasy VII",
    "final_fantasy_viii": "Final Fantasy VIII",
    "final_fantasy_xv": "Final Fantasy XV",
    "folklore": "Folklore",
    "frozen": "Frozen (Disney Movies)",
    "fullmetal_alchemist": "Fullmetal Alchemist - All Media Types",
    "game_of_thrones": "A Song of Ice and Fire & Related Fandoms",
    "good_omens": "Good Omens (TV)",
    "gotham": "Gotham (TV)",
    "gravity_falls": "Gravity Falls",
    "guardians_of_the_galaxy": "Guardians of the Galaxy (Movies)",
    "gundam_wing_ac": "Gundam Wing",
    "hamilton": "Hamilton - Miranda",
    "hannibal": "Hannibal (TV)",
    "harry_potter": "Harry Potter - Fandom",
    "hells_kitchen": "Hell's Kitchen",
    "highschool_of_the_dead": "Gakuen Mokushiroku | Highschool of the Dead",
    "httyd": "How to Train Your Dragon (Movies)",
    "hunger_games": "Hunger Games Series - All Media Types",
    "iron_man": "Iron Man (Movies)",
    "james_bond": "James Bond (Movies)",
    "john_wick": "John Wick (Movies)",
    "joy_of_life": "庆余年 | Joy of Life (TV)",
    "jurassic_park": "Jurassic Park - All Media Types",
    "justice_league": "Justice League - All Media Types",
    "k_anime": "K (Anime)",
    "katekyo_hitman_reborn": "Katekyou Hitman Reborn!",
    "kingsman": "Kingsman (Movies)",
    "kung_fu_panda": "Kung Fu Panda (Movies)",
    "kuroko_no_basuke": "Kuroko no Basuke | Kuroko's Basketball",
    "left4dead": "Left 4 Dead (Video Games)",
    "legend_of_korra": "Avatar: Legend of Korra",
    "leverage": "Leverage",
    "loki": "Loki (TV 2021)",
    "lord_of_the_rings": "The Lord of the Rings - All Media Types",
    "lucifer": "Lucifer (TV)",
    "magi_lom": "Magi: The Labyrinth of Magic",
    "magnus_files": "The Magnus Archives (Podcast)",
    "mcu": "MCU",
    "mdzs": "魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù",
    "megamind": "Megamind (2010)",
    "men_in_black": "men_in_black",
    "merlin": "Merlin (TV)",
    "minecraft": "Minecraft (Video Game)",
    "miraculous_ladybug": "Miraculous Ladybug",
    "mob_psycho": "モブサイコ100 | Mob Psycho 100",
    "monster_hunter": "Monster Hunter (Video Games)",
    "moon_knight": "Moon Knight (TV 2022)",
    "multiple_fandoms": "multiple_fandoms",
    "my_next_life_as_a_villainess": "乙女ゲームの破滅フラグしかない悪役令嬢に転生してしまった… - 山口悟 | My Next Life as a Villainess - Yamaguchi Satoru (Light Novels)",
    "naruto": "Naruto",
    "ncis": "NCIS",
    "nirvana_in_fire": "nirvana_in_fire",
    "none_unsorted": "none_unsorted",
    "once_upon_a_time": "Once Upon a Time (TV)",
    "one_piece": "One Piece",
    "one_punch_man": "ワンパンマン | One-Punch Man",
    "original_work": "Original Work",
    "ouran_hshc": "Ouran High School Host Club - All Media Types",
    "pacific_rim": "Pacific Rim (Movies)",
    "percy_jackson_olympians": "Percy Jackson and the Olympians & Related Fandoms - All Media Types",
    "person_of_interest": "Person of Interest (TV)",
    "phineas_and_ferb": "Phineas and Ferb",
    "pkmn_sword&shield": "Pocket Monsters: Sword & Shield | Pokemon Sword & Shield Versions",
    "pokemon": "Pocket Monsters | Pokemon (Anime)",
    "prince_of_tennis": "Tennis no Oujisama | Prince of Tennis",
    # Fixed: the value had the scraped page text "Navigation and Actions" fused onto its end
    "reincarnated_as_a_sword": "転生したら剣でした - 棚架ユウ | Tensei Shitara Ken Deshita | Reincarnated as a Sword - Tanaka Yuu",
    "rise_of_the_guardians": "Rise of the Guardians (2012)",
    "riverdale": "riverdale",
    "rwby": "RWBY",
    "sherlock": "Sherlock (TV)",
    "six_of_crows": "Six of Crows Series - Leigh Bardugo",
    "smallville": "Smallville",
    "solo_levelling": "나 혼자만 레벨업 | Solo Leveling (Webcomic)",
    "soul_eater": "Soul Eater (Anime & Manga)",
    "spiderman": "Spider-Man - All Media Types",
    "star_wars": "Star Wars - All Media Types",
    "star_wars_cw": "Star Wars: Clone Wars (2003) - All Media Types",
    "stargate": "Stargate - All Media Types",
    "stargate_atlantis": "Stargate Atlantis",
    "stranger_things": "Stranger Things (TV 2016)",
    "suicide_squad": "Suicide Squad (Movies 2016 2021)",
    "supernatural": "Supernatural",
    "svsss": "人渣反派自救系统 - 墨香铜臭 | The Scum Villain's Self-Saving System - Mòxiāng Tóngxiù",
    "sword_art_online": "Sword Art Online (Anime & Manga)",
    "tangled": "Tangled (2010)",
    "teen_wolf": "Teen Wolf (TV)",
    "temeraire": "Temeraire - Naomi Novik",
    "the_100": "The 100 (TV)",
    "the_arrow": "Arrow (TV 2012)",
    "the_croods": "The Croods (Movies)",
    "the_flash": "The Flash (Comics)",
    "the_hobbit": "The Hobbit - All Media Types",
    "the_kings_avatar": "全职高手 | The King's Avatar (Cartoon)",
    "the_last_of_us": "The Last of Us (Video Games)",
    "the_song_of_achillles": "The Song of Achilles - Madeline Miller",
    "the_witcher": "The Witcher (TV)",
    "thor": "Thor - All Media Types",
    "tiger&bunny": "Tiger & Bunny",
    "tokyo_ghoul": "Tokyo Ghoul",
    "torchwood": "Torchwood",
    "transformers": "Transformers - All Media Types",
    "travelers": "Travelers (TV)",
    "twilight": "Twilight Series - All Media Types",
    "umbrella_academy": "The Umbrella Academy (TV)",
    "vampire_hunter_d": "Vampire Hunter D (Anime & Manga)",
    "venom": "Venom (Marvel Movies)",
    "voltron": "Voltron: Legendary Defender",
    "wicked": "Wicked - All Media Types",
    "winx_club": "Winx Club",
    "xmen": "X-Men - All Media Types",
    "yona_of_the_dawn": "Akatsuki no Yona | Yona of the Dawn",
    "young_hercules": "Young Hercules",
    "young_justice": "Young Justice - All Media Types",
    "yuuri_on_ice": "Yuri!!! on Ice (Anime)",
    "zootopia": "Zootopia (2016)",
}

In [15]:
# Function & code to convert my fandoms to ao3 fandoms
def convert_my_fandoms_to_ao3_fandoms(dict):
    """Replace my fandom aliases with their FANDOM_CONVERSION_2 names, in place.

    For each fic in `dict` (an iterable of fic dicts - the parameter name
    shadows the builtin but is kept for existing callers), every fandom that is
    a known alias (in FANDOM_NAMES) AND has a conversion entry is replaced;
    anything else is kept unchanged. fic['fandom'] ends up sorted.

    Returns None; the fic dicts are modified in place.
    """
    for fic in dict:
        fic['fandom'] = sorted(
            # Require the alias in BOTH tables: an alias without a conversion entry
            # used to raise KeyError here; it is now passed through unchanged
            FANDOM_CONVERSION_2[fandom]
            if fandom in FANDOM_NAMES and fandom in FANDOM_CONVERSION_2
            else fandom
            for fandom in fic['fandom']
        )

convert_my_fandoms_to_ao3_fandoms(merged_rows)

In [17]:
# Collect aliases that are known (in FANDOM_NAMES) but have no conversion entry
s2 = set()
for fic in merged_rows:
    # Debug aid: show any fic still carrying the raw 'avatar' alias
    for _ in range(fic['fandom'].count('avatar')):
        print(fic)
    s2.update(
        name for name in fic['fandom']
        if name in FANDOM_NAMES and name not in FANDOM_CONVERSION_2
    )

sorted(s2)

[]

In [18]:
# Write to JSON (the merged, fandom-cleaned fic text rows)
with open('clean_data_5/fic_text.json', 'w') as json_out:
    json.dump(merged_rows, json_out)

## (2) s5 fic text+url merging
- **Problem:** There are many duplicates across s4 fic url & s4 fic text, want to merge them
- **Thus:** Will merge them
- **Desired Result:** JSON with each dict only referencing one fic

In [118]:
# Constants
# Same values as the definitions at the top of the notebook (presumably repeated
# so this section can be run on its own - confirm before removing).
# Sort rank per fic location code (lower sorts first).
LOCATION_ORDER = {'ao3': 1,
                  'ffn_net': 2,
                  'tum': 3,
                  'drm': 4,
                  'lvj': 5,
                  'fcb': 6,
                  'wat': 7,
                  'pdf': 9,
                  'oth': 100,
                 }

# Sort rank per database type (lower sorts first).
DTB_ORDER = {'coffee': 1,
            'read': 2,
            'cont_read': 3,
            'to_read': 4}

In [119]:
# Load in relevant files
with open('clean_data_5/fic_text.json', 'r') as text_file:
    text_unmerged = json.load(text_file)

with open('clean_data_5/fic_url.json', 'r') as url_file:
    url_unmerged = json.load(url_file)

# Sort dicts by title_str (in place; the lists were just loaded)
text_unmerged.sort(key=lambda fic: fic['title_str'])
url_unmerged.sort(key=lambda fic: fic['title_str'])

### Clean s5 fic text & url data
- Clean duplicates
- Standardizing key names

In [120]:
# (already done) Add title_str column to both s5 fic url & text
# for fic in url_unmerged: 
#     fic['title_str'] = strip_title(fic['title'])

# for fic in text_unmerged: 
#     fic['title_str'] = strip_title(fic['title'])

In [121]:
# Code to switch keys in dicts (to standardize key names of shared data kinds)
swaps = [
    {'original': 'primary_versions', 'new': 'primary_version'},
    {'original': 'location', 'new': 'locations'},
    {'original': 'smk_source', 'new': 'smk_sources'},
    {'original': 'dtb_type', 'new': 'dtb_types'},
    {'original': 'primary_loction', 'new': 'primary_location'},
    {'original': 'primary_link', 'new': 'primary_url'},
    {'original': 'all_links', 'new': 'all_urls'},
    {'original': 'url', 'new': 'all_urls'},
    {'original': 'version_num', 'new': 'version_nums'},
]

# Apply each rename to the url fics first, then to the text fics
for swap in swaps:
    ORIGINAL, NEW = swap['original'], swap['new']

    for fics in (url_unmerged, text_unmerged):
        for fic in fics:
            if ORIGINAL in fic:
                # Move the value under the new key and drop the old key
                fic[NEW] = fic.pop(ORIGINAL)
        

In [122]:
# Check - Keys only in url dict (compares the first fic of each list)
url_only_keys = [key for key in url_unmerged[0] if key not in text_unmerged[0]]
for key in url_only_keys:
    print(key)

# Should print a few key names (these are not in text dict)

work_type
fic_id
all_ratings
fandom_type
ffn_date_added
ffn_date_updated_2-4-23
readability_status
to_read_rating
error


In [123]:
# Check - Keys only in text dict (compares the first fic of each list)
text_only_keys = [key for key in text_unmerged[0] if key not in url_unmerged[0]]
for key in text_only_keys:
    print(key)

# Should print a few key names (these are not in url dict)

length
early_ratings


### Clean s5 fic text & url data
- combining ao3/ao3 and ao3/ffn duplicates

In [124]:
# title_strs that belong to more than one fic (all of them happened to be in
# url_unmerged, none in text_unmerged).
# This list was collected by running "Code to merge all s5 fic text + url";
# that run is intensive, so the result is saved here instead of being recomputed.
doubles = ['achildavenged', 'adropofpoison', 'anacttoooftenneglected', 'babyitscoldoutside', 'bestlaidplans', 'byanyothername', 
            'catalysis', 'dusktodawn', 'everythingcanchange', 'familybonds', 'foolmeonce', 'foradventureandchaos', 
            'gaslightingthenecronomicon', 
            'guillotine', 'heavensgarden', 'herphantom', 'iholdwiththosewhofavorfire', 'imaginaryfriend', 'intertwined', 'learningtofly', 
            'lostboys', 'loyalty', 'nest', 'practicemakesperfect', 'retrograde', 'rumorhasit', 'scorched', 'seagullsandbeetles',
            'solstice', 'somewheretobelong', 'spadesonscreen', 'standards', 'tenyearsdifference', 'thegamehasonlyjustbegun', 
            'thelivesworthsaving', 'thenightthehouseofcardswasbuilt', 'tobuildahome', 'unexpected', 'wanderer'
          ]

In [125]:
# Get formatted doubles: pair every doubled title_str with the indexes of the
# url_unmerged entries that carry it
diff_doubles = list(doubles)
ind_doubles = [
    [i for i, fic in enumerate(url_unmerged) if fic['title_str'] == ts]
    for ts in doubles
]

pairs = list(zip(diff_doubles, ind_doubles))

# Print pairs to manually determine which are duplicates
for ts, inds in pairs:
    print(ts)
    for i in inds:
        entry = url_unmerged[i]
        print(f"TITLE ({i}): {entry['title']}")
        print(f"AUTHOR: {entry['author']}")
        print(f"FANDOM: {entry['fandom']}")
        print()
    print('-------\n')



achildavenged
TITLE (46): A Child Avenged
AUTHOR: ['pandaswearglasses']
FANDOM: ['Avengers (Marvel) - All Media Types', 'Harry Potter - Fandom']

TITLE (47): A Child Avenged
AUTHOR: ['PandasWearGlasses']
FANDOM: ['Harry Potter - Fandom']

-------

adropofpoison
TITLE (63): A Drop of Poison
AUTHOR: ['Androgyninja']
FANDOM: ['Naruto']

TITLE (64): A Drop of Poison
AUTHOR: ['Angel of Snapdragons']
FANDOM: ['Naruto']

-------

anacttoooftenneglected
TITLE (128): An Act Too Often Neglected
AUTHOR: ['thehoyden']
FANDOM: ['X-Men - All Media Types']

TITLE (129): an act too often neglected
AUTHOR: ['Ariaste']
FANDOM: ['陈情令 | The Untamed (TV)', '魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù']

-------

babyitscoldoutside
TITLE (253): Baby, It's Cold Outside
AUTHOR: ['Fweeble']
FANDOM: ['Tokyo Ghoul']

TITLE (254): Baby It's Cold Outside
AUTHOR: ['Defective_Avian']
FANDOM: ['Batman - All Media Types', 'DCU (Comics)']

-------

bestlaidplans
TITLE (285): Best Laid Plans
AUTHOR: ['esama']
FANDOM: ['

In [126]:
def merge_values(d1, d2, key):
    """
    Takes two dicts and one key name (str).
    Returns one list with all the values of the two given dicts at the key
    combined, de-duplicated, and sorted. A non-list value is treated as a
    one-element list; None, NaN and "" are dropped.
    Returns None when the key is in neither dict.
    """
    if key not in d1 and key not in d2:
        return None

    v1 = d1.get(key, [])
    v2 = d2.get(key, [])

    # Wrap scalars so both sides can be concatenated
    v1 = v1 if isinstance(v1, list) else [v1]
    v2 = v2 if isinstance(v2, list) else [v2]

    dedup = set(v1 + v2)
    cleaned = [ele for ele in dedup if ele is not None and not pd.isna(ele) and ele != ""]

    return sorted(cleaned)

def merge_dicts(d1, d2):
    """
    Takes 2 fic dicts.
    Returns one dict (keys in sorted order) with all values for d1 & d2 combined
    in list form.
    For s5 fic url, keys NOT list: 'primary_version', 'primary_location', 'work_type', 'title', 'fic_id', 'num_appeared',
        'primary_dtb', 'fic_rating', 'title_str', 'primary_url'
    Scalar keys take the first (smallest) merged value, or None when there is none.
    """
    # Combine values from d1 & d2 for all keys.
    # Fixed: this used the notebook globals fic1/fic2 and an undefined
    # combine_dedup() instead of the arguments and merge_values().
    sorted_keys = sorted(d1.keys() | d2.keys())
    res = {key: merge_values(d1, d2, key) for key in sorted_keys}

    not_list_keys = ['primary_version', 'primary_location', 'work_type', 'title', 'primary_dtb', 'title_str', 'primary_url']
    for key in not_list_keys:
        values = res.get(key) or []
        res[key] = values[0] if values else None

    # Add the raw counts: summing the merged list would count equal values only
    # once, because merge_values() de-duplicates them
    res['num_appeared'] = (d1.get('num_appeared') or 0) + (d2.get('num_appeared') or 0)
    # Prefer the id of the ao3 copy
    res['fic_id'] = d1.get('fic_id') if d1.get('primary_location') == 'ao3' else d2.get('fic_id')
    # NOTE(review): distinct ratings are summed, not averaged - confirm this is intended
    ratings = res.get('fic_rating') or []
    res['fic_rating'] = sum(ratings) if ratings else None
    return res

# Testing: merge two url_unmerged entries by hard-coded index (1036 / 1038 are the
# indexes listed for 'intertwined' in dups_to_merge further down)
fic1 = url_unmerged[1036]
fic2 = url_unmerged[1038]
merge_dicts(fic1, fic2)

{'all_ratings': [],
 'all_tags': [],
 'all_urls': ['https://archiveofourown.org/works/14693193',
  'https://archiveofourown.org/works/14693193/chapters/33951429',
  'https://www.fanfiction.net/s/10599176/1/Intertwined'],
 'author': ['BloodyWar2411'],
 'categories': [],
 'current_chapter': [],
 'dtb_types': ['cont_read', 'read'],
 'fandom': ['Fullmetal Alchemist - All Media Types'],
 'fandom_type': [],
 'ffn_date_added': ['4/2/18'],
 'ffn_date_updated_2-4-23': ['8/5/20'],
 'fic_id': 14693193,
 'fic_rating': None,
 'fic_series': [],
 'fic_status': [],
 'in_category': [],
 'is_backedup': [],
 'is_bold': [True],
 'is_bookmarked': [],
 'is_coffee': [],
 'is_complete': [],
 'is_finished_inputting_data': [False],
 'is_subbed': [],
 'length': [],
 'locations': ['ao3', 'ffn_net'],
 'num_appeared': 5,
 'primary_dtb': 'read',
 'primary_location': 'ao3',
 'primary_source': ['v7_updates', 'v8_ffn_net_account_fic'],
 'primary_url': 'https://archiveofourown.org/works/14693193',
 'primary_version': 7.

In [127]:
# Code to merge duplicates, remove originals, and add merged
# Each entry is [title_str, index 1, index 2]: the two url_unmerged positions
# (list sorted by title_str) that hold copies of the same fic.
dups_to_merge = [['intertwined', 1036, 1038], 
                    ['loyalty', 1229, 1231],
                    ['achildavenged', 46, 47],
                    ['dusktodawn', 586, 587],
                    ['everythingcanchange', 616, 617],
                    ['familybonds', 645, 646],
                    ['foradventureandchaos', 706, 707],
                    ['gaslightingthenecronomicon', 763, 764],
                    ['heavensgarden', 857, 858],
                    ['herphantom', 880, 881],
                    ['imaginaryfriend', 1001, 1002],
                    ['nest', 1335, 1336],
                    ['scorched', 1701, 1702],
                    ['seagullsandbeetles', 1708, 1709],
                    ['spadesonscreen', 1813, 1814],
                    ['standards', 1832, 1833],
                    ['tenyearsdifference', 1936, 1937],
                    ['thegamehasonlyjustbegun', 2025, 2026],
                    ['thelivesworthsaving', 2067, 2068],
                    ['thenightthehouseofcardswasbuilt', 2096, 2097],
                    ['wanderer', 2420, 2421],
                     ['catalysis', 398, 399],
                ]

# Checker: length before merging, compared against the length afterwards
pre_len = len(url_unmerged)

# Merge dicts
merged_dicts = []
inds_remove = []
for title_str, i1, i2 in dups_to_merge:
    first, second = url_unmerged[i1], url_unmerged[i2]

    # Check getting right fics to merge (only reported, the merge still happens)
    if not (first['title_str'] == title_str and second['title_str'] == title_str):
        print(f"ERROR: '{title_str}', ({i1}) {first['title_str']}, ({i2}) {second['title_str']}!")

    merged_dicts.append(merge_dicts(first, second))

    # Remember both originals for removal
    inds_remove.extend((i1, i2))

# Add new merged dicts to url_unmerged (appended at the end, so the indexes
# collected above stay valid)
url_unmerged.extend(merged_dicts)

# Remove original fics (that created merged one), highest index first so the
# remaining indexes do not shift
inds_remove = sorted(inds_remove, reverse=True)

for ind in inds_remove:
    del url_unmerged[ind]

In [128]:
# Checking it worked: every merged pair replaces two entries with one, so the
# list should have shrunk by exactly len(dups_to_merge)
f"# dups: {len(dups_to_merge)} | pre-len: {pre_len} | post-len: {len(url_unmerged)} (should be: {pre_len-len(dups_to_merge)})"

'# dups: 22 | pre-len: 2620 | post-len: 2598 (should be: 2598)'

In [129]:
# # Checker - make sure right fics are adding & removed
# dups_strs = [dup[0] for dup in dups_to_merge]

# for name in dups_strs:
#     print(name)
#     for i, fic in enumerate(url_unmerged):
#         if fic['title_str'] == name:
#             print(f"TITLE ({i}): {fic['title']}")
#             print(f"AUTHOR: {fic['author']}")
#             print(f"FANDOM: {fic['fandom']}")
#             print('-------') 
#     print()

# Should print 

In [130]:
# Add numbers to the end of the title_str of duplicates
# Get title_str with more than 1 instance (blank title_strs are ignored)
all_ts = [fic['title_str'] for fic in url_unmerged if fic['title_str'] != ""]
ts_count = Counter(all_ts)
still_doubles = [ts for ts, n_seen in ts_count.items() if n_seen > 1]

# Add _num tag on end of duplicates (numbered 1, 2, ... in list order)
for title_str in still_doubles:
    matches = (fic for fic in url_unmerged if fic['title_str'] == title_str)
    for count, fic in enumerate(matches, start=1):
        fic['title_str'] = f"{title_str}_{count}"

In [131]:
# Checker - make sure all title_strs are unique
# (counts every title_str, including the blank ones skipped by the renaming step)
all_ts = [fic['title_str'] for fic in url_unmerged]
Counter(all_ts)

# Only the blank title_str '' should appear more than once

Counter({'': 15,
         '10reasonsnottofightadragonbirdaguidebyarthurpendragon': 1,
         '10thingsihateaboutdatingatgusuacademy': 1,
         '1861c100s60': 1,
         '30hoursonmillbillillie': 1,
         '4018': 1,
         '50heartbeatsawayfromromance': 1,
         '50waystoloseanarmandalegbyedwardelric': 1,
         '5timesjasontookcareoftimandonetimetimtookcareofjason': 1,
         '5timeskurokowishedheneveropenedhisdamnmouth': 1,
         '5timespetermadetonylaughoutloud': 1,
         '5timespeterpretendedtobetougherthanhewas': 1,
         '5timesspidermansavedanavengersassand1timetheysavedhim': 1,
         '5timestimspendsthenightatwaynemanor1timehecomeshome': 1,
         '5timestonydidntneedtoworryaboutpeter': 1,
         '5timestonyforgotpeterwasjustakid': 1,
         '5timestonyhadtoomuchmoneyandonetimepeterdid': 1,
         '60years': 1,
         'abeatingheartofstone': 1,
         'abirdinthehand': 1,
         'aboutdamageaboutpainaboutheartache': 1,
         'absolu

### Combining s5 fic url + text

In [132]:
def clean_authors(author_list):
    """Collapse author names that differ only by letter case.

    Args:
        author_list: list of author name strings.

    Returns:
        One name per case-insensitive group, in order of first appearance.
        Within a group the last spelling containing an uppercase letter is
        preferred; if there is none, the lower-cased name is returned.
    """
    # A dict keeps insertion order, so the result is deterministic (de-duplicating
    # through a set made the output order vary between runs)
    best_by_lower = {}
    for author in author_list:
        lowered = author.lower()
        best_by_lower.setdefault(lowered, lowered)
        if any(char.isupper() for char in author):
            best_by_lower[lowered] = author

    return list(best_by_lower.values())

# Normalise the author list of every fic in both collections, in place
for fic in url_unmerged:
    fic['author'] = clean_authors(fic['author'])

for fic in text_unmerged:
    fic['author'] = clean_authors(fic['author'])


In [133]:
url_unmerged

[{'primary_version': 8.0,
  'version_nums': [8.0],
  'primary_source': [],
  'smk_sources': ['v8_local_files'],
  'primary_location': 'ao3',
  'locations': ['ao3'],
  'work_type': 'fic',
  'fandom': [],
  'title': '',
  'author': [],
  'fic_id': 13142217,
  'num_appeared': 1,
  'primary_dtb': '',
  'dtb_types': [],
  'fic_series': [],
  'is_bold': [],
  'is_coffee': [],
  'is_complete': [],
  'is_subbed': [],
  'is_bookmarked': [],
  'is_finished_inputting_data': [],
  'is_backedup': [],
  'categories': [],
  'current_chapter': [],
  'all_tags': [],
  'fic_rating': None,
  'all_ratings': [],
  'fandom_type': [],
  'ffn_date_added': [],
  'ffn_date_updated_2-4-23': [],
  'fic_status': [],
  'in_category': [],
  'readability_status': ['not_found'],
  'to_read_description': [],
  'to_read_rating': [],
  'error': 'InvalidIdError',
  'title_str': '',
  'primary_url': 'https://archiveofourown.org/works/13142217',
  'all_urls': ['https://archiveofourown.org/works/13142217']},
 {'primary_versi

In [151]:
# title_strs that matched more than one fic on either side (filled by match_fics)
doubles = []
def match_fics(text_unmerged, url_unmerged, title_str):
    """Look up the text fic and the url fic that share `title_str`.

    Returns:
        * None (after printing an error and recording the title_str in
          `doubles`) when either list holds more than one fic with it;
        * [text_fic, url_fic], with {} for the missing side, when only one of
          the lists has the fic;
        * None when both sides have exactly one fic - in that case a note is
          printed if both have authors and the author sets differ.

    Raises:
        Exception: when neither list has a fic with the given title_str.
    """
    text_hits = [fic for fic in text_unmerged if fic['title_str'] == title_str]
    url_hits = [fic for fic in url_unmerged if fic['title_str'] == title_str]

    # Error & null case handling
    if max(len(text_hits), len(url_hits)) > 1:
        print(f"\nERROR: title_str: {title_str} has {len(text_hits)} fics in text_unmerged and {len(url_hits)} fics in url_unmerged!")
        doubles.append(title_str)
        return
    if not text_hits and not url_hits:
        raise Exception(f"ERROR: not fic found in either text or url dict with title_str: {title_str}")

    # Set dict variables
    text = text_hits[0] if text_hits else {}
    url = url_hits[0] if url_hits else {}

    # No need to check matches if only one fic
    if not text_hits or not url_hits:
        return [text, url]

    def lowered_authors(fic):
        # Sorted, unique, lower-cased author names (blank names ignored)
        return sorted({author.lower() for author in fic['author'] if author != ""})

    text_authors = lowered_authors(text)
    url_authors = lowered_authors(url)

    # Only report a mismatch when both sides actually list authors
    if text_authors and url_authors and text_authors != url_authors:
        print(f'{title_str} (text, url), authors: {text_authors}, {url_authors}')
        print()
        


unmerge: 
- antithesis ['cywscross', 'oceanbreeze7'], 
- basicinstincts (text, url), authors: ['miss_lv'], ['manic_intent']
- bite (text, url), authors: ['blackkat', 'esama'], ['blackkat']


In [153]:
# Author-name corrections: each entry pairs a spelling to replace ('wrong')
# with the spelling to keep ('right').
# NOTE(review): the 'RIGHT'/'WRONG' rows look like unfilled placeholders — confirm
# whether they should be filled in or removed before this list is applied anywhere.
misspellings = [{'right': 'amournfulhowlinthenight', 'wrong': 'amournfulhowlinthedark'},
                {'right': 'kitsune foxfire', 'wrong': 'kitsune firefox'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
                {'right': 'RIGHT', 'wrong': 'WRONG'},
               ]

Strip whitespace
Split pseudonyms into two authors

- ariadne83 somehowunbroken
- drelfina evocates -> drelfina, evocates
- mikimoo pentapus
- firefright pentapus
- aventria iluxia ->
- aventriailuxia -> aventria iluxia
- PSEUDONYM: ceiaofsilence
- PSEUDONYM: tsume_yuki, tsume yuki
- PSEUDONYM: ['luki', 'luki (kelpiecodyne)']
- lala_the_lessermisminorqwerty224 -> lala_the_lesser misminorqwerty224

In [152]:
# MATCH FIC TESTING
# Gather every title_str seen in either list (url fics that carry an 'error' key are left out)
url_title_strs = [entry['title_str'] for entry in url_unmerged if 'error' not in entry]
text_title_strs = [entry['title_str'] for entry in text_unmerged]
all_title_strs = sorted(set(url_title_strs) | set(text_title_strs))

# Run the author comparison for each title; mismatches are printed by match_fics
merged_fics = []
for title_str in all_title_strs:
    match_fics(text_unmerged, url_unmerged, title_str)

adoptedbydefault (text, url), authors: ['kitsune firefox'], ['kitsune firefox', 'kitsune foxfire']

afterthestorm (text, url), authors: ['ariadne83', 'somehowunbroken'], ['ariadne83somehowunbroken']

ahundredspringsfloweasttowardsthesea (text, url), authors: ['drelfina', 'evocates'], ['drelfina evocates']

anotherperfectcatastrophe (text, url), authors: ['mikimoo', 'pentapus'], ['mikimoopentapus']

antithesis (text, url), authors: ['cywscross', 'oceanbreeze7'], ['oceanbreeze7']

anunexpectedfind (text, url), authors: ['firefright', 'pentapus'], ['firefrightpentapus']

basicinstincts (text, url), authors: ['miss_lv'], ['manic_intent']

becomingthephoenix (text, url), authors: ['branch', 'lodestone'], ['branch']

bite (text, url), authors: ['blackkat', 'esama'], ['blackkat']

blue (text, url), authors: ['wynnebat'], ['bealeciphers']

brothers (text, url), authors: ['heartslogos'], ['grumpyhedgehogs']

cardiacarrest (text, url), authors: ['amournfulhowlinthedark'], ['amournfulhowlinthenig

In [26]:
def combine_dedup(text_unmerged, url_unmerged, colName, debug=False):
    """
    Pools the values stored under colName in two fic dicts into one sorted,
    de-duplicated list.

    A value that is not already a list is treated as a one-element list.
    None, NaN and empty-string entries are dropped before de-duplication.

    Args:
        text_unmerged (dict): text-sourced fic (may be {}).
        url_unmerged (dict): url-sourced fic (may be {}).
        colName (str): key to read from both dicts.
        debug (bool): when True, prints each intermediate list.

    Returns:
        Sorted list of the unique remaining values, or None when colName is a
        key of neither dict.
    """
    in_text = colName in text_unmerged
    in_url = colName in url_unmerged
    if not (in_text or in_url):
        return None

    text = text_unmerged[colName] if in_text else []
    url = url_unmerged[colName] if in_url else []
    if debug:
        print(f"{colName} (text, url): ", text, url)

    # Wrap scalars so both sides can be concatenated
    if not isinstance(text, list):
        text = [text]
    if not isinstance(url, list):
        url = [url]

    combined_list = text + url
    if debug:
        print(f"- combined_list: {combined_list}")

    # Drop missing / blank entries
    cleaned_list = []
    for item in combined_list:
        if item is None or pd.isna(item) or item == "":
            continue
        cleaned_list.append(item)
    if debug:
        print(f"- cleaned_list: {cleaned_list}")

    dedup_list = list(set(cleaned_list))
    if debug:
        print(f"- dedup_list: {dedup_list}")

    dedup_list.sort()
    return dedup_list
    
def bool_clean(text_unmerged, url_unmerged, colName):
    """
    Reduces the boolean flags stored under colName in two fic dicts to one value.

    Args:
        text_unmerged (dict): text-sourced fic (may be {}).
        url_unmerged (dict): url-sourced fic (may be {}).
        colName (str): key of the flag column to read from both dicts.

    Returns:
        True if any remaining flag is truthy, False if flags exist but none is
        truthy, None when there are no flags at all.
    """
    clean_bools = combine_dedup(text_unmerged, url_unmerged, colName)
    # combine_dedup returns None when neither dict has the column; treat that
    # like an empty list instead of failing on len(None).
    if not clean_bools:
        return None
    return any(clean_bools)

doubles = []
def merge_fic(text_unmerged, url_unmerged, title_str):
    """
    Merges the text-sourced and url-sourced records of one fic into a single dict.

    Args:
        text_unmerged (list of dict): text-sourced fics, each with a 'title_str' key.
        url_unmerged (list of dict): url-sourced fics, each with a 'title_str' key.
        title_str (str): stripped title identifying the fic to merge.

    Returns:
        dict with the merged fields, or None when title_str matches more than one
        fic in either list (title_str is then appended to the global `doubles`).

    Raises:
        Exception: if no fic in either list has the given title_str.
        KeyError: if a dtb type or location is missing from DTB_ORDER / LOCATION_ORDER.
    """
    # Get relevant fics from url & text dicts
    text = [fic for fic in text_unmerged if fic['title_str'] == title_str]
    url = [fic for fic in url_unmerged if fic['title_str'] == title_str]

    # Error & null case handling
    if len(text) > 1 or len(url) > 1:
        error_message = f"\nERROR: title_str: {title_str} has {len(text)} fics in text_unmerged and {len(url)} fics in url_unmerged!"
        print(error_message)
        doubles.append(title_str)
        return None
    if not text and not url:
        raise Exception(f"ERROR: no fic found in either dict with title_str: {title_str}")

    # Set dict variables ({} stands in for a missing side)
    text = text[0] if text else {}
    url = url[0] if url else {}

    # Cleaning relevant keys.
    # combine_dedup returns None when a key is absent from both dicts, so fall
    # back to [] wherever the result is sorted or measured.
    clean_dtbs = combine_dedup(text, url, 'dtb_types') or []
    sorted_dtbs = sorted(clean_dtbs, key=lambda dtb: DTB_ORDER[dtb])

    clean_ratings = combine_dedup(text, url, 'fic_rating') or []
    avg_rating = round(sum(clean_ratings) / len(clean_ratings), 2) if clean_ratings else None

    clean_locs = combine_dedup(text, url, 'locations') or []
    sorted_locations = sorted(clean_locs, key=lambda loc: LOCATION_ORDER[loc])

    clean_sources = combine_dedup(text, url, 'smk_sources') or []
    # Sources look like 'v8_local_files'; order them by the number after the leading 'v'
    sorted_sources = sorted(clean_sources, key=lambda source: float(source.split('_')[0][1:]))

    # Read the version numbers themselves (this previously read 'in_category',
    # which put category data into 'version_nums' / 'primary_versions').
    sorted_versions = combine_dedup(text, url, 'version_nums') or []

    sorted_urls = combine_dedup(text, url, 'all_urls') or []
    primary_url = sorted_urls[0] if sorted_urls else None
    # A fic_id on the url side takes precedence: build the AO3 work URL from it
    if 'fic_id' in url:
        primary_url = "https://archiveofourown.org/works/" + str(url['fic_id'])

    # Prefer the url-side title; fall back to the text side (None if that is missing too)
    url_title = url.get('title')
    title = url_title if url_title else text.get('title')

    num_appeared_total = text.get('num_appeared', 0) + url.get('num_appeared', 0)
    num_appeared_total = num_appeared_total if num_appeared_total != 0 else None

    total = {'all_ratings': url.get('all_ratings', None),
            'all_tags': combine_dedup(text, url, 'all_tags'),
            'all_urls': sorted_urls,
            'author': combine_dedup(text, url, 'author'),
            'categories': combine_dedup(text, url, 'categories'),
            'current_chapter': combine_dedup(text, url, 'current_chapter'),
            'dtb_types': sorted_dtbs,
            'early_ratings': text.get('early_ratings', None),
            'fandom': combine_dedup(text, url, 'fandom'),
            'fandom_type': url.get('fandom_type', None),
            'ffn_date_added': url.get('ffn_date_added', None),
            'ffn_date_updated_2-4-23': url.get('ffn_date_updated_2-4-23', None),
            'fic_id': url.get('fic_id', None),
            'fic_rating': avg_rating,
            'fic_series': combine_dedup(text, url, 'fic_series'),
            'fic_status': combine_dedup(text, url, 'fic_status'),
            'in_category': combine_dedup(text, url, 'in_category'),
            'is_backedup': bool_clean(text, url, 'is_backedup'),
            'is_bold': bool_clean(text, url, 'is_bold'),
            'is_bookmarked': bool_clean(text, url, 'is_bookmarked'),
            'is_coffee': bool_clean(text, url, 'is_coffee'),
            'is_complete': bool_clean(text, url, 'is_complete'),
            'is_finished_inputting_data': bool_clean(text, url, 'is_finished_inputting_data'),
            'is_subbed': bool_clean(text, url, 'is_subbed'),
            'length': text.get('length', None),
            'locations': sorted_locations,
            'num_appeared_total': num_appeared_total,
            'num_appeared_text': text.get('num_appeared', None),
            'num_appeared_url': url.get('num_appeared', None),
            'primary_dtb': sorted_dtbs[0] if sorted_dtbs else None,
            'primary_location': sorted_locations[0] if sorted_locations else None,
            'primary_source': sorted_sources[0] if sorted_sources else None,
            'primary_url': primary_url,
            'primary_versions': sorted_versions[0] if sorted_versions else None,
            'readability_status': url.get('readability_status', None),
            'smk_sources': sorted_sources,
            'title': title,
            'title_str': title_str,
            'to_read_description': combine_dedup(text, url, 'to_read_description'),
            'to_read_rating': url.get('to_read_rating', None),
            'version_nums': sorted_versions,
            'work_type': url.get('work_type', None),
           }

    # Carry over the url-side error marker, if any
    if 'error' in url:
        total['error'] = url['error']

    # Progress indicator: one dot per merged fic
    print(".", end="")
    return total


# merge_fic(text_unmerged, url_unmerged, 'cestlavie')

In [27]:
# Code to merge all s5 fic text + url
# Every title_str seen in either list (url fics that carry an 'error' key are left out)
url_title_strs = [entry['title_str'] for entry in url_unmerged if 'error' not in entry]
text_title_strs = [entry['title_str'] for entry in text_unmerged]
all_title_strs = sorted(set(url_title_strs) | set(text_title_strs))

# merge_fic returns None for a duplicated title_str; that None is appended as-is
merged_fics = []
for title_str in all_title_strs:
    merged_fics.append(merge_fic(text_unmerged, url_unmerged, title_str))
print("SEE: variable 'doubles' to see a list of all title_strs of dups")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

# WORKSPACE

In [29]:
def fic_equal(row1, row2) -> bool:
    """
    Takes two fic or series rows from a database (df).
    Compares them to determine if they're the same fic.
    Returns a boolean of whether or not they're equal (bool).

    Each row must support lookup of 'title', 'author', 'location' and 'fandom';
    'author' and 'fandom' are comma-separated strings or null.

    Two rows are equal when all of the following hold:
    - titles match after lowercasing and stripping,
    - both fandoms are present and identical as sets (the pairs dcu/batman and
      spiderman/avengers are treated as interchangeable),
    - the author sets match, OR at least one side has no author and the
      locations match (a null location matches anything).
    """
    def _author_set(raw):
        # Strip BEFORE filtering so whitespace-only entries ("foo, ") are
        # dropped instead of becoming an empty-string author.
        if pd.isnull(raw):
            return set()
        names = (name.strip().lower() for name in raw.split(','))
        return {name for name in names if name}

    # Get titles boolean
    title1 = str(row1['title']).lower().strip()
    title2 = str(row2['title']).lower().strip()
    same_title = title1 == title2

    # Get authors boolean (strip, lower, not empty)
    author1 = _author_set(row1['author'])
    author2 = _author_set(row2['author'])
    same_author = author1 == author2
    absent_author = not author1 or not author2

    # Get locations boolean (a missing location never blocks a match)
    same_location = (row1['location'] == row2['location']) or \
        pd.isnull(row1['location']) or pd.isnull(row2['location'])

    # Get fandoms boolean (a missing fandom on either side means "not the same")
    same_fandoms = False
    if not pd.isnull(row1['fandom']) and not pd.isnull(row2['fandom']):
        fandom1 = {fan.strip() for fan in row1['fandom'].split(',')}
        fandom2 = {fan.strip() for fan in row2['fandom'].split(',')}
        diff = fandom1 ^ fandom2
        same_fandoms = (len(diff) == 0) or (diff == {'dcu', 'batman'}) or \
            (diff == {'spiderman', 'avengers'})

    return bool(same_title and same_fandoms and (same_author or (absent_author and same_location)))
