# FFN Analysis
- <b>Name:</b> Sofia Kobayashi
- <b>Date:</b> 12/10/2022
- <b>Description:</b> Helper functions for ffv9.1 Jupyter Notebook

#### <b><u>Functions Table of Contents</u></b>
1. [Cleaning & Incoming Data functions](#sec1)
1. [Display & Interface functions](#sec2)
1. [Search functions](#sec3)
1. [Meta-functions: testing & report](#sec4)
1. [Fic functions](#sec5)

1. [Misc. functions](#sec6)
1. [Single-use functions](#sec7)

In [15]:
# IMPORT STATEMENTS & GLOBAL VARIABLES
import re
import json
from datetime import datetime

# URL path segments (the part right after "archiveofourown.org/") that getTypeAndId accepts;
# getTypeAndId raises an Exception for any segment that is not listed here.
known_work_types = ["works","collections","series","users","tags","search","external_works","comments","chapters"]
# File names of the three MASTER json lists (read by reportMasters).
masterNoDupUrls = "MASTER_noDupURLs.json"
masterNoDupWorks = "MASTER_noDupWorks.json"
masterOthers = "MASTER_others.json"

<a id="sec1"></a>
## Cleaning & Incoming Data Functions

<u>Types of URLs</u>
1. works (regular, chapters, colWorks)
1. external works
1. users
1. collections 
1. search
1. tags


In [31]:
# Sample AO3 URLs for manually testing getTypeAndId (type & id extraction):
url_0 = 'https://archiveofourown.org/collections/WorksOfGreatQualityAcrossTheFandoms/works/29387814'
url_1 = 'https://archiveofourown.org/works/22269148/chapters/53178208'
url_2 = 'https://archiveofourown.org/users/miscellea/pseuds/The%20Feels%20Whale'
url_3 = 'https://archiveofourown.org/collections/TheCrackheadBible/works?commit=Sort+and+Filter&include_work_search%5Bfandom_ids%5D%5B%5D=3828398&page=6&utf8=%E2%9C%93&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Blanguage_id%5D=&work_search%5Bother_tag_names%5D=&work_search%5Bquery%5D=kudos%3A+%26gt%3B1000&work_search%5Bsort_column%5D=kudos_count&work_search%5Bwords_from%5D=25000&work_search%5Bwords_to%5D='
url_4 = 'https://archiveofourown.org/external_works/637417'
url_5 = 'https://archiveofourown.org/tags/Danny%20Phantom/works'
url_6 = "https://archiveofourown.org/collections/Clever_Crossovers_and_Fantastic_Fusions"
url_7 = "https://archiveofourown.org/works?commit=Sort+and+Filter&work_search%5Bsort_column%5D=kudos_count&work_search%5Bother_tag_names%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&tag_id=L%C3%A1n+Q%C7%90r%C3%A9n*s*M%C3%A8ng+Y%C3%A1o+%7C+J%C4%ABn+Gu%C4%81ngy%C3%A1o"
url_8 = "https://archiveofourown.org/works/search?utf8=%E2%9C%93&commit=Search&work_search%5Bquery%5D=&work_search%5Btitle%5D=Assembly+of+Pain%2C+Happiness%2C+%26+Feelings.&work_search%5Bcreators%5D=&work_search%5Brevised_at%5D=&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=&work_search%5Bsingle_chapter%5D=0&work_search%5Bword_count%5D=&work_search%5Blanguage_id%5D=&work_search%5Bfandom_names%5D=&work_search%5Brating_ids%5D=&work_search%5Bcharacter_names%5D=TommyInnit+%28Video+Blogging+RPF%29&work_search%5Brelationship_names%5D=&work_search%5Bfreeform_names%5D=&work_search%5Bhits%5D=&work_search%5Bkudos_count%5D=&work_search%5Bcomments_count%5D=&work_search%5Bbookmarks_count%5D=&work_search%5Bsort_column%5D=_score&work_search%5Bsort_direction%5D=desc#:~:text=Works%20List-,Assembly%20of%20Pain%2C%20Happiness%2C%20%26%20Feelings.,-by%20RandomlySane"
url_9 = "https://archiveofourown.org/chapters/747149?show_comments=true"
url_10 = "https://archiveofourown.org/collections/asoiaftimetraveltransmigration/works/29620161"
url_11 = "https://archiveofourown.org/bookmarks?commit=Sort+and+Filter&bookmark_search%5Bsort_column%5D=created_at&include_bookmark_search%5Brelationship_ids%5D%5B%5D=27817261&bookmark_search%5Bother_tag_names%5D=&bookmark_search%5Bother_bookmark_tag_names%5D=&bookmark_search%5Bexcluded_tag_names%5D=&bookmark_search%5Bexcluded_bookmark_tag_names%5D=&bookmark_search%5Bbookmarkable_query%5D=&bookmark_search%5Bbookmark_query%5D=&bookmark_search%5Blanguage_id%5D=&bookmark_search%5Brec%5D=0&bookmark_search%5Bwith_notes%5D=0&user_id=kyme"
url_12 = "https://archiveofourown.org/collections:TheCrackheadBible/15774906"

import re
def getTypeAndId(url):
    """Give an AO3 url. Returns a tuple with (type-of-work, work-id). Type = works, series, tags, etc.

    What comes back, by kind of URL:
    - search / filter pages ('works?', 'search?' or 'bookmarks?' in the url):
      ("search", everything after 'archiveofourown.org/')
    - colWork (collections/<colName>/works/<id>): ("collections:<colName>", id)
    - shorthand 'collections:<colName>/<id>': rewritten to the colWork form, then treated as a colWork
    - plain collection: ("collections", everything after 'collections/')
    - users: ("users", user name)
    - tags: ("tags", tag), with or without the trailing '/works'
    - any other type in known_work_types: (type, numeric id)

    Raises Exception if the type is not in known_work_types, or if type = 'collections' but the
    url is neither a colWork nor a collection.
    Raises IndexError if the url does not have the layout expected for its type.
    Depends on: re, global known_work_types"""
    # Search / filter pages have no single id, so the whole path + query string serves as the id
    if ("works?" in url) or ("search?" in url) or ("bookmarks?" in url):
        search = re.findall(r"archiveofourown\.org/(.+)", url)[0]
        return ("search", search)

    # Expand the 'collections:<colName>/<id>' shorthand into a regular colWork url
    if "collections:" in url:
        colName, wId = re.findall(r"archiveofourown\.org/collections:(.+)/(\d+)", url)[0]
        url = f'https://archiveofourown.org/collections/{colName}/works/{wId}'

    # Find work type = first path segment after the domain
    wType = re.findall(r"archiveofourown\.org/(\w+)/", url)[0]

    # Check if it's an unknown type
    if wType not in known_work_types:
        raise Exception(f'WorkType not in global variable known_work_types!\n- url: {url}\n- output type: {wType}')

    # *** FIND TYPE & ID ***
    if wType == "collections":
        # colWork: a work reached through a collection (different URL format)
        colWork = re.findall(r"archiveofourown\.org/collections/(\w+)/works/(\d+)", url)
        if colWork:
            colName, wId = colWork[0]
            return (f"{wType}:{colName}", wId)

        # Not a colWork, so it should be the collection itself
        collection = re.findall(r"archiveofourown\.org/collections/(.+)$", url)
        if not collection:
            raise Exception(f'type="collections", but not a colWork or collection!\n- url: {url}\n- output type: {wType}')
        return (wType, collection[0])

    if wType == "users":
        authorName = re.findall(r"archiveofourown\.org/users/(\w+)", url)[0]
        return (wType, authorName)

    if wType == "tags":
        # Prefer the '<tag>/works' form; fall back to a bare tag page so it no longer IndexErrors
        tag = (re.findall(r"archiveofourown\.org/tags/(.+)/works", url)
               or re.findall(r"archiveofourown\.org/tags/([^/?#]+)", url))[0]
        return (wType, tag)

    # Else, return type & idNum
    idNum = re.findall(r"archiveofourown\.org/\w+/(\d+)", url)[0]
    return (wType, idNum)

# for i in range(12):
#     print(getTypeAndId(url_12))

# getTypeAndId(url_8)

https://archiveofourown.org/works/search?utf8=%E2%9C%93&commit=Search&work_search%5Bquery%5D=&work_search%5Btitle%5D=Assembly+of+Pain%2C+Happiness%2C+%26+Feelings.&work_search%5Bcreators%5D=&work_search%5Brevised_at%5D=&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=&work_search%5Bsingle_chapter%5D=0&work_search%5Bword_count%5D=&work_search%5Blanguage_id%5D=&work_search%5Bfandom_names%5D=&work_search%5Brating_ids%5D=&work_search%5Bcharacter_names%5D=TommyInnit+%28Video+Blogging+RPF%29&work_search%5Brelationship_names%5D=&work_search%5Bfreeform_names%5D=&work_search%5Bhits%5D=&work_search%5Bkudos_count%5D=&work_search%5Bcomments_count%5D=&work_search%5Bbookmarks_count%5D=&work_search%5Bsort_column%5D=_score&work_search%5Bsort_direction%5D=desc#:~:text=Works%20List-,Assembly%20of%20Pain%2C%20Happiness%2C%20%26%20Feelings.,-by%20RandomlySane


('search',
 'works/search?utf8=%E2%9C%93&commit=Search&work_search%5Bquery%5D=&work_search%5Btitle%5D=Assembly+of+Pain%2C+Happiness%2C+%26+Feelings.&work_search%5Bcreators%5D=&work_search%5Brevised_at%5D=&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=&work_search%5Bsingle_chapter%5D=0&work_search%5Bword_count%5D=&work_search%5Blanguage_id%5D=&work_search%5Bfandom_names%5D=&work_search%5Brating_ids%5D=&work_search%5Bcharacter_names%5D=TommyInnit+%28Video+Blogging+RPF%29&work_search%5Brelationship_names%5D=&work_search%5Bfreeform_names%5D=&work_search%5Bhits%5D=&work_search%5Bkudos_count%5D=&work_search%5Bcomments_count%5D=&work_search%5Bbookmarks_count%5D=&work_search%5Bsort_column%5D=_score&work_search%5Bsort_direction%5D=desc#:~:text=Works%20List-,Assembly%20of%20Pain%2C%20Happiness%2C%20%26%20Feelings.,-by%20RandomlySane')

In [17]:
import os 

def combineToTxt(dirPath):
    """
    Takes a string-path to a directory full of TXT files. Function then combines all files into 1 TXT file
    named 'txtOutput_<mm-dd-yy>.txt' (written to the current working directory).
    Will only add all text to 1 file, no de-duping.
    Every line is stripped and re-terminated with exactly one newline, so a file whose last line has no
    trailing newline can no longer get glued onto the first line of the next file.
    Returns the name of the TXT file that was written.
    Depends on: datetime, get_all_files
    """
    # Get date (used in the output file name)
    now = datetime.now().strftime("%m-%d-%y")
    outName = f"txtOutput_{now}.txt"

    # Get all lines in all files of the given directory
    combined = []
    for file in get_all_files(dirPath):
        with open(f"{dirPath}/{file}", "r") as infile:
            # strip surrounding whitespace, then put back a single "\n" per line
            combined.extend(line.strip() + "\n" for line in infile)

    # Write to txt
    with open(outName, "w") as outfile:
        outfile.writelines(combined)

    return outName

In [30]:
import os 

def combineToJson(dirPath):
    """Takes a string-path to a directory full of TXT files & merges them into 1 JSON file.
    The JSON file holds one flat list with every line (whitespace-stripped) of every file, and is
    named 'jsonOutput_<mm-dd-yy>.json'. Returns that file name.
    Depends on: datetime, json, get_all_files"""
    # Output name carries today's date
    stamp = datetime.now().strftime("%m-%d-%y")
    outName = f"jsonOutput_{stamp}.json"

    # Collect the stripped lines of each file, in the order get_all_files lists them
    collected = []
    for name in get_all_files(dirPath):
        with open(f"{dirPath}/{name}", "r") as src:
            collected.extend(text.strip() for text in src)

    # Dump everything as a single JSON list
    with open(outName, "w") as dest:
        json.dump(collected, dest)

    return outName

# combineToJson("urlsOutput")

'jsonOutput_12-27-22.json'

In [19]:
import json
from datetime import datetime

def add_to_masterfiles(urlFile):
    """
    Takes ONE txt file of URLs, appends new urls (probably from a new reading list) to the 3 MASTER json files:
    MASTER_noDupURLs, MASTER_noDupWorks, MASTER_others. Pair with `combineToTxt(dirPath)` to convert whole
    folders of TXT files.

    - MASTER_others: lines that are not AO3 urls
    - MASTER_noDupURLs: AO3 urls, de-duplicated by exact url
    - MASTER_noDupWorks: [[type, id], url] pairs, de-duplicated by getTypeAndId(url)
    Each run also appends one '<Added: date time>' stamp string to every MASTER list, before the new entries.
    Blank lines are skipped, and a url repeated inside `urlFile` is only added once.

    Missing MASTER files are created (as empty lists) first. Nothing else is written until the whole
    `urlFile` has been read, so an exception from getTypeAndId leaves existing MASTER contents untouched.
    Returns 'success' if successful.
    Depends on: json, os, datetime, getTypeAndId
    """
    # Date stamp that marks where this batch starts inside each MASTER list
    date_str = f"<Added: {datetime.now().strftime('%m-%d-%y %H:%M:%S')}>"

    files = ["MASTER_noDupURLs.json", "MASTER_noDupWorks.json", "MASTER_others.json"]

    # Make any missing MASTER file so the reads below always work
    for file in files:
        if not os.path.isfile(file):
            with open(file, "w") as outfile:
                json.dump([], outfile)
            print(f"Made {file}")

    # Read in original files
    with open(files[0], "r") as infile:
        noDupURLs = json.load(infile)
    with open(files[1], "r") as infile:
        noDupWorks = json.load(infile)
    with open(files[2], "r") as infile:
        others = json.load(infile)

    # Already-known entries. noDupWorks is formatted [[typeId, url], ...] but also holds date stamp
    # strings, so only the list entries contribute a typeId (tuples, because lists can't go in a set).
    seenURLs = set(noDupURLs)
    seenOthers = set(others)
    seenTypeIds = {tuple(entry[0]) for entry in noDupWorks if isinstance(entry, list) and entry}

    newNoDupURLs = []
    newNoDupWorks = []
    newOthers = []
    totalLen = 0

    # Read in new URLs
    with open(urlFile, "r") as infile:
        for line in infile:
            line = line.strip()  # to get rid of \n
            if not line:
                continue  # blank line, not a url
            totalLen += 1

            # 1. others filter (not an AO3 url)
            if "archiveofourown.org" not in line:
                if line not in seenOthers:
                    seenOthers.add(line)
                    newOthers.append(line)
                continue

            # 2. noDupUrls filter
            if line not in seenURLs:
                seenURLs.add(line)
                newNoDupURLs.append(line)

            # 3. noDupWorks filter
            typeId = tuple(getTypeAndId(line))
            if typeId not in seenTypeIds:
                seenTypeIds.add(typeId)
                newNoDupWorks.append([list(typeId), line])

    # Format & write newly added-to files
    fileTypes = [[noDupURLs, newNoDupURLs, files[0]],
                 [noDupWorks, newNoDupWorks, files[1]],
                 [others, newOthers, files[2]]]

    for original, new, file in fileTypes:
        original.append(date_str)  # add date stamp
        original.extend(new)  # add new URLs
        with open(file, "w") as outfile:
            json.dump(original, outfile)

    # print addition report
    print(f"There were {totalLen} url(s) in '{urlFile}'")
    print(f"Added {len(newNoDupURLs)} url(s) to MASTER_noDupURLs.json")
    print(f"Added {len(newNoDupWorks)} url(s) to MASTER_noDupWorks.json")
    print(f"Added {len(newOthers)} url(s) to MASTER_others.json")

    return "success"

    
# add_to_masterfiles("txtOutput_12-25-22.txt")

In [20]:
def dir_to_masterfiles(dirPath):
    """
    Takes a directory (full of URL TXT files), combines them into one TXT file with combineToTxt,
    then feeds that file to add_to_masterfiles. Prints the title-cased result followed by '!'.
    Returns nothing.
    """
    outcome = add_to_masterfiles(combineToTxt(dirPath))
    print(outcome.title() + "!")

# dir_to_masterfiles("urlsOutput")

In [21]:
def readinglist_to_masterfiles():
    """
    Dumps the Safari reading list to a TXT file (getReadingList), then feeds that file to
    add_to_masterfiles. Prints the title-cased result followed by '!'.
    Returns nothing.
    """
    outcome = add_to_masterfiles(getReadingList())
    print(outcome.title() + "!")

# readinglist_to_masterfiles()

<a id="sec2"></a>
## Display & Interface functions

In [22]:
def getCorrectInput(allowedList):
    """Reads user input (no prompt text) over & over until the answer is within allowedList,
    then returns that answer."""
    while True:
        ans = input()
        # first acceptable answer ends the loop
        if ans in allowedList:
            return ans

<a id="sec3"></a>
## Search functions

<a id="sec4"></a>
## Meta-functions: Testing & Report

In [23]:
import json

def reportMasters():
    """Prints, for each of the 3 MASTER json files, how many entries its list holds.
    The number is len() of the loaded JSON, so every stored entry counts.
    Depends on: json, globals masterNoDupUrls / masterNoDupWorks / masterOthers"""
    masters = (masterNoDupUrls, masterNoDupWorks, masterOthers)
    for path in masters:
        with open(path) as handle:
            entries = json.load(handle)
        print(f"{path} has {len(entries)} url(s)")
            
# reportMasters()

<a id="sec5"></a>
## Fic functions

<a id="sec6"></a>
## Misc. functions

In [24]:
from os import listdir

def get_all_files(dirName): 
    """Takes a string - name of directory. Returns a list with the name of every entry that
    listdir reports for that directory, leaving out '.DS_Store'."""
    return [entry for entry in listdir(dirName) if entry != ".DS_Store"]

# get_all_files("urlsOutput")

In [25]:
def getReadingList():
    """extracturls.py ~ This script gets a list of all the URLs in Safari Reading List, and
    writes them all to a file. Requires Python 3. ~ from someone on StackOverflow

    Reads $HOME/Library/Safari/Bookmarks.plist and writes one URL per line to
    'readinglist_<mm-dd-yy>.txt' in the current working directory.
    Returns the name of the file written.
    Raises LookupError if the plist has no 'com.apple.ReadingList' node.
    Depends on: datetime (plus os & plistlib, imported locally)"""
    import os
    import plistlib

    # Get current date (goes into the output file name)
    current_date = datetime.now().strftime("%m-%d-%y")

    # set file paths
    INPUT_FILE  = os.path.join(os.environ['HOME'], 'Library/Safari/Bookmarks.plist')
    OUTPUT_FILE = f"readinglist_{current_date}.txt"

    # Load and parse the Bookmarks file
    with open(INPUT_FILE, 'rb') as plist_file:
        plist = plistlib.load(plist_file)

    # Look for the child node which contains the Reading List data.
    # There should only be one Reading List item
    reading_list = None
    for child in plist.get('Children', []):
        if child.get('Title', None) == 'com.apple.ReadingList':
            reading_list = child

    # Fail with a clear message instead of an UnboundLocalError further down
    if reading_list is None:
        raise LookupError(f"No 'com.apple.ReadingList' entry found in {INPUT_FILE}")

    # Extract the bookmarks; a node without 'Children' is treated as an empty Reading List
    # (presumably what Safari stores when the list is empty - TODO confirm)
    bookmarks = reading_list.get('Children', [])

    # For each bookmark in the bookmark list, grab the URL
    urls = (bookmark['URLString'] for bookmark in bookmarks)

    # Write the URLs to a file
    with open(OUTPUT_FILE, 'w') as outfile:
        outfile.write('\n'.join(urls))

    print(f"Wrote to {OUTPUT_FILE}")
    return OUTPUT_FILE

In [26]:
def getOnlyAO3(fileName):
    """DEPRECATED - Takes a TXT file of URLs & returns 2 lists: [[AO3 links], [non-AO3 links]].
    A line is an AO3 link when it contains 'archiveofourown.org'. Lines are returned exactly as
    read (newline included), in file order."""
    # Read in file of URLs
    with open(fileName, "r") as infile:
        lines = infile.readlines()

    # Split into archive & non-archive lists
    marker = "archiveofourown.org"
    archive = [entry for entry in lines if marker in entry]
    notArchive = [entry for entry in lines if marker not in entry]

    return [archive, notArchive]
    
    
#     # Write archive URLs to file
#     fileNice = fileName.split("/")[-1] \
#                         .replace(' ','-')
#     archiveFile = f"archive_{fileNice}"
#     with open(archiveFile, "w") as outfile:
#         outfile.writelines(archive)
#         print(f"Wrote {len(archive)} AO3 link(s) to {archiveFile}")
    
#     # Write non-archive URLs to file
#     notArchiveFile = f"notArchive_{fileNice}"
#     with open(notArchiveFile, "w") as outfile:
#         outfile.writelines(notArchive)
#         print(f"Wrote {len(notArchive)} non-AO3 links to {notArchiveFile}")

<a id="sec7"></a>
## Single-use functions

In [27]:
def makeCol():
    """Single use - Makes Collection Works URLs from 'idNum,collectionName' lines.
    Reads 'urls/numColWorks.txt', prints each URL built from it and writes them, one per line,
    to 'urlsOutput/from_numcolworks.txt'. Blank lines are skipped.
    Returns nothing."""
    # Reads in numColWorks TXT file
    with open("urls/numColWorks.txt", "r") as inFile:
        lines = inFile.readlines()

    # Creates ColWorks URLs from the id & name
    res = []
    for line in lines:
        line = line.strip()  # strip (not line[:-1]) so a last line without "\n" keeps its final char
        if not line:
            continue
        data = line.split(",")
        idNum = data[0].strip()
        col = data[1].strip()
        url = f"https://archiveofourown.org/collections/{col}/works/{idNum}"
        print(url)
        res.append(url + "\n")  # collect the URL so the output file is not left empty

    # Writes ColWorks URLs
    fileName = f"urlsOutput/from_{'numColWorks'.lower().replace(' ','-')}.txt"
    with open(fileName, "w") as outFile:
        print(f"Writing to {fileName}")
        outFile.writelines(res)

In [28]:
def addToStart(infile, strToAdd):
    """Single Use (kinda) - Takes the name of a TXT file inside 'urls/urlFiles/' & a string.
    Puts the string in front of every line of that file and writes the result to
    'urlsOutput/from_<file name, lower-cased, spaces turned into dashes>'. The given file itself
    is left as it was. Returns nothing."""
    # Read the given file, prefixing each line on the way
    with open(f"urls/urlFiles/{infile}", "r") as inFile:
        prefixed = [strToAdd + row for row in inFile]

    # Write the prefixed lines to the new 'from_' file
    with open(f"urlsOutput/from_{infile.lower().replace(' ','-')}", "w") as outFile:
        outFile.writelines(prefixed)

In [29]:
# Last cell: presumably a marker that the notebook ran to the end without an error - TODO confirm
print("Success!")

Success!
