# Set Up to use Tumblr API

In [None]:
!pip install pytumblr pyyaml rauth

In [None]:
!pip install nltk
!pip install pandas
!pip install seaborn
!pip install beautifulsoup4

In [None]:
import pytumblr
import os
import yaml
import webbrowser
from rauth import OAuth1Service
from pathlib import Path
from urllib.parse import urlparse, parse_qs

In [None]:
import random
import re
import os
import time
import ast

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict
from datetime import datetime
from collections import Counter
from bs4 import BeautifulSoup

In [None]:
# Set your Tumblr App credentials
consumer_key = "<consumer_key>"
consumer_secret = "<consumer_secret>"

# Token storage path
tumblr_token_path = Path.home() / ".tumblr"

# Function to load tokens from file
def load_tokens():
    """Read the saved token mapping from ``tumblr_token_path``.

    Returns:
        Whatever ``yaml.safe_load`` parses from the file, or ``None`` when the
        token file does not exist.
    """
    # Guard clause: nothing stored yet.
    if not tumblr_token_path.exists():
        return None
    with tumblr_token_path.open("r") as token_file:
        return yaml.safe_load(token_file)

# Function to save tokens to file
def save_tokens(tokens):
    """Write ``tokens`` to ``tumblr_token_path`` as YAML and report the location.

    Args:
        tokens: The object to serialise with ``yaml.dump``.
    """
    with tumblr_token_path.open("w") as token_file:
        yaml.dump(tokens, token_file)
    print(f"Tokens saved to {tumblr_token_path}")

In [None]:
def _extract_verifier(pasted):
    """Pull the OAuth verifier out of whatever the user pasted.

    Accepts the bare verifier, the full redirect URL, or just its query string
    (``oauth_token=...&oauth_verifier=...``). Input that carries no
    ``oauth_verifier`` parameter is returned unchanged apart from surrounding
    whitespace, so pasting the bare verifier keeps working.
    """
    pasted = pasted.strip()
    parsed = urlparse(pasted)
    # A full URL carries the parameter in .query; a query string pasted without
    # scheme/host ends up in .path. Any URL fragment is ignored either way.
    for candidate in (parsed.query, parsed.path):
        values = parse_qs(candidate).get('oauth_verifier')
        if values:
            return values[0]
    return pasted


def do_oauth():
    """Run the interactive OAuth 1.0a flow against Tumblr and return the credentials.

    Uses the module-level ``consumer_key`` / ``consumer_secret``. The function
    prints the authorization URL, tries to open it in a browser, and then asks
    for the verifier. The user may paste either the verifier itself or the whole
    URL they were redirected to after authorizing.

    Returns:
        dict with the keys ``consumer_key``, ``consumer_secret``,
        ``oauth_token`` and ``oauth_token_secret``.
    """
    tumblr = OAuth1Service(
        name='tumblr',
        consumer_key=consumer_key,
        consumer_secret=consumer_secret,
        request_token_url='https://www.tumblr.com/oauth/request_token',
        access_token_url='https://www.tumblr.com/oauth/access_token',
        authorize_url='https://www.tumblr.com/oauth/authorize',
        base_url='https://api.tumblr.com/v2/'
    )

    # Step 1: get a request token
    request_token, request_token_secret = tumblr.get_request_token(
        params={'oauth_callback': 'http://www.example.com'}
    )
    authorize_url = tumblr.get_authorize_url(request_token)

    # Step 2: send the user to Tumblr to authorize the app
    print("Go to this URL and authorize the app:")
    print(authorize_url)
    webbrowser.open(authorize_url)

    # Step 3: read the verifier back (bare value or full redirect URL)
    verifier = _extract_verifier(
        input("Paste the verifier (or the full URL you were redirected to): ")
    )

    # Step 4: exchange the request token + verifier for an access token
    session = tumblr.get_auth_session(request_token, request_token_secret,
                                      method='POST', data={'oauth_verifier': verifier})

    # The tokens are printed so they can be copied into the credential cell.
    # They are secrets: clear this cell's output before sharing the notebook.
    print("OAuth Token:", session.access_token)
    print("OAuth Token Secret:", session.access_token_secret)

    return {
        'consumer_key': consumer_key,
        'consumer_secret': consumer_secret,
        'oauth_token': session.access_token,
        'oauth_token_secret': session.access_token_secret
    }

__After running the cell below and authorizing the app, your browser is redirected to the callback URL; the verifier is the value that follows `oauth_verifier=` in that URL.__

In [None]:
tokens = do_oauth()

# Corpus Collection

In [None]:
#Fill in these to make it easier to use again
consumer_key = '<consumer_key>'
consumer_secret = '<consumer_secret>'
oauth_token = '<oauth_token>'
oauth_token_secret = '<oauth_secret>'

In [None]:
# Build the client from the credential variables set in the previous cell, so
# the keys and tokens only have to be filled in once.
client = pytumblr.TumblrRestClient(
    consumer_key,
    consumer_secret,
    oauth_token,
    oauth_token_secret,
)

blog_name = '<replace with link to tumblr page>'

In [None]:
client.info()

In [None]:
import requests
import json
import pandas as pd

# Compile and Preprocess Archive

In [None]:
#Use this function to collect posts by month
#month param should be an int 1-12
def collect_monthly_posts(blog_name, consumer_key, year, month, limit=20, timeout=30):
    """Page through a blog's posts and keep those dated within one calendar month.

    Args:
        blog_name: Blog identifier placed in the API URL path.
        consumer_key: Tumblr API key, sent as ``api_key``.
        year: Four-digit year of the month to collect.
        month: Month number, 1-12.
        limit: Page size requested from the API.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        pandas.DataFrame with one row per collected post (empty when none match).

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    # Half-open window [window_start, window_end). Post dates are parsed as
    # naive datetimes from the API's date string, so both sides of the
    # comparison use the same clock and no local-timezone conversion is needed.
    window_start = datetime(year, month, 1)
    if month == 12:
        window_end = datetime(year + 1, 1, 1)
    else:
        window_end = datetime(year, month + 1, 1)

    def collect_posts(offset):
        # Let requests build and URL-encode the query string.
        response = requests.get(
            f"https://api.tumblr.com/v2/blog/{blog_name}/posts",
            params={'api_key': consumer_key, 'limit': limit, 'offset': offset},
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()

    posts_collected = []
    offset = 0
    while True:
        posts = collect_posts(offset)['response'].get('posts', [])
        if not posts:
            break

        for post in posts:
            # Drop the trailing 4-character zone suffix (e.g. " GMT") before parsing.
            post_date = datetime.strptime(post['date'][:-4], "%Y-%m-%d %H:%M:%S")
            if window_start <= post_date < window_end:
                posts_collected.append(post)

        # Advance by what was actually returned: the API may return fewer posts
        # than `limit` (it caps the page size), and stepping by `limit` would
        # then silently skip posts.
        offset += len(posts)

    return pd.DataFrame(posts_collected)

In [None]:
#save dataframes into variables
month_df = collect_monthly_posts(blog_name, consumer_key, 2025, month, limit=20)
#eg: jan_df = collect_monthly_posts(blog_name, consumer_key, 2025, 1, limit=20)

In [None]:
#list columns in df
list(month_df)

In [None]:
columns_to_keep = ['date','id', 'post_url','tags', 'body'] #adapt for your purposes
month_posts = month_df[columns_to_keep]
month_posts = month_posts.sort_values(by='date', ascending=True)

In [None]:
#combine all dfs needed
posts_df = [month1_posts, month2_posts, month3_posts]
df = pd.concat(posts_df)

In [None]:
df

## Preprocess

In [None]:
#use the following if/when needed or combine to create a multi-step preprocess function
def html_to_text(x):
    """Strip markup from ``x`` when it looks like HTML; otherwise return it untouched.

    "Looks like HTML" is a crude check: ``x`` is a string containing both ``<``
    and ``>``. Non-string values are passed through unchanged.
    """
    looks_like_html = isinstance(x, str) and '<' in x and '>' in x
    if not looks_like_html:
        return x  # not HTML: hand back as-is
    return BeautifulSoup(x, 'html.parser').get_text()

def remove_punc(text):
    """Return ``text`` with every character listed in ``punctuation`` removed.

    Characters not in the list (for example ``-`` and whitespace) are kept.
    """
    punctuation = '!@#$%^&*()_+={}[]:;"\'|<>,.?/~`'
    kept = [character for character in text if character not in punctuation]
    return ''.join(kept)

#removes user links and names; apply this before stripping the HTML
def remove_users(text):
    """Delete ``<a ...>username</a>:`` attributions from ``text``.

    Non-string input is treated as an empty string. Removal is repeated until
    no attribution is left, because deleting one match can expose another
    (e.g. nested anchors). The result is stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        text = ''  # or str(text) if you want to preserve the original content

    user_link = re.compile(r'<a[^>]*>[^<]*</a>:')

    # Keep substituting for as long as the previous pass removed something.
    cleaned, removed = user_link.subn('', text)
    while removed:
        cleaned, removed = user_link.subn('', cleaned)

    return cleaned.strip()

def del_urls(text):
    """Remove every whitespace-delimited run that starts with ``http`` or ``www``."""
    url_pattern = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
    return url_pattern.sub('', text)

def del_htmltags(text):
    """Remove ``<...>`` tags from ``text`` (shortest match, within a single line)."""
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)
    
def rem_allhtml(text):
    """Return the text content of an HTML string; missing values become ``''``.

    ``pd.isna`` decides what counts as missing (``None``, ``NaN``, ...).
    """
    return '' if pd.isna(text) else BeautifulSoup(text, 'html.parser').get_text()

def del_emoj(text):
    """Drop every non-ASCII character (emoji included) from a string.

    Non-string values are returned unchanged.
    """
    if isinstance(text, str):
        # Anything outside the 7-bit ASCII range is removed.
        return re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [None]:
#apply preprocess
df['body'] = df['body'].apply(remove_users)
df['body'] = df['body'].apply(html_to_text)
df['body'] = df['body'].str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)

In [None]:
#remove posts to be excluded from dataset for analysis
#remove posts tagged "not thg" 
df = df[~df['tags'].apply(lambda tags: 'not thg' in tags)]

In [None]:
#save archive as CSV
df.to_csv('blog_archive.csv', index=False)

# Tag Frequency Analysis

In [None]:
# combine/use alias maps to combine tags that mean the same - adapt for usage
alias_map = {
    'katniss': 'katniss everdeen',
    'katniss everdeen': 'katniss everdeen',
    'thg katniss': 'katniss everdeen',
    'peeta mellark': 'peeta mellark',
    'peeta': 'peeta mellark',
    'thg peeta': 'peeta mellark',
    'katniss x peeta': 'everlark',
    'peeta x katniss': 'everlark',
    'Everlark': 'everlark',
    'haymitch': 'haymitch abernathy',
    'haymitch abernathy': 'haymitch abernathy',
    'finnick': 'finnick odair',
    'finnick odair': 'finnick odair',
    'thg finnick': 'finnick odair',
    'gale hawthorne': 'gale hawthorne',
    'gale': 'gale hawthorne',
    'hunger games': 'the hunger games',
    'the hunger games': 'the hunger games'
}

In [None]:
def df_tag_count(df, alias_map=None):
    """Count how often each tag occurs across all posts in ``df``.

    Tags are lower-cased before counting. When ``alias_map`` is given, each
    lower-cased tag is replaced by its alias (keys are matched
    case-insensitively), so different spellings of the same tag are counted
    together.

    Side effect: string-encoded tag lists in ``df['tags']`` are converted to
    real lists and written back to the frame. This is kept on purpose, because
    ``extract_character`` only matches rows whose tags are lists.

    Args:
        df: DataFrame with a ``tags`` column holding lists of tags or their
            string representation.
        alias_map: Optional mapping from tag variant to canonical tag.

    Returns:
        collections.Counter mapping tag -> number of occurrences.
    """
    # Convert string representation of lists into actual Python lists
    df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Lower-case the alias keys so they line up with the lower-cased tags.
    aliases = {str(key).lower(): value for key, value in (alias_map or {}).items()}

    tag_counts = Counter()
    for tags in df['tags']:
        if not isinstance(tags, (list, tuple, set)):
            continue  # post without tags (NaN / None): nothing to count
        for tag in tags:
            normalized = str(tag).lower()
            tag_counts[aliases.get(normalized, normalized)] += 1
    return tag_counts

In [None]:
#if you want to compile individual dfs for each theme/character/tag eg: 'talking about characters'
def extract_character(df, tag):
    """Return the rows of ``df`` whose ``tags`` list contains ``tag``.

    Rows whose ``tags`` value is not a list never match. The match is exact
    (case-sensitive). The result is re-indexed from 0.
    """
    def _has_tag(tags):
        return isinstance(tags, list) and tag in tags

    return df[df['tags'].apply(_has_tag)].reset_index(drop=True)

In [None]:
character_df = extract_character(df_characters, 'character')
#eg: katniss_df = extract_character(df_characters, 'katniss everdeen')

In [None]:
df_characters.to_csv('phase1_characters.csv', index=False)

# Riveter

## Set up

In [None]:
#for riveter
!pip install -U spacy-experimental
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl
#egg=en_coreference_web_trf
!spacy download en_core_web_sm

In [None]:
pwd #to confirm directory

In [None]:
!pip install -e .

In [None]:
#just in case it was missed earlier
from collections import defaultdict
from collections import Counter
from datetime import datetime
import pandas as pd
import random
import numpy as np
import requests
import re
import ast

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# SPACY & COREF IMPORTS
# Builds one pipeline, `nlp`, that combines the small English model with the
# coreference components from the experimental coreference model. Both models
# are installed by the pip / `spacy download` commands in the "Set up" cell.
import spacy
import spacy_experimental
nlp = spacy.load("en_core_web_sm")
nlp_coref = spacy.load("en_coreference_web_trf")

# Replace the listeners that tie "coref" and "span_resolver" to the shared
# "transformer" component of nlp_coref. This must run before the components are
# sourced into `nlp` below.
# NOTE(review): presumably this gives each component its own copy of the
# transformer so it still works once moved into another pipeline - confirm
# against the spaCy `replace_listeners` documentation.
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

# Append the two coreference components, taken from nlp_coref, to `nlp`.
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

In [None]:
from riveter import Riveter

In [None]:
#for preprocessing text
from bs4 import BeautifulSoup
import html

In [None]:
phase_1_corpus = character_df

## Preprocess

In [None]:
keep_columns = ['post_id', 'text']

In [None]:
#convert dicts to a list of values from the dictionary
def dict_val_to_list(my_dict):
    """Return the values of ``my_dict`` as a list, in the dict's own order.

    For anything that is not a dict, an error message is printed and an empty
    list is returned.
    """
    if isinstance(my_dict, dict):
        return [*my_dict.values()]

    print("Error: Input is not a dictionary.")
    return []  # empty list signals the invalid input

In [None]:
def parse_html_list(html_list):
    """Convert each item of ``html_list`` to plain text.

    Non-string items are converted with ``str()`` first. For every item the
    markup is parsed, the text pieces are stripped and joined with a single
    space, and any remaining non-breaking spaces are removed.

    Returns:
        list of cleaned strings, one per input item, in the same order.
    """
    def _to_plain_text(item):
        markup = item if isinstance(item, str) else str(item)
        plain = BeautifulSoup(markup, 'html.parser').get_text(separator=' ', strip=True)
        return re.sub(r'\xa0', '', plain)

    return [_to_plain_text(item) for item in html_list]

In [None]:
p1 = phase_1_corpus[keep_columns]
p1_id_dict = p1['post_id'].to_dict()
p1_ids = dict_val_to_list(p1_id_dict)
p1_body_dict = p1['text'].to_dict()
p1_text = dict_val_to_list(p1_body_dict)

In [None]:
p1_texts = parse_html_list(p1_text)

In [None]:
#Use this dict like an alias map - use regex so model is trained to recognize different ways of naming a character
persona_patterns_dict={'katniss': r'^katniss$|^katniss everdeen$|^girl on fire$',
                        'peeta': r'^peeta$|^peeta mellark$|^boy with the bread$',
                        'haymitch': r'^haymitch$|^haymitch abernathy$',
                        'president snow': r'^coryo$|^snow$|^president snow$|^coriolanus snow$',
                        'president coin': r'^coin$|^alma coin$|^president coin$|^d13 president$',
                        'primrose': r'^prim$|^primrose$|^primrose everdeen$',
                        'gale': r'^gale$|^gale hawthorne$', 
                        'finnick': r'^finnick$|^finnick odair$',
                        'plutarch': r'^plutarch$|^plutarch heavensbee$',
                        'annie': r'^annie$|^annie cresta$',
                        'johanna': r'^johanna$|^johanna mason$',
                        'rue': r'^rue$',
                        'effie': r'^effie$|^effie trinket$',
                        'capitol': r'^the capitol$|^capitol$|^the rich$|^the wealthy$|^government$',
                       }

## Load and Train Models

Choose between riveter.load_sap_lexicon('agency') and riveter.load_sap_lexicon('power')

In [None]:
riveter = Riveter()
riveter.load_sap_lexicon('power')
# train() takes (texts, ids); for your own data the ids would be df['id'].
# The persona patterns come from persona_patterns_dict, defined in the
# preprocessing section above, so they are maintained in one place only.
riveter.train(p1_texts, p1_ids, persona_patterns_dict=persona_patterns_dict)

Check the Riveter documentation, or `myriv-test`, for the options available to explore and explain the resulting scores.

To use the Rashkin connotation frames instead, load the lexicon with `riveter.load_rashkin_lexicon('<dimension>')`, replacing `<dimension>` with one of the dimensions listed below.

#### Connotation Frames available via Rashkin: 
- __effect__: whether the event denoted by a predicate is good or bad for the entity
- __state__: the likely mental state of an entity as the result of an event
- __value__: whether an entity is presupposed to be valuable
- __writer_perspective__/__reader_perspective__: the directed sentiment from the writer to an entity or the _predicted_ directed sentiment from reader to an entity
- __agent_theme_perspective__/__theme_agent_perspective__: the directed sentiment between the agent and theme (usually reciprocal and not likely to totally contradict each other). 

Connotation frame polarity can be positive, negative, or neutral. 


In [None]:
# Pick one of the Rashkin connotation-frame dimensions listed above
# (effect, state, value, writer_perspective, reader_perspective,
# agent_theme_perspective, theme_agent_perspective).
rashkin_dimension = 'effect'

riveter = Riveter()
# The connotation-frame dimensions live in the Rashkin lexicon; the Sap lexicon
# is the one used for 'power' / 'agency'.
riveter.load_rashkin_lexicon(rashkin_dimension)
# NOTE(review): p2_texts / p2_ids are not defined in the cells shown here -
# build them for the second corpus the same way as p1_texts / p1_ids first.
# The persona patterns come from persona_patterns_dict, defined in the
# preprocessing section above, so they are maintained in one place only.
riveter.train(p2_texts, p2_ids, persona_patterns_dict=persona_patterns_dict)