# Get placenames from newspaper titles

This notebook looks for place names in the titles of digitised newspapers from Trove, matches the names to the VicNames dataset, and adds the place coordinates.

This is a modified version of the code I used to create the original Trove Places app in 2014.

In [52]:
import pandas as pd
import re
import string
from nltk import bigrams, word_tokenize, ngrams
from nltk.corpus import stopwords
import csv
from datetime import datetime

## Load place data

In [3]:
# Read the places data and add an upper-cased copy of each name for case-insensitive matching
df_places = pd.read_csv("places.csv").assign(
    placename_upper=lambda places: places["Place Name"].str.upper()
)

In [4]:
# Rank feature types so that, when several records share a name, the preferred type sorts first
code = {
    feature_type: priority
    for priority, feature_type in enumerate(["LOCB", "LGA", "CNTY", "PRSH", "NBHD"], start=1)
}
df_places["feature_order"] = df_places["Feature Type Code"].apply(
    lambda feature_code: code[feature_code]
)

In [5]:
# Order by name, then by feature priority, so the preferred feature type comes first within each name
df_places = df_places.sort_values(by=["placename_upper", "feature_order"])

In [6]:
# Keep only the first record for each upper-cased name. Reassign instead of using
# inplace=True so the frame is not silently mutated between cells.
df_places = df_places.drop_duplicates(subset="placename_upper", keep="first")

In [7]:
# Display the places table (one row per upper-cased place name at this point)
df_places

Unnamed: 0,State,Municipality,Name Id,Place Name,Place Name Status,Feature Type Code,Feature Type,Longitude,Latitude,Place Id,placename_upper,feature_order
0,VIC,MANSFIELD SHIRE,25,A1 MINE SETTLEMENT,REGISTERED,NBHD,NEIGHBOURHOOD,146.201260,-37.499848,9127,A1 MINE SETTLEMENT,5
1,VIC,ALPINE SHIRE,100117,ABBEYARD,REGISTERED,LOCB,LOCALITY,146.752408,-37.025339,100103,ABBEYARD,1
2,VIC,YARRA CITY,100118,ABBOTSFORD,REGISTERED,LOCB,LOCALITY,144.998711,-37.802505,100104,ABBOTSFORD,1
6,VIC,MOONEE VALLEY CITY,100119,ABERFELDIE,REGISTERED,LOCB,LOCALITY,144.897934,-37.759856,100105,ABERFELDIE,1
7,VIC,BAW BAW SHIRE,100120,ABERFELDY,REGISTERED,LOCB,LOCALITY,146.378349,-37.702066,100106,ABERFELDY,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9536,VIC,SOUTHERN GRAMPIANS SHIRE,30779,YUPPECKIAR,REGISTERED,PRSH,PARISH OR HUNDRED,142.507923,-37.655959,9110,YUPPECKIAR,4
9538,VIC,HUME CITY,103506,YUROKE,REGISTERED,LOCB,LOCALITY,144.863798,-37.581629,103492,YUROKE,1
9539,VIC,COLAC OTWAY SHIRE,103507,YUULONG,REGISTERED,LOCB,LOCALITY,143.307785,-38.722737,103493,YUULONG,1
9551,VIC,GREATER SHEPPARTON CITY,103508,ZEERUST,REGISTERED,LOCB,LOCALITY,145.401341,-36.273084,103494,ZEERUST,1


## Load Trove data

Load the most recent harvest of newspaper titles.

In [8]:
# Fetch the per-newspaper article totals CSV straight from the trove-newspaper-totals repository
dft = pd.read_csv(
    "https://raw.githubusercontent.com/wragge/trove-newspaper-totals/refs/heads/master/data/total_articles_by_newspaper.csv"
)

In [9]:
# Restrict the harvest to titles whose state is recorded as Victoria
dfv = dft.loc[dft["state"].eq("Victoria")]

In [10]:
# Display the Victorian newspaper titles
dfv

Unnamed: 0,title_id,total,title,state,issn,start_date,end_date
27,1023,855,The Melbourne Weekly Courier (Vic. : 1844 - 1845),Victoria,14403684,1844-01-06,1845-03-28
28,1024,2071,The Melbourne Courier (Vic. : 1845 - 1846),Victoria,14403692,1845-06-16,1846-03-11
29,1025,1565,Melbourne Times (Vic. : 1842 - 1843),Victoria,1440219X,1842-04-09,1843-12-08
34,103,2143,The Australian News for Home Readers (Vic. : 1...,Victoria,18373542,1864-01-25,1867-06-28
49,1043,14,"Seamen's Strike Bulletin (Melbourne, Vic. : 1919)",Victoria,2205085X,,
...,...,...,...,...,...,...,...
1787,958,2204,The Melbourne Leader (Vic. : 1861),Victoria,22044949,1861-01-12,1861-12-28
1788,959,19039,Bell's Life in Victoria and Sporting Chronicle...,Victoria,22044868,1857-01-03,1868-01-04
1790,960,36609,The Snowy River Mail and Tambo and Croajingolo...,Victoria,22044906,1890-08-09,1911-08-31
1791,961,20225,"The Tocsin (Melbourne, Vic. : 1897 - 1906)",Victoria,22044944,1897-10-02,1906-10-25


Load the dataset from Trove Places and find which titles have been added since I last updated it.

In [26]:
# Load the existing Trove Places dataset of newspaper titles and their locations
df_trove = pd.read_csv("trove-newspaper-titles-locations.csv")

In [27]:
# Keep only the Victorian titles (this dataset uses the abbreviation "VIC")
df_trove = df_trove.loc[df_trove["state"].eq("VIC")]

In [29]:
# Title ids that already have locations in the Trove Places dataset
urls = list(df_trove["title_id"].unique())
# Anything in the fresh harvest that isn't in that list still needs a location
already_located = dfv["title_id"].isin(urls)
df_trove_extra = dfv.loc[~already_located]

In [31]:
# (rows, columns) of the titles that are not yet in the Trove Places dataset
df_trove_extra.shape

(50, 7)

## Find and match place names

To find placenames we tokenise the titles and look up individual words, bigrams, and trigrams in the places dataset.

In [55]:
def find_place(placename):
    '''
    Look up a candidate name in the places dataset.
    Full stops are removed and the comparison is made against the upper-cased place names.
    Returns the first matching row as a dict, or None if nothing matches.
    '''
    candidate = placename.replace(".", "").upper()
    is_match = df_places["placename_upper"] == candidate
    if not is_match.any():
        return None
    return df_places.loc[is_match].iloc[0].to_dict()

def _title_tokens(title, stop):
    '''
    Turn a newspaper title into a list of lower-cased candidate words, minus anything in `stop`.
    '''
    # Get things that could be placenames from the start of the brackets at the end of titles,
    # eg: "(Melbourne, Vic. : 1919)" yields "Melbourne"
    bracketed = re.search(r'\(([A-Za-z \/\.]+),', title)
    # Remove the stuff in brackets from the title
    cleaned = re.sub(r'\(.*\)', '', title).strip()
    # Add the de-bracketed name back to the title
    if bracketed:
        cleaned = '{} {}'.format(cleaned, bracketed.group(1))
    # Treat hyphens and slashes as word breaks before tokenising
    normalised = cleaned.replace('-', ' ').replace('/', ' ').lower()
    return [word for word in word_tokenize(normalised) if word not in stop]


def locate_titles():
    '''
    Tokenize titles and look up each trigram, bigram, and unigram in the places db.
    Write the results to a CSV file for manual checking/editing.

    Reads extra stop words from 'title_stop_words.txt' and the titles from `df_trove_extra`.
    Writes one row per title/place match to 'titles-2025.csv'; titles with no match get a
    single row with empty place columns. Prints the number of titles with no match.
    '''
    not_found = 0
    # Add any words you don't want to be treated as potential places
    with open('title_stop_words.txt', 'r') as title_stop_file:
        title_stopwords = [title_word.lower().strip() for title_word in title_stop_file]
    # The stop list is the same for every title, so build it once (as a set for fast look-ups)
    stop = set(stopwords.words('english')) | set(string.punctuation) | set(title_stopwords)
    # We're going to write results to a CSV file for checking.
    # newline='' is what the csv module expects; without it rows can gain blank lines on Windows.
    with open('titles-2025.csv', 'w', newline='', encoding='utf-8') as titles_csv:
        writer = csv.writer(titles_csv)
        writer.writerow(["title_id", "newspaper_title", "place_id", "place", "latitude", "longitude"])
        for newspaper in df_trove_extra.itertuples():
            title_words = _title_tokens(newspaper.title, stop)
            # Ids of places already written for this title, so each place appears only once
            seen_place_ids = set()
            # Check the longest candidates first: trigrams, then bigrams, then single words
            for size in (3, 2, 1):
                for candidate in ngrams(title_words, size):
                    place = find_place(' '.join(candidate))
                    if place and place['Name Id'] not in seen_place_ids:
                        writer.writerow([newspaper.title_id, newspaper.title, place['Name Id'], place["Place Name"].title(), place['Latitude'], place['Longitude']])
                        seen_place_ids.add(place['Name Id'])
            # Write titles without places to the CSV file for manual checking.
            # Four empty fields keep the row aligned with the six-column header.
            if not seen_place_ids:
                writer.writerow([newspaper.title_id, newspaper.title, '', '', '', ''])
                not_found += 1
    print(not_found)

In [56]:
# Run the matcher: writes titles-2025.csv and prints the number of titles with no place match
locate_titles()

0
