In [None]:

#This cell installs all the modules needed for this code. You only need to run it once. When running the code again, start it as cell two below.


import sys
!{sys.executable} -m pip install wordcloud
!{sys.executable} -m pip install TextBlob
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en_core_web_sm
!{sys.executable} -m pip install vega








In [None]:

#This code allows you to automatically search historical newspapers via successive street addresses using Trove's API. 
#You just enter in the street you are searching for, the street numbers you wish to look between, the year span you wish to explore, and your Trove API key and the code does the rest.

#The code finds newspaper articles matching your specifications, automatically culls irrelevant results, and creates property-level and street-level CSV files containing article information and full text. The CSV files are saved to a newly created folder on your computer named after your search terms, e.g. Little_Lonsdale_street_1870_1890.

#In addition, the code graphs article frequency by date and street number, displays an on-screen summary of all the articles found on a street level. It produces street level Wordclouds of the most common words in the article text and in the heading text and displays the 20 most common Ngrams on a street level. All of this material is saved to your directory.
#The code also attempts to extract all the people mentioned in the articles and adds them to your CSV files. Please note that this works via the spaCy library's en_core_web_sm model, which was trained on modern English-language web content. It is not perfect with nineteenth-century material and will often overlook non-American sounding names. As always, machine-learning-based code reflects the biases of the material it was trained on.

#have fun,

#Pete 



#To successfully run the code, you need an individual Trove API key. You can acquire a key by signing up as a registered user.
#Sign up for Trove here: https://trove.nla.gov.au/
#Once you have signed up:
#    log in to Trove, select your username and select My Profile;
#    select the "For developers" tab;
#    fill in the form to apply for a Trove API key;
#    read the documentation and start using your key to access the API;
#    insert your API key into the code below as instructed.


# Insert your individual TROVE API key between the quotation marks below.
# If the quotation marks are left empty, the key is read from the TROVE_API_KEY
# environment variable instead, so the key does not have to be saved inside the
# notebook (and accidentally shared along with it).
import os

api_key = ''
if not api_key:
    api_key = os.environ.get('TROVE_API_KEY', '')


import requests
import json
import pandas as pd
from pathlib import Path
import re
import time
import os
from wordcloud import WordCloud
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.corpus import stopwords
import spacy
import vega
import altair as alt
# Load the small English spaCy pipeline; the search cell uses `nlp` to pick out
# PERSON entities from article text.
nlp = spacy.load("en_core_web_sm")

# Fetch the NLTK data sets, one download call per resource.
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)










# Enter your search terms below.
# Every value in this section is read by the search loop further down this cell.


# Street name to search for, given as a list containing the name.
# The name is combined with each street number to build the search phrase,
# e.g. "1 <street name>".

street = ['Elizabeth  Street']



# State to search within (sent to Trove as the 'l-state' parameter).
# Accepted options are: 'ACT', 'International', 'National', 'New South Wales',
# 'Northern Territory', 'Queensland', 'South Australia', 'Tasmania', 'Victoria'

state = ['Victoria']

# Street numbers to search between.
# NOTE: the loops use range(First_number, Last_number), so the search starts at
# First_number and stops at Last_number - 1; Last_number itself is not searched.


First_number = 1

Last_number = 10

# Years to search between. They are inserted into the query as
# date:[start_date TO end_date].

start_date = 1914
end_date = 1919

# Type of articles to search for (sent to Trove as the 'l-category' parameter).
# The accepted options are:
#   'Article'; 'Advertising'; 'Details lists, results, guides'; 'Family Notices'; 'Literature'
    
# Please note that the API can only search for one article type at a time, or for all article types.
# Leave the square brackets below empty if you want to search for all article types.

articlegenre = ['Article']

# rs = the Trove relevance score below which this code culls a result as irrelevant
# (articles whose score is lower than rs are dropped before anything is saved).
# If you are finding few results try lowering the rs score; if you are getting false
# positives try raising it.
# My experiments have shown me that 5 is a good default rs score.

rs = 5

# Search parameters sent as the query string of every Trove request.
# The search loop adds the 'q' (query text) entry for each street address.
# Generally there is no need to change anything here.

params = {
    'key': api_key,
    'zone': 'newspaper',
    'include': 'articleText',
    'n': 100,
    'encoding': 'json',
    'bulkHarvest':'false',
    'reclevel': 'brief',
    'sortby': 'relevance',
    'l-state':state,
    'l-category':articlegenre 
}





# Search Trove for every street number on every requested street, cull results
# whose relevance score is below `rs`, extract the people named in each article
# and save one CSV file per street address that produced results.
#
# Names left behind for the later cells: `street`, `street_slug`, `street_path`
# and `street_name_dates`. When several streets are listed they describe the
# last street searched, so the later cells summarise that street.

# Column order of every per-address CSV file.
_CSV_COLUMNS = ["search_term", 'article_ID', "relevance", "article_type", "article_heading",
                "newspaper", "date", "page", "troveUrl", "article_text", "people_in_text"]


def _people_in_text(article_text):
    """Return the people spaCy finds in `article_text`.

    The result is the string form of a list of (name, 'PERSON') tuples, which is
    what gets stored in the people_in_text column.
    """
    doc = nlp(article_text)
    return str([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == 'PERSON'])


# `street` is configured as a list of names, but accept a single name as well.
# The loop variable is deliberately still called `street` because the later
# cells read it as a plain string.
street_names = [street] if isinstance(street, str) else list(street)

# Kept from the original cell: a wide column limit stops pandas from shortening
# long article text whenever a frame or column is turned into a string.
pd.set_option('max_colwidth', 100000)

for street in street_names:

    # Slugify the street name to use in paths and filenames
    street_slug = street.replace(' ', '_')
    street_name_dates = (street_slug)+"_" +str(start_date)+"_" + str(end_date)

    # pathlib makes working with files and directories easier
    street_path = Path(street_name_dates)
    street_path.mkdir(exist_ok=True)

    addresses_saved = 0

    # NOTE(review): range() stops before Last_number, so Last_number itself is
    # not searched. The consolidation cell uses the same range; confirm whether
    # the end point is meant to be included before changing either.
    for num in range(First_number, Last_number):
        string_num = str(num)

        # Use text: and ~0 to make the search as exact as possible
        params['q'] = f'text:"{num} {street}"~0  date:[{start_date} TO {end_date}]'

        # Get the data from the API
        # NOTE(review): this targets the v2 endpoint; confirm it is still served.
        response = requests.get(
            'https://api.trove.nla.gov.au/v2/result', params=params)
        time.sleep(.2)

        # Report a rejected request (e.g. a missing or invalid API key) instead
        # of failing later with an unrelated JSON error.
        if not response.ok:
            print(f"Trove request for {string_num} {street_slug} failed with "
                  f"HTTP status {response.status_code}; skipping this address.")
            continue

        data = response.json()

        # No 'article' entry means the search returned nothing for this address.
        try:
            articles = data['response']['zone'][0]['records']['article']
        except (KeyError, IndexError):
            continue
        if not articles:
            continue

        df = pd.json_normalize(articles)

        # Cull results based on relevance score. The index is reset so that the
        # surviving rows are numbered 0..n-1 and the frame is an independent copy.
        df["relevance"] = df["relevance.score"].astype('float')
        df = df[df["relevance"] >= rs].reset_index(drop=True)

        # Nothing relevant for this address: do not write a file.
        if df.empty:
            continue

        # Convert dtypes
        df['id'] = df['id'].astype('int')
        df['date'] = pd.to_datetime(df['date'], yearfirst=False)

        # Strip HTML tags from the article text. Articles that came back without
        # text get an empty string rather than stopping the whole run.
        if 'articleText' in df.columns:
            df['article_text'] = (df['articleText'].fillna('').astype(str)
                                  .str.replace(r'<[^<>]*>', '', regex=True))
        else:
            df['article_text'] = ''

        # Label each row with the address that was searched for.
        df['search_term'] = string_num + " " + street_slug

        # Relabel columns to the names used in the CSV files.
        df = df.rename(columns={'id': 'article_ID',
                                'category': 'article_type',
                                'heading': 'article_heading',
                                'title.value': 'newspaper'})

        # Find the names of people in each article's own text (not in a printed
        # copy of the table, which would add the column header and row number).
        df['people_in_text'] = [_people_in_text(article_text)
                                for article_text in df['article_text']]

        # Keep only the CSV columns, in order; a column Trove did not return is
        # written out empty instead of raising an error.
        df = df.reindex(columns=_CSV_COLUMNS)

        # Save a file only when results were actually found
        df.to_csv(Path(street_path, f'{num}_{street_slug}.csv'), index=False)
        addresses_saved += 1
        print("Relevant articles were found for" + " " + (string_num) + (" ") + (street_slug))

    if addresses_saved:
        print("Relevant articles were found the street addresses above  and saved as individual csv files  in a" + " " + street_name_dates + " " + "folder in your active path.")
    else:
        print("No relevant articles were found for" + " " + street_name_dates + ". Check your API key and search terms, or try lowering the rs score.")



In [None]:
#opening created files, making consolidated street level file
#
# Reads every per-address CSV file the search cell saved for this street,
# combines them into `merged_df`, removes fully identical rows and saves the
# result as one street-level CSV file in the same folder.


list_of_dataframes = []

# Same number range as the search loop; an address that produced no results has
# no file and is simply skipped.
for num1 in range(First_number, Last_number):
    address_csv = Path(street_path, f'{num1}_{street_slug}.csv')
    if address_csv.exists():
        list_of_dataframes.append(pd.read_csv(address_csv))

# Stop with a clear message rather than failing on an undefined `merged_df`
# (or silently re-saving one left over from an earlier run).
if not list_of_dataframes:
    raise FileNotFoundError(
        "No per-address CSV files were found in the " + street_name_dates +
        " folder. Run the search cell first and check that it reported results.")

# Combine once, after all files are read, then drop rows that are exact duplicates.
merged_df = pd.concat(list_of_dataframes).drop_duplicates(subset=None, keep='first')

merged_df.to_csv(Path(street_path, f'{street_name_dates}.csv'), index=False)

print("A consolidated CSV file of all the results for" + " " + street_name_dates + " " + " has been created and saved in a" + " "+(street_name_dates) +" "+ "folder in active path.")

In [None]:
# Bar chart of the number of articles per date; useful for seeing when news was
# being produced about a street.
alt.renderers.enable('default')

date_chart_title = f'Article numbers by date for {street} from {start_date} to {end_date}'

print(date_chart_title)
print(" ")
print('Hover over the data point with mouse to get precise details, scroll sideways to see full dates.')
print('This graph can be saved using the three dot tool in the upper right hand corner.')

# Count the articles per date, then turn the counts into a two-column table
value_counts = merged_df['date'].value_counts()
df_value_counts = (
    value_counts
    .rename_axis('dates')
    .reset_index(name='number_of_articles')
)

date_tooltips = [alt.Tooltip('dates'), alt.Tooltip('number_of_articles')]

alt.Chart(df_value_counts, title=date_chart_title).mark_bar().encode(
    x=alt.X('dates'),
    y=alt.Y('number_of_articles'),
    tooltip=date_tooltips,
).interactive()






In [None]:

#creates graph of article numbers by street address

address_counts = merged_df['search_term'].value_counts()

address_chart_title = ('Article numbers by address for')+" " + (street) + ' ' + 'from' +" "+ str(start_date)+" " + 'to' + " " + str(end_date)

print(address_chart_title)

print( " ")

print ('Hover over the data point with mouse to get precise details, scroll to focus on specific graph areas.')

print ('The graph can be saved using the three dot tool in the upper right hand corner.')

# converting to df and assigning new names to the columns
df_address_counts = (
    address_counts
    .rename_axis('address')
    .reset_index(name='number_of_articles')
)

# Every search term starts with the street number ("<number> <street slug>"),
# so take only the leading digits. Removing all non-digits instead would glue
# any digits in the street name onto the number (e.g. "12 5th_Avenue" -> 125).
df_address_counts['street_number'] = (
    df_address_counts['address'].str.extract(r'^(\d+)', expand=False).astype(int)
)

# sorting via street number
df_address_counts = df_address_counts.sort_values(by='street_number')

alt.Chart(df_address_counts, title=address_chart_title).mark_bar().encode(

    alt.X('street_number'),
    alt.Y('number_of_articles'),
    tooltip = [alt.Tooltip('address'),
               alt.Tooltip('number_of_articles')
              ]

).interactive()

In [None]:
# Show the consolidated street-level table on screen.

print(f"All the articles found for {street} from {start_date} to {end_date}.")
print(" ")
print(f"The complete dataset, including full article  text, URLS and people list, can be found as a saved CSV file in the  {street_name_dates} folder in your active path.")

# Show every row, and up to 300 characters of each cell. These settings stay in
# force for the cells that follow.
for option_name, option_value in (('display.max_rows', None), ('max_colwidth', 300)):
    pd.set_option(option_name, option_value)

summary_columns = ['search_term', 'article_heading', 'newspaper', 'date', 'page', 'people_in_text']
display(merged_df[summary_columns])


In [None]:
#creating wordclouds based on headings and article text

# Join the actual cell values into one string each. Converting the whole column
# with str() would give pandas' printed preview instead: row numbers, a
# "Name: ..., dtype: ..." footer, and text shortened according to the current
# display options.
text = ' '.join(merged_df["article_text"].dropna().astype(str))
headings_text = ' '.join(merged_df["article_heading"].dropna().astype(str))

wc = WordCloud(width=600, height=300, collocations=True,regexp=r"\w+", min_word_length= 3 )
hwc = WordCloud(width=600, height=300, collocations=True,regexp=r"\w+", min_word_length= 3 )

# generate() counts the words itself. Counting first and passing the printed
# counts back in would make every word appear about equally often.
wc.generate(text)
hwc.generate(headings_text)

# Display and save the wordcloud
print("Wordcloud based on heading text.")
display (hwc.to_image())
hwc.to_file(Path(street_path, f'{street_name_dates} headings text .png'))

print("Wordcloud based on article text.")
display (wc.to_image())
wc.to_file(Path(street_path, f'{street_name_dates} article text .png'))




In [None]:
#create and display the most common Trigrams (groups of three consecutive words in the text)

# Build the text from the article column itself so that this cell does not rely
# on a variable left behind by the wordcloud cell, and so that whole articles
# are counted rather than pandas' shortened printed preview of the column.
text = ' '.join(merged_df["article_text"].dropna().astype(str))

# Drop any non-ASCII characters before tokenising.
text = text.encode('utf-8').decode ('ascii','ignore')
blob = TextBlob(text)

# Count each three-word sequence and keep the 20 most frequent.
c1 = Counter(' '.join(trigram) for trigram in blob.ngrams(3)).most_common(20)
c2 = str(c1)
Path(street_path, f'{street_name_dates}top twenty trigrams.txt').write_text(c2)
print ("The 20 most common three word ngrams from the article text: ")
display (c1)

print (" All Ngrams and wordclouds saved to" + " " + street_name_dates + " " + "folder in active path.")

print ("All done")