# Notebook pour importer les données 

Source : https://huggingface.co/datasets/dell-research-harvard/AmericanStories/tree/main

Test API

In [8]:
import requests
import pandas as pd
import json
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
url="https://huggingface.co/api/datasets/dell-research-harvard/AmericanStories"
token=os.getenv("HuggingFaceToken")
headers = {
    "Authorization" : f"Bearer {token}"
}

In [3]:
def url_request(fileName):
    url= f"https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/{fileName}"
    print(url)
    return url

In [4]:
wb=requests.get(url_request("faro_1944.tar.gz"), headers=headers)
print(wb.status_code)  

https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1944.tar.gz
200


In [5]:
with open("faro_1944.tar.gz", "wb") as f:
    f.write(wb.content)

In [6]:
import tarfile
with tarfile.open("faro_1944.tar.gz", "r:gz") as tar:
    tar.extractall(path="faro_1944")

Parcous de tout les JSON du dossier pour récupérer tout les noms de journaux

In [18]:
import os
year=1929
files=os.listdir(f"ArticlesTarGz/faro_{year}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{year}")
print(len(files))

33138


In [None]:

n=0
newspapers=[]
for i in files:
    n+=1
    if n%1000==0:
        print(n, end=" ")
    #open the json file and get the newspaper name
    with open(f"ArticlesTarGz/faro_{year}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{year}/{i}") as f:
        data = json.load(f)
    for j in data['full articles']:
        newspapers.append((data["lccn"]["title"],j['article']))
        
    

KeyError: 'lccn'

In [23]:
file=files[0]
with open(f"ArticlesTarGz/faro_{year}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{year}/{file}") as f:
    data = json.load(f)
print(data.keys())
print(data['lccn']['title'])
print(data["edition"]["date"])

dict_keys(['lccn', 'edition', 'page_number', 'scan', 'bboxes', 'full articles'])
Evening star.
1929-01-01


In [6]:
print(len(newspapers))
df=pd.DataFrame(newspapers, columns=["newspaper","article"])
df=df[df["article"].str.contains("inflation",case=False)]
print(len(df))

433769
145


In [7]:
print(df.head())
print(df["article"].iloc[0])

                     newspaper  \
30    The Daily Alaska empire.   
103             The messenger.   
1193             Evening star.   
1565   The Waterbury Democrat.   
3259    Imperial Valley press.   

                                                article  
30    cific-"With Gods help, we are on Jour way back...  
103   "The one outstanding news development of 1943 ...  
1193  MEXICO CITY, Jan. 3 LP.-An\nanti-inflation pro...  
1565  OPA continues it's indefensible fiat that an e...  
3259  and still maintain the American\nstandards\n\n...  
cific-"With Gods help, we are on Jour way back" declared General Macarthur after the Allied victory at Lac Sept 16) That victory was only one in long series from the capture of Guadalcanal tFeb. lW to the reconquest of the Gilberts in ovember. They broke Japan's grip in the south Pacific and opened the way for a grand offen- sive. Admiral Nimita called the Gil- bert invasion another road to To- kyo.' and added, in due time we'll have enough eq

# Fonctions pour manipuler les données 

Maintenant nous allons définir quelques fonctions pour intéragir simplement avec la base de données sans avoir à tout télécharger car le volume des données rend le téléchargement intégral difficile.

Ces fonctions définies ci-dessous seront aussi définies dans le fichier *getData.py* pour pouvoir être importées dans le notebook principal

load_credentials renvoie le header contenant le token pour pouvoir requêter l'API

In [9]:
def load_credentials():
    load_dotenv()
    url="https://huggingface.co/api/datasets/dell-research-harvard/AmericanStories"
    token=os.getenv("HuggingFaceToken")
    headers = {
        "Authorization" : f"Bearer {token}"
    }
    return headers

download_targz prend en argument les années que l'on veut télécharger et les télécharge dans le dossier *ArticlesTarGz*

In [10]:
def download_targz(years):
    if isinstance(years, str):
        years=[years]
    os.makedirs("ArticlesTarGz", exist_ok=True)
    for i in years:
        print(i, end=" ")
        file=f"faro_{i}.tar.gz"
        url=f"https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/{file}"
        wb=requests.get(url, headers=headers)
        print(wb.status_code)
        if wb.ok:
            with open("ArticlesTarGz/"+file, "wb") as f:
                f.write(wb.content)
        else:
            print(f"Error {wb.status_code} downloading {file}")

In [10]:
headers=load_credentials()
download_targz([i for i in range(1929,1931)])

1929 200
1930 200


In [12]:
def extract_targz(years):
    if isinstance(years, str):
        years=[years]
    for i in years:
        print(i, end=" ")
        try :
            with tarfile.open(f"ArticlesTarGz/faro_{i}.tar.gz", "r:gz") as tar:
                tar.extractall(path=f"ArticlesTarGz/faro_{i}")
        except FileNotFoundError:
            print(f"Error extracting {i} The file is not found")

In [13]:
import tarfile

extract_targz([i for i in range(1929,1931)])

1929 1930 

In [11]:
def fold_left_local(f, acc, years):
    if isinstance(years, str):
        years=[years]
    for i in years:
        print(i, end=" ")
        files=os.listdir(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}")
        for j in files:
            with open(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}/{j}") as f:
                data = json.load(f)
            acc=f(data, acc, (data["lccn"]["title"],data['edition']['date'],data['full articles']))
    return acc

In [114]:
def map_local(fonction,years):
    if isinstance(years, str):
        years=[years]
    acc=[]
    for i in years:
        print(i, end=" ")
        files=os.listdir(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}")
        for j in files:
            with open(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}/{j}") as f:
                data = json.load(f)
            if data['full articles']!=[]:
                try :
                    acc.append(fonction(data["lccn"]["title"],data['edition']['date'],data['full articles']))
                except:
                    acc.append(fonction("No titles found",data['edition']['date'],data['full articles']))
    return acc
    


In [115]:
def filter_and_freq(df,isInflation,title,date,articles):
    if not (date in df.keys()):
        df[date]=0
    for i in range(len(articles)): 
        if isInflation(articles[i]['article']):
            df[date]+=1
        else :
            articles[i]=None
    articles=[i for i in articles if i is not None]
    return (title,date,articles)

In [116]:
from functools import partial
frequences={}
def isInflation(article):
    return article.lower().find("inflation")!=-1
f_f=partial(filter_and_freq, frequences, isInflation)

In [117]:
f_f("newspaper","date",[{"article" : "inflation is a problem in the US"},{"article" : "bread is a problem in the US"}])
print(frequences)

{'date': 1}


In [118]:
inflationArticles=map_local(f_f,[i for i in range(1929,1931)])

1929 1930 

In [121]:
print(frequences)
frequences={k : v for k,v in frequences.items() if v>0}
print(frequences)

{'date': 1, '1929-01-24': 1, '1930-04-11': 1, '1930-11-01': 1, '1930-12-13': 1, '1930-12-20': 1}
{'date': 1, '1929-01-24': 1, '1930-04-11': 1, '1930-11-01': 1, '1930-12-13': 1, '1930-12-20': 1}


In [127]:
print(len(inflationArticles))
inflationArticles=[i for i in inflationArticles if len(i[2])>0]
print(inflationArticles)
#writing the inflation articles to a txt file
with open("inflationArticles.txt", "w") as f:
    for i in inflationArticles:
        f.write(f"{i[0]} {i[1]}\n")
        for j in i[2]:
            f.write(f"{j['article']}\n")
        f.write("\n\n")

5
[('The Daily Alaska empire.', '1929-01-24', [{'object_ids': [16, 29, 42, 3], 'headline': 'CLIMBS BACK\n\n TO POWER IN\n\n FRENCH STATE\n\nSenator and Former Pre-. mier Caillaux Now to\n\n Be Big Figure', 'article': 'PARIS, Jan. 24.-Parliamentary prophets are predicting that the Kaleidoscope of French politics will bring Senator and ex-Premier Joseph Caillaux back into the pie ture.\n\n Caillaux has declared against the governments policy of amortiza- ton of the public debt, has criti- cised the purchase of foreign ex- change through the issue of paper money, which he calls "back hand ed inflation," and has pronounced himself strongly in favor of reduc- ing taxation rather than of using the sinking fund to retire govern ment bonds.\n\n The ex-premier argues that am ortization in connection with the high rate of taxation will require the present generation to pay an unequal share of the cost of war, while 1t will unduly relieve future generations that will benefit from all the sacrific