# Notebook pour importer les données 

Source : https://huggingface.co/datasets/dell-research-harvard/AmericanStories/tree/main

Test API

In [3]:
import requests
import pandas as pd
import json
import os
from dotenv import load_dotenv
import tarfile

In [4]:
load_dotenv()
url="https://huggingface.co/api/datasets/dell-research-harvard/AmericanStories"
token=os.getenv("HuggingFaceToken")
print(token)
headers = {
    "Authorization" : f"Bearer {token}"
}

None


In [3]:
def url_request(fileName):
    url= f"https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/{fileName}"
    print(url)
    return url

In [4]:
wb=requests.get(url_request("faro_1944.tar.gz"), headers=headers)
print(wb.status_code)  

https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/faro_1944.tar.gz


KeyboardInterrupt: 

In [5]:
with open("faro_1944.tar.gz", "wb") as f:
    f.write(wb.content)

In [6]:
import tarfile
with tarfile.open("faro_1944.tar.gz", "r:gz") as tar:
    tar.extractall(path="faro_1944")

Parcous de tout les JSON du dossier pour récupérer tout les noms de journaux

In [18]:
import os
year=1929
files=os.listdir(f"ArticlesTarGz/faro_{year}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{year}")
print(len(files))

33138


In [None]:

n=0
newspapers=[]
for i in files:
    n+=1
    if n%1000==0:
        print(n, end=" ")
    #open the json file and get the newspaper name
    with open(f"ArticlesTarGz/faro_{year}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{year}/{i}") as f:
        data = json.load(f)
    for j in data['full articles']:
        newspapers.append((data["lccn"]["title"],j['article']))
        
    

KeyError: 'lccn'

In [23]:
file=files[0]
with open(f"ArticlesTarGz/faro_{year}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{year}/{file}") as f:
    data = json.load(f)
print(data.keys())
print(data['lccn']['title'])
print(data["edition"]["date"])

dict_keys(['lccn', 'edition', 'page_number', 'scan', 'bboxes', 'full articles'])
Evening star.
1929-01-01


In [6]:
print(len(newspapers))
df=pd.DataFrame(newspapers, columns=["newspaper","article"])
df=df[df["article"].str.contains("inflation",case=False)]
print(len(df))

433769
145


In [7]:
print(df.head())
print(df["article"].iloc[0])

                     newspaper  \
30    The Daily Alaska empire.   
103             The messenger.   
1193             Evening star.   
1565   The Waterbury Democrat.   
3259    Imperial Valley press.   

                                                article  
30    cific-"With Gods help, we are on Jour way back...  
103   "The one outstanding news development of 1943 ...  
1193  MEXICO CITY, Jan. 3 LP.-An\nanti-inflation pro...  
1565  OPA continues it's indefensible fiat that an e...  
3259  and still maintain the American\nstandards\n\n...  
cific-"With Gods help, we are on Jour way back" declared General Macarthur after the Allied victory at Lac Sept 16) That victory was only one in long series from the capture of Guadalcanal tFeb. lW to the reconquest of the Gilberts in ovember. They broke Japan's grip in the south Pacific and opened the way for a grand offen- sive. Admiral Nimita called the Gil- bert invasion another road to To- kyo.' and added, in due time we'll have enough eq

# Fonctions pour manipuler les données 

Maintenant nous allons définir quelques fonctions pour intéragir simplement avec la base de données sans avoir à tout télécharger car le volume des données rend le téléchargement intégral difficile.

Ces fonctions définies ci-dessous seront aussi définies dans le fichier *getData.py* pour pouvoir être importées dans le notebook principal

load_credentials renvoie le header contenant le token pour pouvoir requêter l'API

In [4]:
def load_credentials():
    load_dotenv()
    url="https://huggingface.co/api/datasets/dell-research-harvard/AmericanStories"
    token=os.getenv("HuggingFaceToken")
    headers = {
        "Authorization" : f"Bearer {token}"
    }
    return headers

download_targz prend en argument les années que l'on veut télécharger et les télécharge dans le dossier *ArticlesTarGz*

In [5]:
def download_targz(years):
    if isinstance(years, str):
        years=[years]
    os.makedirs("ArticlesTarGz", exist_ok=True)
    for i in years:
        print(i, end=" ")
        file=f"faro_{i}.tar.gz"
        url=f"https://huggingface.co/datasets/dell-research-harvard/AmericanStories/resolve/main/{file}"
        wb=requests.get(url, headers=headers)
        print(wb.status_code)
        if wb.ok:
            with open("ArticlesTarGz/"+file, "wb") as f:
                f.write(wb.content)
        else:
            print(f"Error {wb.status_code} downloading {file}")

In [10]:
headers=load_credentials()
download_targz([i for i in range(1929,1931)])

1929 200
1930 200


In [2]:
def extract_targz(years):
    if isinstance(years, str):
        years=[years]
    for i in years:
        print(i, end=" ")
        try :
            with tarfile.open(f"ArticlesTarGz/faro_{i}.tar.gz", "r:gz") as tar:
                tar.extractall(path=f"ArticlesTarGz/faro_{i}")
        except FileNotFoundError:
            print(f"Error extracting {i} The file is not found")

In [13]:
import tarfile

extract_targz([i for i in range(1929,1931)])

1929 1930 

In [14]:
def fold_left_local(fonction, acc, years):
    if isinstance(years, str):
        years=[years]
    for i in years:
        print(i, end=" ")
        files=os.listdir(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}")
        for j in files:
            with open(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}/{j}") as f:
                data = json.load(f)
            acc=fonction(data, acc, (data["lccn"]["title"],data['edition']['date'],data['full articles']))
    return acc

In [1]:
def map_local(fonction,years):
    if isinstance(years, str):
        years=[years]
    acc=[]
    for i in years:
        print(i, end=" ")
        files=os.listdir(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}")
        for j in files:
            with open(f"ArticlesTarGz/faro_{i}/mnt\\122a7683-fa4b-45dd-9f13-b18cc4f4a187/ca_rule_based_fa_clean/faro_{i}/{j}") as f:
                data = json.load(f)
            if data['full articles']!=[]:
                try :
                    acc.append(fonction(data["lccn"]["title"],data['edition']['date'][:-3],data['full articles']))
                except:
                    acc.append(fonction("No titles found",data['edition']['date'][:-3],data['full articles']))
    return acc
    


In [2]:
def filter_and_freq(df,isInflation,title,date,articles):
    if not (date in df.keys()):
        df[date]=0,0
    for i in range(len(articles)): 
        f,n=df[date]
        if isInflation(articles[i]['article']):            
            df[date]=(f+1,n+1)
        else :
            df[date]=(f,n+1)
            articles[i]=None
    articles=[i for i in articles if i is not None]
    return (title,date,articles)

In [4]:
from functools import partial
frequences={}
def isInflation(article):
    return article.lower().find("inflation")!=-1
f_f=partial(filter_and_freq, frequences, isInflation)

NameError: name 'filter_and_freq' is not defined

In [18]:
f_f("newspaper","date"[:-3],[{"article" : "inflation is a problem in the US"},{"article" : "bread is a problem in the US"}])
print(frequences)

{'d': (1, 2)}


In [6]:
inflationArticles=map_local(f_f,[i for i in range(1920,1921)])

1920 

In [20]:
frequences=pd.DataFrame(frequences).T

In [10]:
type(inflationArticles)
len(inflationArticles)
print(inflationArticles[0])
curated=[i for i in inflationArticles if len(i[2])>0]

('The commoner.', '1920-01', [])


In [11]:
len(curated)

59

In [1]:
import getNews as gn

gn.delete_files([i for i in range(1919,1923)])

Error deleting 1919 The file is not found
Error deleting 1920 The file is not found
Error deleting 1921 The file is not found
Error deleting 1922 The file is not found


In [16]:
import os
import pandas as pd

parquets=os.listdir("ArticlesInflation")
parquets.sort
parquets=parquets[:-1]
df=pd.DataFrame()
for i in parquets:
    print(i)
    df=pd.concat([df,pd.read_parquet("ArticlesInflation/"+i)])


FileNotFoundError: [Errno 2] No such file or directory: 'ArticlesInflation'

In [11]:
import s3fs
import os

load_dotenv()
fs = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': 'https://'+'minio.lab.sspcloud.fr'},
    key = os.environ["AWS_ACCESS_KEY_ID"], 
    secret = os.environ["AWS_SECRET_ACCESS_KEY"], 
    token = os.environ["AWS_SESSION_TOKEN"])

MY_BUCKET = "rapamel"
fs.ls(MY_BUCKET)
path=f"{MY_BUCKET}/diffusion/ProjetDataScienceInflation"

In [17]:
df=pd.read_parquet("ArticlesInflation/AllInflation.parquet")

FileNotFoundError: [Errno 2] No such file or directory: 'ArticlesInflation/AllInflation.parquet'

In [18]:
with fs.open(f"{path}/AllInflation.parquet", 'wb') as f:
    df.to_parquet(f)

NameError: name 'df' is not defined

In [19]:
fs.ls(f"{MY_BUCKET}/diffusion/ProjetDataScienceInflation")

['rapamel/diffusion/ProjetDataScienceInflation/.keep',
 'rapamel/diffusion/ProjetDataScienceInflation/AllInflation.parquet']