# Data Collection

### CSV

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/us_indicators.csv')

In [3]:
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy
0,31/1/2001,1.32,1.5,89.24,147.07,inrange,ease
1,28/2/2001,1.44,1.5,90.47,145.49,inrange,ease
2,31/3/2001,1.44,1.5,89.33,140.02,inrange,ease
3,30/4/2001,2.52,1.5,87.32,138.88,inrange,ease
4,31/5/2001,2.76,1.5,87.16,139.49,inrange,ease
...,...,...,...,...,...,...,...
277,29/2/2024,-0.77,2.5,118.51,718.27,outrrange,tight
278,31/3/2024,-0.47,2.5,118.11,711.58,outrrange,tight
279,30/4/2024,0.19,2.5,116.80,695.69,outrrange,tight
280,31/5/2024,1.54,2.5,117.21,702.68,inrange,tight


### Excel

In [4]:
import pandas as pd
# Also require installing openpyxl

In [5]:
df = pd.read_excel('data/us_indicators.xlsx', sheet_name='Sheet1')

In [6]:
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy
0,2001-01-31,1.32,1.5,89.24,147.07,inrange,ease
1,2001-02-28,1.44,1.5,90.47,145.49,inrange,ease
2,2001-03-31,1.44,1.5,89.33,140.02,inrange,ease
3,2001-04-30,2.52,1.5,87.32,138.88,inrange,ease
4,2001-05-31,2.76,1.5,87.16,139.49,inrange,ease
...,...,...,...,...,...,...,...
277,2024-02-29,-0.77,2.5,118.51,718.27,outrrange,tight
278,2024-03-31,-0.47,2.5,118.11,711.58,outrrange,tight
279,2024-04-30,0.19,2.5,116.80,695.69,outrrange,tight
280,2024-05-31,1.54,2.5,117.21,702.68,inrange,tight


### Database

In [7]:
import sqlite3
import pandas as pd

In [8]:
# Create connection to database
conn = sqlite3.connect(database='data/data.db')

In [9]:
# Set query command
query = 'SELECT date, cpi, policy_rate, neer, money_supply FROM us_indicators'

In [10]:
df = pd.read_sql_query(query, conn)

In [11]:
# Close connection
conn.close()

In [12]:
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply
0,date,cpi,policy_rate,neer,money_supply
1,31/1/2001,1.32,1.5,89.24,147.07
2,28/2/2001,1.44,1.5,90.47,145.49
3,31/3/2001,1.44,1.5,89.33,140.02
4,30/4/2001,2.52,1.5,87.32,138.88
...,...,...,...,...,...
278,29/2/2024,-0.77,2.5,118.51,718.27
279,31/3/2024,-0.47,2.5,118.11,711.58
280,30/4/2024,0.19,2.5,116.8,695.69
281,31/5/2024,1.54,2.5,117.21,702.68


### JSON

In [13]:
import json
import pandas as pd

In [14]:
# Load JSON file as dictionary

with open('data/us_indicators.json') as file:
    data = json.load(file)

In [15]:
data

[{'date': '31/1/2001',
  'cpi': 1.32,
  'policy_rate': 1.5,
  'neer': 89.24,
  'money_supply': 147.07,
  'inflation_target': 'inrange',
  'type_of_monetary_policy': 'ease'},
 {'date': '28/2/2001',
  'cpi': 1.44,
  'policy_rate': 1.5,
  'neer': 90.47,
  'money_supply': 145.49,
  'inflation_target': 'inrange',
  'type_of_monetary_policy': 'ease'},
 {'date': '31/3/2001',
  'cpi': 1.44,
  'policy_rate': 1.5,
  'neer': 89.33,
  'money_supply': 140.02,
  'inflation_target': 'inrange',
  'type_of_monetary_policy': 'ease'},
 {'date': '30/4/2001',
  'cpi': 2.52,
  'policy_rate': 1.5,
  'neer': 87.32,
  'money_supply': 138.88,
  'inflation_target': 'inrange',
  'type_of_monetary_policy': 'ease'},
 {'date': '31/5/2001',
  'cpi': 2.76,
  'policy_rate': 1.5,
  'neer': 87.16,
  'money_supply': 139.49,
  'inflation_target': 'inrange',
  'type_of_monetary_policy': 'ease'},
 {'date': '30/6/2001',
  'cpi': 2.15,
  'policy_rate': 2.5,
  'neer': 88.3,
  'money_supply': 139.53,
  'inflation_target': 'inran

In [16]:
df = pd.DataFrame(data)

In [17]:
df

Unnamed: 0,date,cpi,policy_rate,neer,money_supply,inflation_target,type_of_monetary_policy
0,31/1/2001,1.32,1.5,89.24,147.07,inrange,ease
1,28/2/2001,1.44,1.5,90.47,145.49,inrange,ease
2,31/3/2001,1.44,1.5,89.33,140.02,inrange,ease
3,30/4/2001,2.52,1.5,87.32,138.88,inrange,ease
4,31/5/2001,2.76,1.5,87.16,139.49,inrange,ease
...,...,...,...,...,...,...,...
277,29/2/2024,-0.77,2.5,118.51,718.27,outrrange,tight
278,31/3/2024,-0.47,2.5,118.11,711.58,outrrange,tight
279,30/4/2024,0.19,2.5,116.80,695.69,outrrange,tight
280,31/5/2024,1.54,2.5,117.21,702.68,inrange,tight


### API

Get data from https://fred.stlouisfed.org/docs/api/fred/

In [18]:
import requests

In [19]:
api_key = '5fd98b97565288c4cecea09d41f7a4ea'
target_series = 'GNPCA'

In [20]:
url = 'https://api.stlouisfed.org/fred/series/observations'

params = {
    'series_id': target_series,
    'api_key': api_key,
    'file_type': 'json'
}

In [21]:
response = requests.get(url, params=params)

In [22]:
# Check if the request was successful
if response.status_code == 200:
    # Load response as dictionary
    data = response.json()
else:
    # If request was not successful, print reason
    print(f"Error: {response.status_code}")

In [23]:
data

{'realtime_start': '2024-10-04',
 'realtime_end': '2024-10-04',
 'observation_start': '1600-01-01',
 'observation_end': '9999-12-31',
 'units': 'lin',
 'output_type': 1,
 'file_type': 'json',
 'order_by': 'observation_date',
 'sort_order': 'asc',
 'count': 95,
 'offset': 0,
 'limit': 100000,
 'observations': [{'realtime_start': '2024-10-04',
   'realtime_end': '2024-10-04',
   'date': '1929-01-01',
   'value': '1202.659'},
  {'realtime_start': '2024-10-04',
   'realtime_end': '2024-10-04',
   'date': '1930-01-01',
   'value': '1100.67'},
  {'realtime_start': '2024-10-04',
   'realtime_end': '2024-10-04',
   'date': '1931-01-01',
   'value': '1029.038'},
  {'realtime_start': '2024-10-04',
   'realtime_end': '2024-10-04',
   'date': '1932-01-01',
   'value': '895.802'},
  {'realtime_start': '2024-10-04',
   'realtime_end': '2024-10-04',
   'date': '1933-01-01',
   'value': '883.847'},
  {'realtime_start': '2024-10-04',
   'realtime_end': '2024-10-04',
   'date': '1934-01-01',
   'value':

In [24]:
data['observations']

[{'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1929-01-01',
  'value': '1202.659'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1930-01-01',
  'value': '1100.67'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1931-01-01',
  'value': '1029.038'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1932-01-01',
  'value': '895.802'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1933-01-01',
  'value': '883.847'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1934-01-01',
  'value': '978.188'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1935-01-01',
  'value': '1065.716'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1936-01-01',
  'value': '1201.443'},
 {'realtime_start': '2024-10-04',
  'realtime_end': '2024-10-04',
  'date': '1937-01-01',
  

In [25]:
df = pd.DataFrame(data['observations'])

In [26]:
df

Unnamed: 0,realtime_start,realtime_end,date,value
0,2024-10-04,2024-10-04,1929-01-01,1202.659
1,2024-10-04,2024-10-04,1930-01-01,1100.67
2,2024-10-04,2024-10-04,1931-01-01,1029.038
3,2024-10-04,2024-10-04,1932-01-01,895.802
4,2024-10-04,2024-10-04,1933-01-01,883.847
...,...,...,...,...
90,2024-10-04,2024-10-04,2019-01-01,21000.945
91,2024-10-04,2024-10-04,2020-01-01,20482.341
92,2024-10-04,2024-10-04,2021-01-01,21648.657
93,2024-10-04,2024-10-04,2022-01-01,22176.949


### Web Scraping

Web scraping is not recommended as it often violates the terms of use for many websites.  
Whenever possible, use an API.

#### Table

Get data from https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics

In [27]:
import pandas as pd
# Also require installing lxml

In [28]:
url = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics'

In [29]:
# Get response
response = pd.read_html(url)

In [30]:
response

[     Year Image                  Laureate[a]          Country  \
 0    1901   NaN  Wilhelm Röntgen (1845–1923)    German Empire   
 1    1902   NaN  Hendrik Lorentz (1853–1928)      Netherlands   
 2    1902   NaN    Pieter Zeeman (1865–1943)      Netherlands   
 3    1903   NaN  Henri Becquerel (1852–1908)           France   
 4    1903   NaN     Pierre Curie (1859–1906)           France   
 ..    ...   ...                          ...              ...   
 226  2022   NaN       John Clauser (b. 1942)    United States   
 227  2022   NaN    Anton Zeilinger (b. 1945)          Austria   
 228  2023   NaN    Anne L'Huillier (b. 1958)    France Sweden   
 229  2023   NaN      Ferenc Krausz (b. 1962)  Austria Hungary   
 230  2023   NaN    Pierre Agostini (b. 1941)           France   
 
                                           Rationale[b]    Ref  
 0    "in recognition of the extraordinary services ...   [19]  
 1    "in recognition of the extraordinary service t...   [20]  
 2    "in r

In [31]:
df = response[0]

In [32]:
df

Unnamed: 0,Year,Image,Laureate[a],Country,Rationale[b],Ref
0,1901,,Wilhelm Röntgen (1845–1923),German Empire,"""in recognition of the extraordinary services ...",[19]
1,1902,,Hendrik Lorentz (1853–1928),Netherlands,"""in recognition of the extraordinary service t...",[20]
2,1902,,Pieter Zeeman (1865–1943),Netherlands,"""in recognition of the extraordinary service t...",[20]
3,1903,,Henri Becquerel (1852–1908),France,"""for his discovery of spontaneous radioactivity""",[21]
4,1903,,Pierre Curie (1859–1906),France,"""for their joint researches on the radiation p...",[21]
...,...,...,...,...,...,...
226,2022,,John Clauser (b. 1942),United States,"""for experiments with entangled photons, estab...",[126]
227,2022,,Anton Zeilinger (b. 1945),Austria,"""for experiments with entangled photons, estab...",[126]
228,2023,,Anne L'Huillier (b. 1958),France Sweden,"""for experimental methods that generate attose...",[127]
229,2023,,Ferenc Krausz (b. 1962),Austria Hungary,"""for experimental methods that generate attose...",[127]


#### Text

Get data from http://example.com

In [33]:
import requests
from bs4 import BeautifulSoup

In [34]:
url = 'http://example.com'

In [35]:
# Get response
response = requests.get(url)

In [36]:
# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the text content
    paragraphs = soup.find_all('p')

    # Create a list to store data
    data = []
    
    # Print the text content
    for paragraph in paragraphs:
        data.append(paragraph.get_text())
else:
    # If request was not successful, print reason
    print(f"Error: {response.status_code}")

In [37]:
data

['This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.',
 'More information...']

### Text file

In [38]:
# Load text line in the file as list

with open('data/fed_meeting.txt', 'r') as file:
    data = file.readlines()

In [39]:
data

['At the latest Federal Open Market Committee (FOMC) meeting, the Federal Reserve decided to keep the federal funds rate unchanged at 5.25%–5.50% for the seventh consecutive meeting. The Fed emphasized that it would not consider lowering rates until inflation is consistently moving toward its 2% target. Despite some easing in inflation over the past year, it remains elevated. The Fed continues to reduce its holdings of U.S. Treasury securities and agency mortgage-backed securities. The economic outlook remains uncertain, with the Fed noting that recent economic activity has expanded at a solid pace, job gains remain strong, and the unemployment rate is low. However, inflationary pressures persist, and the Fed remains vigilant regarding inflation risks. Looking forward, the Fed expects to keep rates steady until there is greater confidence that inflation is sustainably moving toward the 2% goal. The Committee also projects that there might be one rate cut later this year and several mor

### Picture

In [40]:
import os
from PIL import Image

In [41]:
# Target picture folder
folder_path = 'data/nasdaq'

In [42]:
# Target files in the folder
input_list = os.listdir(folder_path)

In [43]:
input_list

['nasdaq_1d.png', 'nasdaq_1m.png', 'nasdaq_1h.png']

In [44]:
# Create a list to store data
image_list = []

In [45]:
for filename in input_list:
    # Check if a file is picture
    if filename.endswith(".jpg") or filename.endswith(".png"):
        # Create full path to the file
        file_path = os.path.join(folder_path, filename)
        img = Image.open(file_path)
        image_list.append(img)

In [46]:
image_list

[<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=2318x1590>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=2318x1590>,
 <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=2318x1590>]

In [47]:
image_list[0].show()