## Getting Started with Data Sourcing

In [None]:
# Import modules 
import pandas as pd 

### API Connections

In [None]:
# Import API-related Python modules 
import json 
import certifi 
# import ssl,  if necessary
import urllib3 
from urllib3 import request 

In [None]:
# Endpoint to query; this is the same URL the pipeline section of this notebook uses
url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'

# Create a connection pool that verifies TLS certificates against certifi's CA bundle
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# Decode the response body as UTF-8 JSON and flatten it into a dataframe
data = json.loads(http.request('GET', url).data.decode('utf-8'))
api_df = pd.json_normalize(data)

### Databases

In [None]:
# Import Python's built-in SQLite database module
import sqlite3 

In [None]:
# Read sqlite query results into a pandas DataFrame.
# sqlite3's `with conn:` block only scopes the transaction and never closes the
# connection, so open and close it explicitly instead.
conn = sqlite3.connect("movies.sqlite")
try:
    df_sqlite = pd.read_sql("SELECT * from movies", conn)
finally:
    conn.close()

# Display the first five rows
df_sqlite.head()

### Data from Webpages

In [None]:
# Import modules 
import numpy as np  
from unicodedata import normalize 

In [None]:
# Define the HTML variable
html_source = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'

# Read every table on the page whose text matches 'by country'.
# pd.read_html returns a list of DataFrames, one per matching table.
df_html = pd.read_html(html_source, match='by country')

# Let's see how many tables matched 'by country'
print(len(df_html))  # 4 tables when this notebook was written; the live page may differ

# Let's see the first table
df_html[0]

## Data Extraction Pipeline using Python

In [None]:
# Import modules  
import json  
import sqlite3  
import certifi  
import urllib3
import pandas as pd 

### Creating a Data Extraction Pipeline using Python

In [None]:
# Function to Import CSV Data  
def source_data_from_csv(csv_file_name):
    """
    Load a CSV file into a pandas DataFrame.

    :param csv_file_name: file location handed to ``pd.read_csv``
    :return: the parsed DataFrame, or an empty DataFrame if reading raises any Exception
    """
    try:
        return pd.read_csv(csv_file_name)
    except Exception:
        # Best-effort load: the failure is swallowed and signalled by an empty frame
        return pd.DataFrame()

In [None]:
# Function to Import Parquet Data  
def source_data_from_parquet(parquet_file_name):
    """
    Load a Parquet file into a pandas DataFrame.

    :param parquet_file_name: file location handed to ``pd.read_parquet``
    :return: the loaded DataFrame, or an empty DataFrame if reading raises any Exception
    """
    try:
        return pd.read_parquet(parquet_file_name)
    except Exception:
        # Best-effort load: the failure is swallowed and signalled by an empty frame
        return pd.DataFrame()

In [None]:
# Function to Import API Data  
def source_data_from_api(api_endpoint):
    """
    Fetch JSON from an HTTP endpoint and flatten it into a pandas DataFrame.

    :param api_endpoint: URL requested with a GET
    :return: DataFrame built by ``pd.json_normalize`` from the UTF-8 decoded JSON body when
             the response status is 200; an empty DataFrame for any other status or when
             any Exception is raised along the way
    """
    try:
        # Create a Pool manager that verifies certificates against certifi's CA bundle
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        api_response = http.request('GET', api_endpoint)
        api_status = api_response.status

        # Only parse the body when the API answered successfully
        if api_status == 200:
            data = json.loads(api_response.data.decode('utf-8'))
            df_api = pd.json_normalize(data)

        # Any other status code: fall back to an empty frame
        else:
            df_api = pd.DataFrame()

    # Certificate verification is deliberately left on (CERT_REQUIRED) rather than
    # silenced; any exception raised above falls back to an empty frame.
    except Exception:
        df_api = pd.DataFrame()
    return df_api

In [None]:
# Function to Import SQLite Data
def source_data_from_table(db_name, table_name):
    """
    Read every row of one SQLite table into a pandas DataFrame.

    :param db_name: database file handed to ``sqlite3.connect``
    :param table_name: table to select from; it is interpolated directly into the SQL
                       text, so only pass trusted identifiers
    :return: DataFrame with the table contents, or an empty DataFrame if connecting or
             querying raises any Exception
    """
    try:
        conn = sqlite3.connect(db_name)
        try:
            # Read sqlite query results into a pandas DataFrame
            df_table = pd.read_sql(f"SELECT * from {table_name}", conn)
        finally:
            # sqlite3's `with conn:` block only scopes the transaction and never
            # closes the connection, so release the handle explicitly.
            conn.close()

    except Exception:
        df_table = pd.DataFrame()
    return df_table

In [None]:
# Function to Import Webpage Data  
def source_data_from_webpage(web_page_url, matching_keyword):
    """
    Read the first HTML table matching a keyword into a pandas DataFrame.

    :param web_page_url: page location handed to ``pd.read_html``
    :param matching_keyword: value passed as ``match`` to ``pd.read_html``
    :return: the first table in the list ``pd.read_html`` returns, or an empty DataFrame
             if reading or indexing raises any Exception
    """
    try:
        matching_tables = pd.read_html(web_page_url, match=matching_keyword)
        return matching_tables[0]
    except Exception:
        # Best-effort load: the failure is swallowed and signalled by an empty frame
        return pd.DataFrame()

In [None]:
# Function to Import All Data Sources  
def extracted_data_files():

    """
    Run every source-specific loader once, for loading into the VSA (Virtual Staging Area).

    The loaders are called in this order: parquet, CSV, API, SQLite table, webpage.
    :return: tuple ``(df_parquet, df_csv, df_api, df_table, df_html)``, one dataframe
             per input data source
    """

    # Where each source lives
    parquet_path = "data/yellow_tripdata_2022-01.parquet"
    csv_path = "data/h9gi-nx95.csv"
    endpoint = "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500"
    sqlite_db = "data/movies.sqlite"
    sqlite_table = "movies"
    page_url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
    page_keyword = "by country"

    # Load the sources one at a time, in the same order they are returned
    parquet_frame = source_data_from_parquet(parquet_path)
    csv_frame = source_data_from_csv(csv_path)
    api_frame = source_data_from_api(endpoint)
    table_frame = source_data_from_table(sqlite_db, sqlite_table)
    html_frame = source_data_from_webpage(page_url, page_keyword)

    return parquet_frame, csv_frame, api_frame, table_frame, html_frame

In [None]:
# Test it Out
# extracted_data_files() returns (parquet, csv, api, table, html); keep only the parquet frame
df_parquet = extracted_data_files()[0]
df_parquet.head()