# ETL


## Extract

In [1]:
import pandas as pd

def extract_from_csv(file_to_process):
    """Read a CSV file into a DataFrame.

    Args:
        file_to_process: Path of the CSV file to read.

    Returns:
        pandas.DataFrame holding the file's contents.
    """
    # read_csv has no ``lines`` keyword (that option belongs to read_json);
    # passing it raises TypeError, so it is dropped here.
    return pd.read_csv(file_to_process)

In [4]:
def extract_from_json(file_to_process):
    """Read a line-delimited JSON file (one object per line) into a DataFrame."""
    return pd.read_json(file_to_process, lines=True)

In [3]:
# df = extract_from_csv('source1.csv')  #file format can be either json or csv

In [5]:
def extract():
    """Gather every CSV and JSON file in the current directory into one DataFrame.

    Returns:
        pandas.DataFrame that always has the columns ``name``, ``height``
        and ``weight``; it is empty when no source file is found.
    """
    # glob is not imported anywhere above this cell, so import it locally.
    import glob

    # Seed with an empty frame so the expected columns exist even without input
    frames = [pd.DataFrame(columns=['name', 'height', 'weight'])]

    # process all csv files
    frames.extend(extract_from_csv(csvfile) for csvfile in glob.glob("*.csv"))

    # process all json files
    frames.extend(extract_from_json(jsonfile) for jsonfile in glob.glob("*.json"))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    return pd.concat(frames, ignore_index=True)

## Transform

In [6]:
def transform(data):
    """Convert height to metres and weight to kilograms, rounded to two decimals.

    The ``height`` and ``weight`` columns of the frame passed in are
    overwritten, and that same frame is returned.
    """
    inches_to_m = 0.0254         # 1 inch = 0.0254 m
    pounds_to_kg = 0.45359237    # 1 pound = 0.45359237 kg

    data['height'] = (data['height'] * inches_to_m).round(2)
    data['weight'] = (data['weight'] * pounds_to_kg).round(2)
    return data
    

## Load and log

In [None]:
def load(targetfile, data_to_load):
    """Write the DataFrame ``data_to_load`` to ``targetfile`` in CSV format."""
    data_to_load.to_csv(path_or_buf=targetfile)
    
target_file = "transformed_data.csv"

# Run the extract and transform steps defined above so there is data to write;
# nothing before this point creates ``transformed_data``.
transformed_data = transform(extract())

# The variable is ``target_file``; ``targetfile`` is only load()'s parameter name.
load(target_file, transformed_data)

In [None]:
from datetime import datetime

In [7]:
def log(message):
    """Append a timestamped entry to ``logfile.txt``.

    Args:
        message: Text written after the timestamp, separated by a comma.
    """
    # %b (abbreviated month name) replaces %h: %h is a platform-specific
    # extension that is not among Python's documented strftime directives.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)
    with open("logfile.txt", "a") as f:
        f.write(timestamp + ',' + message + '\n')

## ETL LAB


In [None]:
# Run in the terminal:
wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/source.zip
b. Unzip the downloaded file.

unzip source.zip

In [None]:
import glob
import pandas as pd 
import xml.etree.ElementTree as ET 
from datetime import datetime

# File that log_progress() appends its timestamped entries to
log_file = "log_file.txt"
# CSV file that the transformed data is written to by load_data()
target_file = "transformed_data.csv"


def extract_from_csv(file_to_process):
    """Return the contents of the CSV file at ``file_to_process`` as a DataFrame."""
    return pd.read_csv(file_to_process)

def extract_from_json(file_to_process):
    """Return a DataFrame built from a file holding one JSON object per line."""
    return pd.read_json(file_to_process, lines=True)

def extract_from_xml(file_to_process):
    """Parse an XML file of person records into a DataFrame.

    Every child of the root element must contain ``name``, ``height`` and
    ``weight`` sub-elements.

    Args:
        file_to_process: Path of the XML file to parse.

    Returns:
        pandas.DataFrame with the columns ``name``, ``height`` and
        ``weight``; height and weight are converted to float.
    """
    columns = ['name', 'height', 'weight']
    root = ET.parse(file_to_process).getroot()

    # Collect plain rows and build the frame once; concatenating a one-row
    # frame per record copies the whole frame on every iteration.
    rows = [
        {
            "name": person.find("name").text,
            "height": float(person.find("height").text),
            "weight": float(person.find("weight").text),
        }
        for person in root
    ]
    return pd.DataFrame(rows, columns=columns)


def extract():
    """Combine every CSV, JSON and XML file in the current directory.

    The CSV named by ``target_file`` is skipped: it is this job's own
    output, and ``*.csv`` would otherwise read it back as a source on every
    run after the first.

    Returns:
        pandas.DataFrame that always has the columns ``name``, ``height``
        and ``weight``; it is empty when no source file is found.
    """
    # Seed with an empty frame so the expected columns exist even without input
    frames = [pd.DataFrame(columns=['name', 'height', 'weight'])]

    # process all csv files, except the one this job writes
    frames.extend(
        extract_from_csv(csvfile)
        for csvfile in glob.glob("*.csv")
        if csvfile != target_file
    )

    # process all json files
    frames.extend(extract_from_json(jsonfile) for jsonfile in glob.glob("*.json"))

    # process all xml files
    frames.extend(extract_from_xml(xmlfile) for xmlfile in glob.glob("*.xml"))

    # One concat at the end instead of re-copying the result per file
    return pd.concat(frames, ignore_index=True)

def transform(data):
    """Rescale the height and weight columns and return the same frame.

    Height is multiplied by 0.0254 (inches to meters) and weight by
    0.45359237 (pounds to kilograms); both are rounded to two decimals.
    The columns are overwritten on the frame that was passed in.
    """
    meters_per_inch = 0.0254
    kilograms_per_pound = 0.45359237

    data['height'] = data.height.mul(meters_per_inch).round(2)
    data['weight'] = data.weight.mul(kilograms_per_pound).round(2)
    return data

#to_csv() allows to save a dataframe to csv file
def load_data(target_file, transformed_data):
    """Save the DataFrame ``transformed_data`` as a CSV file at ``target_file``."""
    transformed_data.to_csv(path_or_buf=target_file)

def log_progress(message):
    """Append a timestamped entry to the file named by ``log_file``.

    Args:
        message: Text written after the timestamp, separated by a comma.
    """
    # %b (abbreviated month name) replaces %h: %h is a platform-specific
    # extension that is not among Python's documented strftime directives.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(timestamp + ',' + message + '\n')



# Run the ETL job end to end, writing a log entry before and after each phase.

# Log the initialization of the ETL process
log_progress("ETL Job Started")

# Log the beginning of the Extraction process
log_progress("Extract phase Started")
extracted_data = extract()

# Log the completion of the Extraction process
log_progress("Extract phase Ended")

# Log the beginning of the Transformation process
log_progress("Transform phase started")
# transform() overwrites the columns of extracted_data and returns that same frame
transformed_data = transform(extracted_data)
print("Transformed data: ", transformed_data)

# Log the completion of the Transformation process
log_progress("Transform phase Ended")

# Log the beginning of the Loading process
log_progress("Load phase Started")
load_data(target_file,transformed_data)


# Log the completion of the Loading process
log_progress("Load phase Ended")

# Log the completion of the ETL process
log_progress("ETL Job Ended")

# Practice lab exercise

In [None]:
import glob
import pandas as pd 
import xml.etree.ElementTree as ET 
from datetime import datetime 

# File that log_progress() appends its timestamped entries to
log_file = "log_file.txt"
# CSV file that the transformed data is written to by load_data()
target_file = "transformed_data.csv"

def extract_from_csv(file_to_process):
    """Load the CSV file at ``file_to_process`` and return it as a DataFrame."""
    return pd.read_csv(file_to_process)


def extract_from_json(file_to_process):
    """Load a file with one JSON object per line and return it as a DataFrame."""
    return pd.read_json(file_to_process, lines=True)


def extract_from_xml(file_to_process):
    """Parse an XML file of car records into a DataFrame.

    Every child of the root element must contain ``car_model``,
    ``year_of_manufacture``, ``price`` and ``fuel`` sub-elements.

    Args:
        file_to_process: Path of the XML file to parse. The original
            signature lacked this parameter although the body and the
            caller in extract() both use it.

    Returns:
        pandas.DataFrame with the columns ``car_model``,
        ``year_of_manufacture``, ``price`` and ``fuel``; price is converted
        to float, the other values are kept as text.
    """
    # ``car_model`` (not ``car_mode``) matches the column that extract() declares.
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    root = ET.parse(file_to_process).getroot()

    # Collect plain rows and build the frame once; concatenating a one-row
    # frame per record copies the whole frame on every iteration.
    # NOTE(review): year_of_manufacture stays a string here; confirm whether
    # it should be converted to int to match the CSV/JSON sources.
    rows = [
        {
            "car_model": car.find('car_model').text,
            "year_of_manufacture": car.find('year_of_manufacture').text,
            "price": float(car.find('price').text),
            "fuel": car.find('fuel').text,
        }
        for car in root
    ]
    return pd.DataFrame(rows, columns=columns)


def extract():
    """Combine every CSV, JSON and XML file in the current directory.

    The CSV named by ``target_file`` is skipped: it is this job's own
    output, and ``*.csv`` would otherwise read it back as a source on every
    run after the first.

    Returns:
        pandas.DataFrame that always has the columns ``car_model``,
        ``year_of_manufacture``, ``price`` and ``fuel``; it is empty when no
        source file is found.
    """
    # Seed with an empty frame so the expected columns exist even without input
    frames = [
        pd.DataFrame(columns=['car_model', 'year_of_manufacture', 'price', 'fuel'])
    ]

    # CSV sources, except the file this job writes
    frames.extend(
        extract_from_csv(csvfile)
        for csvfile in glob.glob("*.csv")
        if csvfile != target_file
    )

    # JSON sources
    frames.extend(extract_from_json(jsonfile) for jsonfile in glob.glob("*.json"))

    # XML sources; the pattern needs the ``*`` wildcard, a bare ".xml" only
    # matches a file literally named ".xml"
    frames.extend(extract_from_xml(xmlfile) for xmlfile in glob.glob("*.xml"))

    # One concat at the end instead of re-copying the result per file
    return pd.concat(frames, ignore_index=True)


def transform(data):
    """Round the ``price`` column to two decimals and return the same frame.

    The column is overwritten on the frame that was passed in.
    """
    rounded_price = data['price'].round(2)
    data['price'] = rounded_price
    return data


def load_data(target_file, transformed_data):
    """Write the DataFrame ``transformed_data`` to ``target_file`` as CSV."""
    transformed_data.to_csv(path_or_buf=target_file)

def log_progress(message):
    """Append a timestamped entry to the file named by ``log_file``.

    Args:
        message: Text written after the timestamp, separated by `` , ``.
    """
    # %b (abbreviated month name) replaces %h: %h is a platform-specific
    # extension that is not among Python's documented strftime directives.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_file, 'a') as f:
        f.write(timestamp + ' , ' + message + '\n')

# Run the ETL job end to end, writing a log entry before and after each phase.

# Log the initialization of the ETL process
log_progress("ETL Job Started")

# Log the beginning of the Extraction process
log_progress("Extract phase Started")
extracted_data = extract()

# Log the completion of the Extraction process
log_progress("Extract phase Ended")

# Log the beginning of the Transformation process
log_progress("Transform phase started")
# transform() overwrites the price column of extracted_data and returns that same frame
transformed_data = transform(extracted_data)
print("Transformed data: ", transformed_data)

# Log the completion of the Transformation process
log_progress("Transform phase Ended")

# Log the beginning of the Loading process
log_progress("Load phase Started")
load_data(target_file,transformed_data)

# Log the completion of the Loading process
log_progress("Load phase Ended")

# Log the completion of the ETL process
log_progress("ETL Job Ended")