In [1]:
import glob                      
import pandas as pd                 
from datetime import datetime

In [2]:
# Fetch the lab's source archive into data/ppl/source.
# -nc (no-clobber): if source.zip is already there, skip the download instead of
# saving a duplicate as source.zip.1, source.zip.2, ... on every re-run of this cell.
# Delete data/ppl/source/source.zip first to force a fresh download.
!wget -nc -P data/ppl/source https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/source.zip

--2023-06-13 22:45:04--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0221EN-SkillsNetwork/labs/module%206/Lab%20-%20Extract%20Transform%20Load/data/source.zip
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 

169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2707 (2,6K) [application/zip]
Saving to: ‘data/ppl/source/source.zip’


2023-06-13 22:45:04 (427 MB/s) - ‘data/ppl/source/source.zip’ saved [2707/2707]



In [3]:
# Unpack the downloaded archive next to it, into data/ppl/source.
# -o overwrites files that already exist without prompting, so the cell can be
# re-run non-interactively.
!unzip -o data/ppl/source/source.zip -d data/ppl/source

Archive:  data/ppl/source/source.zip
  inflating: data/ppl/source/source3.json  
  inflating: data/ppl/source/source1.csv  
  inflating: data/ppl/source/source2.csv  
  inflating: data/ppl/source/source3.csv  
  inflating: data/ppl/source/source1.json  
  inflating: data/ppl/source/source2.json  
  inflating: data/ppl/source/source1.xml  
  inflating: data/ppl/source/source2.xml  
  inflating: data/ppl/source/source3.xml  


In [4]:
# Root directory for everything this notebook reads and writes.
folder_path = "data/ppl"

# Input: folder holding the unzipped source CSV / JSON / XML files.
sourcefolder = f"{folder_path}/source"

# Outputs: the run log appended to by log(), and the CSV written by load().
logfile = f"{folder_path}/logfile.txt"
targetfile = f"{folder_path}/transformed_data.csv"

### Extract


In [5]:
def extract_from_csv(file_to_process):
    """Load one CSV source into a DataFrame.

    Parameters
    ----------
    file_to_process : str or file-like
        Path (or open buffer) passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, read with pandas' default CSV options.
    """
    return pd.read_csv(file_to_process)

In [6]:
def extract_from_json(file_to_process):
    """Load one JSON Lines source into a DataFrame.

    Parameters
    ----------
    file_to_process : str or file-like
        Path (or open buffer) passed to ``pd.read_json``.

    Returns
    -------
    pandas.DataFrame
        One row per input line; ``lines=True`` makes pandas treat every line
        of the input as a separate JSON object.
    """
    return pd.read_json(file_to_process, lines=True)

In [7]:
def extract_from_xml(file_to_process):
    """Load one XML source into a DataFrame.

    Parameters
    ----------
    file_to_process : str or file-like
        Path (or open buffer) passed to ``pd.read_xml``.

    Returns
    -------
    pandas.DataFrame
        One row per repeated child element of the document root, as parsed by
        ``pd.read_xml`` with its default XPath.

    Notes
    -----
    ``pd.read_xml`` uses the optional ``lxml`` package by default and raises
    ``ImportError`` when it is not installed. In that case the read is retried
    with the standard-library ``etree`` parser, so the notebook also runs in
    environments without ``lxml``.
    """
    try:
        return pd.read_xml(file_to_process)
    except ImportError:
        # pandas checks for lxml before touching the input, so retrying with the
        # same path/buffer is safe.
        return pd.read_xml(file_to_process, parser="etree")

In [8]:
def extract(source_data):
    """Collect every CSV, JSON and XML file in a folder into one DataFrame.

    Parameters
    ----------
    source_data : str
        Folder to scan. Files are matched by extension (``*.csv``, ``*.json``,
        ``*.xml``) directly inside this folder; subfolders are not searched.

    Returns
    -------
    pandas.DataFrame
        All rows from all matched files with a fresh 0..n-1 index. Rows come in
        the order CSV files, then JSON files, then XML files. The columns
        ``name``, ``height`` and ``weight`` always come first (filled with NaN
        where a file lacks them); any other columns found in the files follow.
        If no file matches, an empty frame with just those three columns is
        returned.
    """
    expected_columns = ['name', 'height', 'weight']
    readers = (
        ("csv", extract_from_csv),
        ("json", extract_from_json),
        ("xml", extract_from_xml),
    )

    # Read everything first and concatenate once; growing a DataFrame with
    # pd.concat inside the loop re-copies all previous rows on every iteration.
    frames = []
    for extension, reader in readers:
        # glob returns files in filesystem order; sort so the row order is
        # the same on every run and every machine.
        for path in sorted(glob.glob(f"{source_data}/*.{extension}")):
            frames.append(reader(path))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    combined = pd.concat(frames, ignore_index=True)
    extra_columns = [col for col in combined.columns if col not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

### Transform
The transform function does the following tasks.

1. Convert height, which is in inches, to meters (rounded to 2 decimal places)
2. Convert weight, which is in pounds, to kilograms (rounded to 2 decimal places)

In [9]:
def transform(data):
    """Return a converted copy of ``data`` with metric height and weight.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have ``height`` and ``weight`` columns whose values can be cast to
        float (per the notebook text: inches and pounds).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``height`` is multiplied by 0.0254 (inches to
        meters) and ``weight`` by 0.453592 (pounds to kilograms), both rounded
        to 2 decimal places. All other columns are carried over unchanged.

    Raises
    ------
    ValueError
        If a height or weight value cannot be converted to float.
    """
    inches_to_meters = 0.0254
    pounds_to_kilograms = 0.453592

    # Work on a copy: assigning into the caller's frame would silently convert
    # the extracted data too, and re-running the cell would convert it twice.
    result = data.copy()
    result['height'] = (result['height'].astype(float) * inches_to_meters).round(2)
    result['weight'] = (result['weight'].astype(float) * pounds_to_kilograms).round(2)
    return result

### Load

In [10]:
def load(targetfile, data_to_load):
    """Write ``data_to_load`` to ``targetfile`` as CSV.

    Parameters
    ----------
    targetfile : str or file-like
        Destination path (or writable buffer) for the CSV.
    data_to_load : pandas.DataFrame
        Frame to persist.

    Notes
    -----
    Uses ``DataFrame.to_csv`` defaults, so the row index is written as the
    first, unnamed column of the file.
    """
    data_to_load.to_csv(path_or_buf=targetfile)

### Logging


In [11]:
def log(message):
    """Append one timestamped line to the notebook's log file.

    Parameters
    ----------
    message : object
        Text to record; it is formatted with ``str()``, so non-string values
        are accepted too.

    Notes
    -----
    Each line has the form ``<timestamp>,<message>`` where the timestamp is the
    current local time, e.g. ``2023-Jun-13-22:45:04``. The file named by the
    module-level ``logfile`` is opened in append mode, so earlier runs are kept.
    """
    # %b is the documented, portable code for the abbreviated month name;
    # %h is a platform-specific alias that Python does not guarantee everywhere.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    with open(logfile, "a") as f:
        f.write(f"{timestamp},{message}\n")

#### Running

In [12]:
# Record the start of the whole run in the log file.
log("ETL Job Started")

In [14]:
# Extract phase, bracketed by log entries so the log file shows when it ran.
log("Extract phase Started")
extracted_data = extract(sourcefolder)
log("Extract phase Ended")
# Preview the first five extracted rows as the cell output.
extracted_data.head(5)

Unnamed: 0,name,height,weight
0,alex,65.78,112.99
1,ajay,71.52,136.49
2,alice,69.4,153.03
3,ravi,68.22,142.34
4,joe,67.79,144.3


In [15]:
# Transform phase, bracketed by log entries so the log file shows when it ran.
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
# Preview the first five transformed rows as the cell output.
transformed_data.head(5)

Unnamed: 0,name,height,weight
0,alex,1.67,51.25
1,ajay,1.82,61.91
2,alice,1.76,69.41
3,ravi,1.73,64.56
4,joe,1.72,65.45


In [16]:
# Load phase: write the transformed frame to targetfile, logging before and after.
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")

In [17]:
# Record the end of the whole run in the log file.
log("ETL Job Ended")