### Import libraries

In [None]:
import os
import shutil
import tarfile

import pandas as pd
import requests

### Download the data file

In [86]:
# Function that downloads the .tgz file and unzips it
def download_unzip():
    """Download the toll-data archive and unpack it into ``source_files``.

    Side effects, in order:

    1. Recreates an empty ``output`` directory (anything already at that
       path is removed first).
    2. Downloads ``tolldata.tgz`` into the current working directory.
    3. Extracts the archive into the ``source_files`` directory.

    A non-200 HTTP status and extraction errors are reported with ``print``
    rather than raised. Exceptions raised by ``requests.get`` itself
    (connection errors, timeouts) propagate to the caller.
    """
    file_name = 'tolldata.tgz'  # local name of the downloaded archive
    url = ('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
           'IBM-DB0250EN-SkillsNetwork/labs/Final%20Assignment/tolldata.tgz')

    # Start every run from an empty output directory. shutil/os are used
    # instead of shelling out to `rm -r` so this also works without a POSIX shell.
    if os.path.isdir('output') and not os.path.islink('output'):
        shutil.rmtree('output')
    elif os.path.lexists('output'):
        os.remove('output')
    os.mkdir('output')

    # Download the file using the requests lib. The timeout bounds how long we
    # wait on a silent server; without it the call can block forever.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        with open(file_name, 'wb') as file:
            file.write(response.content)
    else:
        print("Something Wrong Happened")

    # Unpack the tar file into the source_files directory. If the download
    # failed, this still tries whatever archive is already on disk.
    try:
        with tarfile.open(file_name, 'r:gz') as tar:
            if hasattr(tarfile, 'data_filter'):
                # Safe extraction filter, where this Python version has it;
                # also avoids the warning about calling extractall unfiltered.
                tar.extractall('source_files', filter='data')
            else:
                tar.extractall('source_files')
    except tarfile.ReadError as e:
        print(f"Cannot read tar file: {e}")
    except FileNotFoundError as e:
        print(f"{file_name} is not found: {e}")
    except Exception as e:
        print(f"Something Wrong happend: {e}")
    else:
        # Only report success when extraction actually completed.
        print("File extracted successfully")

### Extract data from csv file
We need to extract just the following columns:
- Rowid
- Timestamp
- Anonymized Vehicle Number
- Vehicle Type

In [87]:
def extract_csv_data():
    """Copy the first four vehicle columns into ``output/csv_data.csv``.

    Reads the header-less ``source_files/vehicle-data.csv``, labels its six
    columns, and writes Rowid, Timestamp, Anonymized_vehicle_number and
    Vehicle_type (with a header row, without the pandas index).
    """
    source_columns = [
        'Rowid',
        'Timestamp',
        'Anonymized_vehicle_number',
        'Vehicle_type',
        'Number_of_axles',
        'Vehicle_code',
    ]
    # Only the first four labelled columns are kept.
    wanted = source_columns[:4]

    vehicles = pd.read_csv('source_files/vehicle-data.csv', header=None)
    # The source file has no header row, so attach the column names here.
    vehicles.columns = source_columns
    vehicles[wanted].to_csv('output/csv_data.csv', index=False)

### Extract data from tsv file
We need to extract just the following columns:
- Number of axles
- Tollplaza id
- Tollplaza code

In [88]:
def extract_tsv_data():
    """Extract the toll-plaza columns into ``output/tsv_data.csv``.

    Reads the header-less, tab-separated ``source_files/tollplaza-data.tsv``,
    labels its seven columns, and writes Rowid, Number_of_axles, Tollplaza_id
    and Tollplaza_code as a comma-separated file (with a header row, without
    the pandas index). Rowid is kept so the rows can be matched up with the
    other extracts later.
    """
    col_names = ['Rowid','Timestamp','Anonymized_vehicle_number','Vehicle_type','Number_of_axles','Tollplaza_id','Tollplaza_code']

    # Parse the file as tab-separated directly. Rewriting tabs to commas as
    # text first would split any field that itself contains a comma.
    tsv_df = pd.read_csv('source_files/tollplaza-data.tsv', sep='\t', header=None)
    tsv_df.columns = col_names
    tsv_df = tsv_df[['Rowid','Number_of_axles','Tollplaza_id','Tollplaza_code']]
    tsv_df.to_csv('output/tsv_data.csv',index=False)

### Extract data from fixed width file
We need to extract just the following columns:
- Type of payment
- Vehicle Code

In [89]:
def extract_fixed_width_data():
    """Extract row id, payment type and vehicle code from the payment file.

    Reads ``source_files/payment-data.txt`` line by line, takes the first
    whitespace-separated token as Rowid and the last two tokens as
    Payment_type and Vehicle_code, and writes them to
    ``output/fixed_width_data.csv`` (with a header row, without the pandas
    index). Blank lines are skipped.

    NOTE(review): a commented-out draft in the original listed the file's
    columns as Rowid, Timestamp, Anonymized_vehicle_number, Tollplaza_id,
    Tollplaza_code, Type_of_payment, Vehicle_code - confirm against the data.
    """
    row_id = []
    payment_type = []
    vehicle_code = []
    with open('source_files/payment-data.txt', 'r') as source:
        for line in source:
            # split() with no argument collapses runs of padding spaces, so
            # multi-space column gaps never produce empty tokens (and the
            # trailing newline is dropped too).
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to extract
            row_id.append(fields[0])
            # Only the last two columns are needed besides the row id.
            payment_type.append(fields[-2])
            vehicle_code.append(fields[-1])

    dictionary = {
        'Rowid': row_id,
        'Payment_type': payment_type,
        'Vehicle_code': vehicle_code,
    }

    fixed_df = pd.DataFrame(dictionary)
    fixed_df.to_csv('output/fixed_width_data.csv', index=False)

### Consolidate files (Combine the three files into one file called extracted_data.csv)

In [102]:
def consolidate():
    """Join the three extracted files into ``output/extracted_data.csv``.

    Loads ``csv_data.csv``, ``tsv_data.csv`` and ``fixed_width_data.csv``
    from ``output`` and inner-joins them one after another. No join key is
    given, so pandas matches on the columns the two frames have in common.
    The result is written with a header row and without the pandas index.
    """
    vehicles = pd.read_csv('output/csv_data.csv')
    tollplazas = pd.read_csv('output/tsv_data.csv')
    payments = pd.read_csv('output/fixed_width_data.csv')

    # Two successive inner joins: vehicles + tollplazas, then + payments.
    combined = pd.merge(vehicles, tollplazas, how='inner')
    combined = pd.merge(combined, payments, how='inner')

    combined.to_csv('output/extracted_data.csv', index=False)

### Transform data
Apply a trivial transformation as an example: convert the Vehicle_type column to upper case.

In [108]:
def transform():
    """Upper-case the Vehicle_type column of the consolidated data.

    Reads ``output/extracted_data.csv``, converts every Vehicle_type value to
    upper case, and writes the result to ``output/transformed_data.csv``
    (with a header row, without the pandas index). Rows with a missing
    Vehicle_type are kept and stay missing.
    """
    final_df = pd.read_csv('output/extracted_data.csv')
    # The vectorised .str accessor passes missing values through untouched;
    # calling x.upper() on each element would fail on NaN, which is a float.
    final_df['Vehicle_type'] = final_df['Vehicle_type'].str.upper()
    final_df.to_csv('output/transformed_data.csv', index=False)


In [109]:
# Run the pipeline stages in order: download and unpack, the three extracts,
# consolidation, then the transformation.
for _stage in (
    download_unzip,
    extract_csv_data,
    extract_tsv_data,
    extract_fixed_width_data,
    consolidate,
    transform,
):
    _stage()

File extracted successfully


  tar.extractall('source_files')
