In [1]:
# Configuration for the pipeline, kept as YAML text and persisted to disk so
# the later cells can read it back from 'config.yaml'.

config = '''dataset_name: 'arboles.csv'
dataset_name_to_save: 'arboles.csv.gz'
separator_read: ','
separator_write: '|'
column_names: ['id', 'diametro', 'longitud', 'especie']'''

# Mode 'w' truncates any existing file, so re-running this cell always leaves
# exactly the text above on disk.
with open('config.yaml', mode='w') as config_file:
    config_file.write(config)

In [2]:
# imports

import dask.dataframe as dd
import yaml

In [3]:
# Load the configuration file into a dict.
#
# yaml.safe_load is used instead of yaml.load(..., Loader=yaml.FullLoader):
# this file only holds plain mappings, lists and strings, which the safe
# loader parses to the same dict, and the safe loader refuses YAML tags that
# construct arbitrary Python objects if the file is ever edited or replaced.

with open('config.yaml') as f:
    config = yaml.safe_load(f)

# Bare last expression so the notebook displays the parsed dict.
config

{'dataset_name': 'arboles.csv',
 'dataset_name_to_save': 'arboles.csv.gz',
 'separator_read': ',',
 'separator_write': '|',
 'column_names': ['id', 'diametro', 'longitud', 'especie']}

In [4]:
# Build the dask dataframe from the CSV named in the configuration, using the
# configured input separator.

df = dd.read_csv(config['dataset_name'], sep=config['separator_read'])

# Normalise the header in a single pass: first strip the characters matched by
# the character class, then drop every space from the column names.
# NOTE(review): inside '[...]' the commas are literal, so ',' is removed along
# with '#', '@' and '&' - confirm that this is intended.

df.columns = (
    df.columns
    .str.replace('[#,@,&]', '', regex=True)
    .str.replace(' ', '')
)

In [5]:
# Display the dataframe's column labels as the cell output.
df.columns

Index(['id', 'diametro', 'longitud', 'especie'], dtype='object')

In [6]:
# Keep the column labels as a plain Python list so they can be compared
# directly with the list stored in the configuration.

column_names = df.columns.tolist()

In [7]:
# function for csv summary

def summary(dataframe):
    """Print the number of columns, rows and cells of a dataframe.

    Parameters
    ----------
    dataframe
        Any object whose ``columns`` and ``index`` attributes support
        ``len()``; this notebook passes a dask dataframe.

    Returns
    -------
    None
        The report is written to standard output.
    """
    column_n = len(dataframe.columns)
    # Measure the index exactly once and reuse the value: on a lazily
    # evaluated (dask) dataframe each len() of the index may require a pass
    # over the data, so asking twice would repeat that work.
    rows_n = len(dataframe.index)
    size = column_n * rows_n
    print('Summary')
    print('-' * 23)
    print(f'Number of columns: {column_n}')
    print(f'Number of rows   : {rows_n}')
    print(f'Size of dataframe: {size}')
    # print('\n') emits two line breaks, leaving a blank gap after the report.
    print('\n')

In [8]:
# Validate column names, then summarise and export.
#
# The mismatch case is handled first; the summary and the export only run
# when the cleaned header equals the configured list exactly (same names in
# the same order).

if column_names != config['column_names']:
    print('columns names are not what expected')
else:
    summary(df)
    # NOTE(review): without single_file=True dask may write one gzip part per
    # partition under this path rather than a single file - confirm which
    # output layout is wanted.
    df.to_csv(
        config['dataset_name_to_save'],
        sep=config['separator_write'],
        compression='gzip',
    )

    print('file saved as gz format')


Summary
-----------------------
Number of columns: 4
Number of rows   : 42562000
Size of dataframe: 170248000


file saved as gz format
