In [1]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re

################
# File Reading #
################

def read_config_file(filepath):
    """Parse the YAML file at ``filepath`` and return its contents.

    Args:
        filepath: Path of the YAML file to open for reading.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the file is not valid YAML (the parser error is logged, not raised).
    """
    config = None
    with open(filepath, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Parsing problems are reported through logging; the caller gets None.
            logging.error(exc)
    return config

def replacer(string, char):
    """Collapse every run of two or more consecutive ``char`` into a single one.

    ``char`` is treated as literal text, not as a regular expression, so
    metacharacters such as ``.`` or ``*`` can be passed safely.

    Args:
        string: Text to clean up.
        char: Literal character (or substring) whose repeats are collapsed.
            An empty value leaves ``string`` unchanged.

    Returns:
        ``string`` with each run of repeated ``char`` replaced by one ``char``.
    """
    if not char:
        return string
    # Escape so regex metacharacters match literally; the group makes the
    # quantifier apply to the whole of ``char`` rather than its last character.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps a backslash in ``char`` from being read as
    # a replacement-template escape.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Standardise the column headers of ``df`` and validate them against the
    expected schema.

    Each header is lower-cased, every non-word character is replaced by an
    underscore, leading/trailing underscores are stripped and repeated
    underscores are collapsed. The cleaned headers are assigned back to
    ``df.columns``, so the frame passed in by the caller is updated.
    The cleaned names are then compared, ignoring order, with the
    lower-cased entries of ``table_config['columns']``.

    Parameters:
        df: frame whose ``columns`` are cleaned and checked.
        table_config: mapping with a ``'columns'`` list of expected names.

    Returns:
        1 when the cleaned headers match the expected names exactly,
        0 otherwise (the differences are printed and logged).
    '''
    cleaned = []
    for name in df.columns:
        # str() lets non-string labels (e.g. integer headers) be normalised too.
        name = re.sub(r'[^\w]', '_', str(name).lower())
        cleaned.append(replacer(name.strip('_'), '_'))
    df.columns = cleaned

    expected_col = sorted(name.lower() for name in table_config['columns'])
    # Compare sorted name lists rather than re-indexing the frame: reindex
    # copies the data and raises when cleaning produces duplicate labels.
    actual_col = sorted(cleaned)

    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    mismatched_columns_file = list(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = list(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0


Overwriting testutility.py


Write YAML file

In [2]:
%%writefile file.yaml
# Configuration saved as file.yaml; the notebook loads it with testutility.read_config_file.
file_type: csv
dataset_name: testfile
file_name: test_data
table_name: edsurv
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column headers: col_header_val compares the cleaned file headers against this list.
columns: 
    - city
    - price
    - distance

Overwriting file.yaml


In [3]:
# Read config file
import testutility as util

config_data = util.read_config_file("file.yaml")
# read_config_file() logs the YAML error and returns None when parsing fails;
# stop here with a clear message instead of a TypeError in a later cell.
assert config_data is not None, "file.yaml could not be parsed; see the logged YAML error"

In [4]:
# Show the value stored under the 'inbound_delimiter' key of the loaded config.
config_data['inbound_delimiter']

','

In [5]:
# Inspect the full configuration parsed from file.yaml.
config_data

{'file_type': 'csv',
 'dataset_name': 'testfile',
 'file_name': 'test_data',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['city', 'price', 'distance']}

In [6]:
# Mount Google Drive at /content/gdrive; the CSV read in later cells lives under that path.
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [20]:
## pandas
import pandas as pd
import time

# 5.05GB csv file
CSV_PATH = "/content/gdrive/My Drive/Colab Notebooks/vgg19_features_val.csv"

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()

df_sample = pd.read_csv(CSV_PATH, delimiter=',')
# Kept inside the timed region so the work measured matches the other
# reader cells; the result is not displayed because it is not the last line.
df_sample.head()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f'The code took {elapsed_time} seconds to run.')


KeyboardInterrupt: ignored

Read the files 

In [21]:
## dask
import dask.dataframe as dd
import time

CSV_PATH = "/content/gdrive/My Drive/Colab Notebooks/vgg19_features_val.csv"

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()

# NOTE: dd.read_csv is lazy, and head() only has to compute the start of the
# data, so this timing does not cover parsing the whole file.
df_sample = dd.read_csv(CSV_PATH, delimiter=',')
df_sample.head()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f'The code took {elapsed_time} seconds to run.')



The code took 17.445751905441284 seconds to run.


In [23]:
!pip install modin[dask]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting modin[dask]
  Downloading modin-0.22.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: modin
Successfully installed modin-0.22.1


In [None]:
## modin
import modin.pandas as mpd  # own alias, so the plain-pandas `pd` is not shadowed
import time

CSV_PATH = "/content/gdrive/My Drive/Colab Notebooks/vgg19_features_val.csv"

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()

# Read the file in a single call so Modin can partition and parse it itself.
# Iterating with chunksize=1000 and concatenating the pieces produces a huge
# number of tiny frames for a multi-GB file and gives up the parallel reader.
df_sample = mpd.read_csv(CSV_PATH)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f'The code took {elapsed_time} seconds to run.')




    from distributed import Client

    client = Client()

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:45197
INFO:distributed.scheduler:  dashboard at:            127.0.0.1:8787
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:35337'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:44169'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:44297', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:44297
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:38548
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:39913', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute

In [9]:
import importlib.util
import os
import time

# The engine has to be chosen before Modin is imported/initialised; if Modin
# was already used in this kernel, restart the kernel first.
# Ray is used when it is installed; otherwise fall back to Dask, which is the
# only engine the notebook's install cell provides (modin[dask]).
MODIN_ENGINE = "ray" if importlib.util.find_spec("ray") else "dask"
os.environ["MODIN_ENGINE"] = MODIN_ENGINE  # Set Modin to use Ray when available

import modin.pandas as mpd  # own alias, so the plain-pandas `pd` is not shadowed

CSV_PATH = "/content/gdrive/My Drive/Colab Notebooks/vgg19_features_val.csv"

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()

# Read the file in a single call so Modin can partition and parse it itself,
# instead of concatenating thousands of 1,000-row chunks.
df_sample = mpd.read_csv(CSV_PATH)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f'The code took {elapsed_time} seconds to run.')


INFO:distributed.core:Event loop was unresponsive in Nanny for 3.85s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Scheduler for 3.85s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Nanny for 3.85s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.scheduler:User asked for computation on lost data, _deploy_dask_func-8ce3dddde3a5e10594c0c2ca591066f6
INFO:distributed.scheduler:User asked for computation on lost data, lambda-8386e0ce79037b43a590a4d8cc938369
INFO:distributed.scheduler:User asked for computation on lost data, lambda-d1c5394fcdb3b72ced872e3d55aa9bf6
INFO:distributed.scheduler:User asked for computatio

CancelledError: ignored

In [None]:
## Basic validation of the column headers: col_header_val cleans the names
## (lower-case, special characters and whitespace replaced by '_') and checks
## them against the columns listed in the YAML config.
util.col_header_val(df_sample,config_data)

In [None]:
%%writefile file.yaml
# The schema is known up front, so it is declared here: the separators used
# for reading and writing the file, and the expected column names.
# (%%writefile must be the first line of the cell, so this note lives inside
# the YAML as a comment.)
file_type: csv
dataset_name: testfile
file_name: test_data
table_name: edsurv
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
columns:
    - city
    - price
    - distance