In [1]:
import pandas as pd
import numpy as np
import time
import yaml

In [2]:
# Reading the data using pandas
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
s = time.perf_counter()
df = pd.read_csv("C:/Users/user/Downloads/archive2/Reliance Trends Fashion.csv")
e = time.perf_counter()

In [3]:
# Reading the file using dask
import dask.dataframe as dd

In [4]:
# Time the dask read with perf_counter(), a monotonic clock intended for
# measuring elapsed time (time.time() follows the adjustable wall clock).
# NOTE(review): dask.dataframe.read_csv is understood to be lazy, so this
# interval probably covers building the task graph rather than parsing the
# file -- confirm before comparing it directly with the pandas timing.
s2 = time.perf_counter()
df2 = dd.read_csv("C:/Users/user/Downloads/archive2/Reliance Trends Fashion.csv")
e2 = time.perf_counter()

In [9]:
# Reading the file using modin
import modin.pandas as md
import ray


In [7]:
# Time the modin read with perf_counter(), a monotonic clock intended for
# measuring elapsed time (time.time() follows the adjustable wall clock).
s3 = time.perf_counter()
df3 = md.read_csv("C:/Users/user/Downloads/archive2/Reliance Trends Fashion.csv")
e3 = time.perf_counter()


    import ray
    ray.init()



In [10]:
# Report the elapsed loading time measured for each library, in the order
# pandas, dask, modin.
loading_report = [
    ("Pandas Loading Time = {}", e - s),
    ("Dask Loading Time = {}", e2 - s2),
    ("Modin Loading Time = {}", e3 - s3),
]
for template, elapsed in loading_report:
    print(template.format(elapsed))

Pandas Loading Time = 0.26799869537353516
Dask Loading Time = 0.03477215766906738
Modin Loading Time = 11.998594522476196


In [11]:
# displaying all columns
# (these are the raw header names as read from the CSV, before any normalisation)
print(df.columns)

Index(['Brand', 'Description', 'Image_URL', 'Product_ID',
       'Discount_Price (in Rs.)', 'Original_Price (in Rs.)', 'Product_URL',
       'Category_by_gender', 'Category'],
      dtype='object')


### Writing the YAML file



In [12]:
%%writefile file.yaml
# Configuration describing the source file and the column headers it must contain.
file_type: csv
dataset_name: reliancefile
# The notebook builds the source path as "./" + file_name + "." + file_type.
file_name: Reliance Trends Fashion
table_name: reliancetrends
# Delimiter handed to pandas.read_csv when the source file is loaded.
inbound_delimiter: ","
# NOTE(review): outbound_delimiter and skip_leading_rows are not read by any
# code visible in this notebook -- confirm where they are consumed.
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected headers in normalised form: lower-cased, with '#', ',', '@', '&',
# '_' and spaces removed (the same normalisation col_header_val applies to the file).
columns:
    - brand
    - description
    - imageurl
    - productid
    - discountprice(inrs.)
    - originalprice(inrs.)
    - producturl
    - categorybygender
    - category
   

Overwriting file.yaml


#### Writing the test utility file

In [14]:
%%writefile fashiontrendstutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    """
    Load a YAML configuration file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    object or None
        The parsed YAML content (whatever ``yaml.safe_load`` produces), or
        ``None`` when the content is not valid YAML. In that case the parse
        error is logged instead of being raised, so callers should check for
        ``None`` before indexing the result.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (e.g. cp1252 on Windows).
    with open(filepath, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)
            return None


def replacer(string, char):
    """
    Collapse every run of two or more consecutive ``char`` into a single ``char``.

    ``char`` is treated as literal text, so regex metacharacters such as
    ``.``, ``|``, ``+`` or a backslash are handled correctly.

    Parameters
    ----------
    string : str
        Text to clean up.
    char : str
        The character (or literal substring) whose repeated runs are collapsed.

    Returns
    -------
    str
        ``string`` with each repeated run replaced by one occurrence of ``char``.
        An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        # Nothing to collapse; also avoids building a pattern that matches
        # the empty string everywhere.
        return string
    # Escape the literal and wrap it in a non-capturing group so the quantifier
    # applies to the whole of ``char`` rather than only to its last character.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A function replacement inserts ``char`` verbatim; a plain string would be
    # parsed as a replacement template and mis-handle backslashes.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,config_data):
    '''
    Validate the column headers of ``df`` against the YAML configuration.

    The headers of ``df`` are normalised IN PLACE (lower-cased, then every
    '#', ',', '@', '&', '_' and space character is removed) and compared,
    ignoring order, with ``config_data['columns']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose headers are checked; its ``columns`` attribute is
        overwritten with the normalised names.
    config_data : dict
        Parsed configuration; ``config_data['columns']`` lists the expected
        (already normalised) header names. The list itself is not modified.

    Returns
    -------
    int
        1 when the normalised headers match the expected ones exactly,
        0 otherwise (the differences are printed and both lists are logged).
    '''
    # regex=True is explicit because pandas 2.0 changed the default of
    # str.replace to a literal match, which would leave the headers untouched.
    df.columns = df.columns.str.lower().str.replace(r'[#,@&_ ]', '', regex=True)

    # Compare sorted copies so column order does not matter; equal lists
    # necessarily have equal length, so one comparison covers both checks.
    expected_col = sorted(config_data['columns'])
    actual_col = sorted(df.columns)

    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic from run to run.
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Writing fashiontrendstutility.py


###  Read config file

In [15]:
# Read config file
# NOTE: the module name imported here must match the .py file name given to the
# %%writefile cell that defines the helper functions.
import fashiontrendsutility as util
# read_config_file returns the parsed YAML content, or None when the YAML fails
# to parse (the parse error is logged rather than raised).
config_data = util.read_config_file("file.yaml")

In [16]:
# inspecting the config file
config_data

{'file_type': 'csv',
 'dataset_name': 'reliancefile',
 'file_name': 'Reliance Trends Fashion',
 'table_name': 'reliancetrends',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['brand',
  'description',
  'imageurl',
  'productid',
  'discountprice(inrs.)',
  'originalprice(inrs.)',
  'producturl',
  'categorybygender',
  'category']}

In [17]:
# read the file using config file instead of hardcoding it
file_type = config_data['file_type']
source_file = "./" + config_data['file_name'] + f'.{file_type}'
#print("",source_file)
df = pd.read_csv(source_file,config_data['inbound_delimiter'])
df.head()



Unnamed: 0,Brand,Description,Image_URL,Product_ID,Discount_Price (in Rs.),Original_Price (in Rs.),Product_URL,Category_by_gender,Category
0,performax,Typographic Print Track Pants with Insert Pockets,https://assets.ajio.com/medias/sys_master/root...,441136763008,974,1299,https://trends.ajio.com/performax-typographic-...,Men,Activewear
1,performax,Low-Top Lace-Up Sports Shoes,https://assets.ajio.com/medias/sys_master/root...,450137679013,1499,1499,https://trends.ajio.com/performax-low-top-lace...,Men,Activewear
2,teamspirit,Colorblock Crew-Neck T-shirt,https://assets.ajio.com/medias/sys_master/root...,441133214008,461,549,https://trends.ajio.com/teamspirit-colorblock-...,Men,Activewear
3,performax,Camouflage Print Joggers with Insert Pockets,https://assets.ajio.com/medias/sys_master/root...,441134776020,1189,1699,https://trends.ajio.com/performax-camouflage-p...,Men,Activewear
4,teamspirit,Reflective Colourblock Crew-Neck T-shirt,https://assets.ajio.com/medias/sys_master/root...,441136539020,425,599,https://trends.ajio.com/teamspirit-reflective-...,Men,Activewear


In [18]:
# Validate that the file's column headers match the YAML config.
# col_header_val returns 1 on a match and 0 otherwise; it also rewrites
# df.columns in place with the normalised names.
util.col_header_val(df,config_data)

column name and column length validation passed




1

In [19]:
# Show both sides of the comparison. col_header_val assigned the normalised
# names back to df.columns, so the file's headers appear here lower-cased and
# with the stripped characters removed.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['brand', 'description', 'imageurl', 'productid', 'discountprice(inrs.)',
       'originalprice(inrs.)', 'producturl', 'categorybygender', 'category'],
      dtype='object')
columns of YAML are: ['brand', 'description', 'imageurl', 'productid', 'discountprice(inrs.)', 'originalprice(inrs.)', 'producturl', 'categorybygender', 'category']


In [20]:
# Gate the rest of the pipeline on the header check
# (col_header_val returns 0 when validation fails).
header_check = util.col_header_val(df,config_data)
if header_check != 0:
    print("col validation passed")
    # write the code to perform further action
    # in the pipeline
else:
    print("validation failed")
    # write code to reject the file

column name and column length validation passed
col validation passed


