In [1]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    '''
    Load a YAML configuration file and return its parsed content.

    Parameters
    ----------
    filepath : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    object or None
        Whatever ``yaml.safe_load`` produces for the document, or ``None``
        when the document cannot be parsed (the ``yaml.YAMLError`` is logged
        at ERROR level instead of being raised).
    '''
    with open(filepath, 'r') as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Best effort: report the parse problem and fall back to None.
            logging.error(exc)
            parsed = None
    return parsed


def replacer(string, char):
    '''
    Collapse every run of two or more consecutive `char` into a single `char`.

    `char` is treated as literal text: it is escaped before being placed in
    the pattern, so regex metacharacters such as '.', '+', '*' or a backslash
    are matched as themselves instead of being interpreted (or raising).

    Parameters
    ----------
    string : str
        Text to clean.
    char : str
        Character (or substring) whose consecutive repetitions are collapsed.
        An empty value leaves `string` unchanged.

    Returns
    -------
    str
        `string` with each repeated run of `char` reduced to one occurrence.
    '''
    if not char:
        # Nothing to collapse; an empty pattern cannot be repeated.
        return string
    # Group the escaped text so the quantifier applies to all of `char`.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps `char` literal (no backslash-escape
    # processing of the replacement template).
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Normalize the column names of `df` and validate them against the
    column list declared in the configuration.

    Normalization is applied to `df.columns` in place, so the caller's frame
    keeps the cleaned names: names are lowercased, every run of non-word
    characters and/or underscores becomes a single '_', and leading/trailing
    '_' are stripped. The expected names from `table_config['columns']` are
    only lowercased. The comparison ignores column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose header is validated; only its column labels are modified.
    table_config : mapping
        Configuration providing the expected names under the 'columns' key.

    Returns
    -------
    int
        1 when the normalized file columns match the expected columns,
        0 otherwise (the differences are printed and logged).
    '''
    # A single pass replaces the former "substitute each non-word char by
    # '_', strip '_', collapse repeated '_'" sequence: after substitution a
    # run of '_' is exactly a run of [\W_] in the lowercased name.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'[\W_]+', '_', regex=True)
        .str.strip('_')
    )

    # Sort the labels only. Reindexing the frame would copy all of its data
    # and raises on duplicate labels.
    actual_cols = sorted(df.columns)
    expected_col = sorted(name.lower() for name in table_config['columns'])

    if actual_cols == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic from run to run.
    mismatched_columns_file = sorted(set(actual_cols).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_cols))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    # Duplicates make the lists differ even when both set differences are
    # empty, so report them explicitly.
    duplicated_columns_file = sorted(set(df.columns[df.columns.duplicated()]))
    if duplicated_columns_file:
        print("Following File columns are duplicated after normalization",duplicated_columns_file)
    logging.info(f'df columns: {actual_cols}')
    logging.info(f'expected columns: {expected_col}')
    return 0


Writing testutility.py


In [8]:
%%writefile file.yaml
# Settings read by the notebook cells that load and validate the source file.
# file_type and file_name are joined as "./<file_name>.<file_type>" to locate the file.
file_type: csv
file_name: animedata
# Delimiter handed to pandas.read_csv when the source file is loaded.
inbound_delimiter: ","
# NOTE(review): outbound_delimiter and skip_leading_rows are not read by any
# cell shown in this notebook; confirm which consumer uses them.
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column names; col_header_val lowercases them and compares them,
# ignoring order, with the normalized header of the loaded file.
columns: 
    - username
    - anime_id
    - my_watched_episodes
    - my_start_date
    - my_finish_date
    - my_score
    - my_status
    - my_rewatching
    - my_rewatching_ep
    - my_last_updated
    - my_tags


Overwriting file.yaml


In [9]:
# Read config file
import importlib

import testutility as util

# testutility.py is (re)written by a %%writefile cell. A plain import returns
# the module cached in sys.modules, so reload it to pick up the current file
# contents when this cell is re-run in a live kernel.
util = importlib.reload(util)
config_data = util.read_config_file("file.yaml")

In [4]:
config_data

{'columns': ['username',
  'anime_id',
  'my_watched_episodes',
  'my_start_date',
  'my_finish_date',
  'my_score',
  'my_rewatching',
  'my_rewatching_ep',
  'my_last_updated',
  'my_tags'],
 'dataset_name': 'testfile',
 'file_name': 'test_data',
 'file_type': 'csv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'table_name': 'edsurv'}

In [6]:
import pandas as pd
# Build the source path from the config: "./<file_name>.<file_type>".
file_type = config_data['file_type']
source_file = "./" + config_data['file_name'] + f'.{file_type}'
# Pass the delimiter by keyword: positional arguments after the path are
# deprecated since pandas 1.3 and rejected by pandas 2.0+.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13,


In [11]:
df.shape

(31284030, 11)

In [10]:
# Validate the file header against the YAML column list. col_header_val also
# rewrites df.columns in place with the normalized names, so the print below
# shows the cleaned header; keep the call before the prints.
# NOTE(review): the 0/1 result of the validation is discarded here, so a
# failed check does not stop the notebook.
util.col_header_val(df,config_data)
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

column name and column length validation passed
columns of files are: Index(['username', 'anime_id', 'my_watched_episodes', 'my_start_date',
       'my_finish_date', 'my_score', 'my_status', 'my_rewatching',
       'my_rewatching_ep', 'my_last_updated', 'my_tags'],
      dtype='object')
columns of YAML are: ['username', 'anime_id', 'my_watched_episodes', 'my_start_date', 'my_finish_date', 'my_score', 'my_status', 'my_rewatching', 'my_rewatching_ep', 'my_last_updated', 'my_tags']
