In [1]:
import pandas as pd
import numpy as np
import os
import simplejson


In [34]:
def load_file(path, name=None):
    """Read a CSV file into a DataFrame, turning blank cells into NaN.

    Args:
        path: Directory containing the file, or the complete file path when
            ``name`` is omitted.
        name: File name inside ``path``. Optional so that callers holding a
            single full path (as ``cross_file_validation`` does) can call
            ``load_file(full_path)``.

    Returns:
        pandas.DataFrame in which every cell that is an empty or
        whitespace-only string has been replaced with ``np.nan``.
    """
    file_path = path if name is None else os.path.join(path, name)
    df = pd.read_csv(file_path)
    # Automatic fixes: cells made up only of whitespace (or empty strings)
    # are treated as missing values.
    return df.replace(r'^\s*$', np.nan, regex=True)

def file_params(df):
    """Summarise a DataFrame: size, missing values, column names, statistics.

    Args:
        df: pandas.DataFrame to describe.

    Returns:
        dict with the keys
          - ``size``: ``rows`` and ``cols`` counts.
          - ``missing``:
              ``rows`` / ``rowsPercent``   rows holding at least one NaN,
              ``cells`` / ``cellsPercent`` NaN cells over all cells,
              ``cols``                     NaN count per column (non-zero only),
              ``colsPercent``              per column, NaN count as a percent
                                           of the number of rows (non-zero only),
              ``colsPercentContribution``  per column, share of all NaN cells
                                           in percent (non-zero only).
          - ``names``: ``cols`` and ``colsReverse`` column-name lists.
          - ``describe``: ``df.describe()`` as a dict (empty dict when the
            frame has no columns).

        Every percentage is 0.0 / an empty dict when its denominator is zero
        (empty frame, or no missing values at all).
    """
    n_rows = int(df.shape[0])
    n_cols = int(df.shape[1])

    def _percent(part, whole):
        # Guard the empty-frame case, which would otherwise raise
        # ZeroDivisionError.
        if whole == 0:
            return 0.0
        return float(round(part / whole, 7) * 100)

    na_mask = df.isna()
    missing_by_col = na_mask.sum()
    missing_rows = int(na_mask.any(axis=1).sum())
    missing_cells = int(missing_by_col.sum())

    params = {}

    #size
    params['size'] = {'rows': n_rows, 'cols': n_cols}

    #missing values
    params['missing'] = {}
    ##rows
    params['missing']['rows'] = missing_rows
    params['missing']['rowsPercent'] = _percent(missing_rows, n_rows)
    ##cells
    params['missing']['cells'] = missing_cells
    params['missing']['cellsPercent'] = _percent(missing_cells, n_rows * n_cols)
    ##columns
    params['missing']['cols'] = {
        col: int(count) for col, count in missing_by_col.items() if count != 0
    }
    ###percent of the values in each column that are missing.
    # The denominator is the row count (values per column), not the sum of
    # the cell contents.
    if n_rows:
        pct_of_column = (missing_by_col / n_rows * 100).round(4)
        params['missing']['colsPercent'] = {
            col: float(pct) for col, pct in pct_of_column.items() if pct != 0
        }
    else:
        params['missing']['colsPercent'] = {}
    ###percent of total missing values
    # Without the guard a frame with no NaN yields 0/0 = NaN for every column,
    # and NaN != 0 would let all of them through the filter.
    if missing_cells:
        pct_of_missing = (missing_by_col / missing_cells * 100).round(2)
        params['missing']['colsPercentContribution'] = {
            col: float(pct) for col, pct in pct_of_missing.items() if pct != 0
        }
    else:
        params['missing']['colsPercentContribution'] = {}

    #names
    column_names = df.columns.tolist()
    params['names'] = {
        'cols': column_names,
        'colsReverse': column_names[::-1],
    }

    #describe
    # DataFrame.describe() raises on a frame without columns.
    params['describe'] = df.describe().to_dict() if n_cols else {}

    return params

In [76]:
# NOTE(review): these look like scratch fixtures; neither name is read by the
# other cells visible in this notebook -- confirm before relying on them.
paramsArray = [
    {'name': label, 'cols': columns}
    for label, columns in (
        ('A', list(range(1, 11))),
        ('B', list(range(1, 9))),
        ('C', [1, 2, 3, 4, 10]),
    )
]
target = 10



def cross_file_validation(fileObjectArray, target):
    """Check that a set of files agree on their columns and target values.

    Args:
        fileObjectArray: iterable of dicts, each with a ``'storageId'`` entry
            and a ``'names'`` dict holding a ``'cols'`` list of column names.
            ``storageId`` is handed to ``load_file`` and is therefore assumed
            to be a file path -- TODO confirm against the caller.
        target: name of the target column.

    Returns:
        dict with
          - ``missingTarget``: storageIds whose column list lacks ``target``.
          - ``mismatchedColumns``: one entry per ordered pair of files where
            the first (``'has'``) owns columns the second lacks.
          - ``targetValues``: sorted distinct target values over all files
            that contain the target, as strings.
          - ``valid``: True only when no target is missing, no columns are
            mismatched and exactly two distinct target values exist.
    """
    #check for missing target
    missingTarget = [
        entry['storageId']
        for entry in fileObjectArray
        if target not in entry['names']['cols']
    ]

    targetValuesArray = []
    for entry in fileObjectArray:
        if entry['storageId'] in missingTarget:
            continue
        # load_file takes a directory and a file name; splitting the id and
        # letting load_file join the parts again reproduces the original path.
        df = load_file(*os.path.split(entry['storageId']))
        targetValuesArray.append(np.asarray(df[target].unique()))

    # Files may expose different numbers of distinct target values, so the
    # per-file arrays are concatenated rather than stacked.
    if targetValuesArray:
        distinctValues = np.unique(np.concatenate(targetValuesArray))
    else:
        distinctValues = []
    targetValues = [str(value) for value in distinctValues]

    #mismatched columns
    mismatchedColumns = []
    for lacking in fileObjectArray:
        for owning in fileObjectArray:
            if lacking['storageId'] == owning['storageId']:
                continue
            comp = [
                col for col in owning['names']['cols']
                if col not in lacking['names']['cols']
            ]
            if comp:
                mismatchedColumns.append({
                    'has': owning['storageId'],
                    # NOTE(review): 'misisng' is a typo for 'missing'; kept
                    # as-is because consumers of this dict may read this key.
                    'misisng': lacking['storageId'],
                    'missingCols': comp
                })

    validation = {
        'valid': len(missingTarget) == 0 and len(mismatchedColumns) == 0 and len(targetValues) == 2,
        'missingTarget': missingTarget,
        'mismatchedColumns': mismatchedColumns,
        'targetValues': targetValues
    }

    return validation




In [35]:
# Directory holding the CSV inputs. The default is a machine-specific absolute
# path; set the MISC_FILES_DIR environment variable to point the notebook at
# another location without editing this cell.
file_path = os.environ.get(
    'MISC_FILES_DIR',
    '/Volumes/GoogleDrive-114506167238584938104/My Drive/misc_files',
)
# CSV file names expected inside `file_path`.
files = [
    'NFL-Training_missing.csv',
    'NFL-Gen-Test_missing.csv'
]

In [77]:
# Load the first entry of `files` (NFL-Training_missing.csv) from `file_path`.
df = load_file(file_path, files[0])

[0, 2, 3, 4]

In [23]:
# Number of missing (NaN) values in each column of `df`.
df.isna().sum()

Height        0
Wt            0
40YD          0
Vertical      0
BenchReps     7
Broad Jump    0
3Cone         4
Shuttle       0
BMI           0
NFLness       0
dtype: int64