In [2]:
from pathlib import Path
from tqdm import tqdm

import pandas as pd

In [3]:
# Collect only the real parquet inputs: skip sub-directories and anything that
# is not a ``*.parquet`` file (for example the ``*.parquet.bak`` backups that the
# header-normalisation cell writes next to the originals), and sort the result
# so that re-runs see the files in a reproducible order.
parquetta = sorted(
    x for x in (Path('data')/'compressed').iterdir()
    if x.is_file() and x.suffix.lower() == '.parquet'
)

In [4]:
# Bucket every parquet file under the textual form of its column header, so
# that files sharing an identical header end up in the same list.
header_options = {}
for parquet_path in tqdm(parquetta):
    frame = pd.read_parquet(parquet_path)
    signature = str(frame.columns.values)
    # setdefault creates the bucket the first time a header is seen.
    header_options.setdefault(signature, []).append(parquet_path)

100%|██████████| 363/363 [00:24<00:00, 14.83it/s]


In [5]:
# Print each distinct header together with the number of files using it.
# Headers used by fewer than 10 files also get their files listed one per line.
separator = '-'*100
for header, files in header_options.items():
    n_files = len(files)
    print(header, n_files)
    for err in (files if n_files < 10 else ()):
        print(str(err))
    print(separator)

['Rental Id' 'Duration' 'Bike Id' 'End Date' 'EndStation Id'
 'EndStation Name' 'Start Date' 'StartStation Id' 'StartStation Name'] 345
----------------------------------------------------------------------------------------------------
['Rental Id' 'Duration' 'Bike Id' 'End Date' 'EndStation Id'
 'EndStation Name' 'Start Date' 'StartStation Id' 'StartStation Name'
 'Unnamed: 9' 'Unnamed: 10'] 2
data\compressed\05JourneyDataExtract01May2016-17May2016.parquet
data\compressed\15JourneyDataExtract20Jul2016-26Jul2016.parquet
----------------------------------------------------------------------------------------------------
['Rental Id' 'Duration' 'Bike Id' 'End Date' 'EndStation Id'
 'EndStation Name' 'Start Date' 'StartStation Id' 'StartStation Name'
 'Unnamed: 9' 'Unnamed: 10' 'Unnamed: 11'] 8
data\compressed\06JourneyDataExtract18May2016-24May2016.parquet
data\compressed\07JourneyDataExtract25May2016-31May2016.parquet
data\compressed\08JourneyDataExtract01Jun2016-07Jun2016.parquet
data

Conclusion:
---
- We should drop data\compressed\21JourneyDataExtract31Aug2016-06Sep2016.parquet, as it is really hard to convert, because the station IDs are missing (although they could be deduced)
- We should drop data\compressed\325JourneyDataExtract06Jul2022-12Jul2022.parquet, as the end station ID is missing there (although it could be deduced)
- Others could be transformed to the original data shape


Mappings:
---
- `Start Station Name` => `StartStation Name`
- `Start Station Id` => `StartStation Id`
- `End Station Name` => `EndStation Name`
- `End Station Id` => `EndStation Id`
- `Duration_Seconds` => `Duration`

In [6]:
# Alternative column name -> canonical column name, as (old, new) pairs.
mapping = dict([
    ('Start Station Name', 'StartStation Name'),
    ('Start Station Id', 'StartStation Id'),
    ('End Station Name', 'EndStation Name'),
    ('End Station Id', 'EndStation Id'),
    ('Duration_Seconds', 'Duration'),
])

# Extra columns that are removed when a file contains them.
unused_fields = ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']

In [18]:
# Bring every file with a deviating header back to the common column layout:
# rename the alternative column names listed in `mapping` and drop the columns
# listed in `unused_fields`. The original file is kept next to the rewritten
# one as `<name>.bak`.
for files in header_options.values():
    for file in files:
        if file.suffix == '.bak':
            # A backup produced by an earlier run of this cell; never rewrite it.
            continue
        df = pd.read_parquet(file)
        rename_policy = {old: new for old, new in mapping.items() if old in df.columns}
        droppable = [field for field in unused_fields if field in df.columns]
        if not rename_policy and not droppable:  # nothing to rename or drop
            continue
        # rename/drop accept empty arguments, so no per-step guards are needed,
        # and building a new frame avoids in-place mutation.
        df = df.rename(columns=rename_policy).drop(columns=droppable)
        print("Modified ",file, "Reasons",rename_policy,droppable)
        backup = file.with_name(file.name + '.bak')
        if not backup.exists():
            # Keep the first backup: Path.rename silently replaces an existing
            # target on POSIX and raises FileExistsError on Windows.
            file.rename(backup)
        df.to_parquet(file)

Modified  data\compressed\05JourneyDataExtract01May2016-17May2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10']
Modified  data\compressed\15JourneyDataExtract20Jul2016-26Jul2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10']
Modified  data\compressed\06JourneyDataExtract18May2016-24May2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
Modified  data\compressed\07JourneyDataExtract25May2016-31May2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
Modified  data\compressed\08JourneyDataExtract01Jun2016-07Jun2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
Modified  data\compressed\09JourneyDataExtract08Jun2016-14Jun2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
Modified  data\compressed\11JourneyDataExtract22Jun2016-28Jun2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11']
Modified  data\compressed\12JourneyDataExtract29Jun2016-05Jul2016.parquet Reasons {} ['Unnamed: 9', 'Unnamed: 10', 'Unna

In [22]:
def check_mismatched_headers(files=None, list_below=10):
    """Group parquet files by their column header and print a summary.

    Each file is loaded with ``pd.read_parquet`` and bucketed under the string
    form of its column array. For every distinct header the header and the
    number of files using it are printed; headers used by fewer than
    ``list_below`` files also get their files listed one per line.

    Parameters
    ----------
    files : iterable of path-like, optional
        Files to inspect. Defaults to the notebook-level ``parquetta`` list.
    list_below : int, default 10
        Headers shared by fewer than this many files have their files printed.

    Returns
    -------
    dict
        Maps the header string to the list of files that have that header.
    """
    if files is None:
        files = parquetta
    header_options = {}
    for file in tqdm(files):
        df = pd.read_parquet(file)
        header_str = str(df.columns.values)
        # setdefault creates the bucket the first time a header is seen.
        header_options.setdefault(header_str, []).append(file)
    for header, members in header_options.items():
        print(header, len(members))
        if len(members) < list_below:
            for member in members:
                print(str(member))
        print('-'*100)
    return header_options

In [23]:
# Re-scan the files and keep the header -> files grouping for the next cells.
results = check_mismatched_headers()

100%|██████████| 363/363 [00:22<00:00, 15.99it/s]

['Rental Id' 'Duration' 'Bike Id' 'End Date' 'EndStation Id'
 'EndStation Name' 'Start Date' 'StartStation Id' 'StartStation Name'] 361
----------------------------------------------------------------------------------------------------
['Rental Id' 'Duration' 'Bike Id' 'End Date' 'EndStation Logical Terminal'
 'EndStation Name' 'endStationPriority_id' 'Start Date'
 'StartStation Logical Terminal' 'StartStation Name'] 1
data\compressed\21JourneyDataExtract31Aug2016-06Sep2016.parquet
----------------------------------------------------------------------------------------------------
['Rental Id' 'Duration' 'Bike Id' 'End Date' 'EndStation Name'
 'Start Date' 'StartStation Id' 'StartStation Name'] 1
data\compressed\325JourneyDataExtract06Jul2022-12Jul2022.parquet
----------------------------------------------------------------------------------------------------





In [34]:
# Pick the header group with the most files (the stable sort keeps the last
# group on ties) and store its file paths as plain strings.
groups_by_size = sorted(results.values(), key=len)
selected = [str(path) for path in groups_by_size[-1]]

In [35]:
import json

# Persist the selected file paths as a JSON array indented by four spaces.
with open('correct_files.json', mode='w+') as out_handle:
    out_handle.write(json.dumps(selected, indent=4))