In [1]:
import numpy as np
import pandas as pd
import os
fldr_raw = "data_temp/raw"

In [2]:

os.makedirs(fldr_raw, exist_ok=True)

n = 30
df0 = pd.DataFrame({
     'station': ['A0'] * n,
     'date': pd.date_range(start='2020-01-01',
                           periods=n),
     'temperature_avg': np.random.randint(low=10,
                                         high=25,
                                         size=n),
     'extra': [1.0] * n})

df0.to_csv(os.path.join(fldr_raw,
                        "file0.csv"),
           index=False)

df1 = pd.DataFrame({
    'date': pd.date_range(start='2020-01-01',
                          periods=n),
    'temperature_avg': np.random.uniform(low=5.5,
                                         high=33.6,
                                         size=n),
    'is_cloudy':np.random.choice([True, False],size=n),
    'station': ['A1'] * n})

df1['temperature_max'] = df1['temperature_avg'].astype('int') + 1

df1.to_csv(os.path.join(fldr_raw,
                        "file1.csv"),
           index=False)

df2 = pd.DataFrame({
    'station': ['A2'] * n,
    'date': pd.date_range(start='2020-01-01',
                          periods=n),
    'temperature_avg': np.random.randint(low=0,
                                         high=12,
                                         size=n),
    'extra': ['Fine'] * n})
df2.to_csv(os.path.join(fldr_raw,
                        "file2.csv"),
           index=False)

In [4]:
fns = os.listdir(fldr_raw)

To do for each file
* column names to lowercase
* remove leading and trailing spaces from column names
* extract dtypes

In [3]:
def extract_dtypes(df, name):
    dtypes = df.dtypes\
               .astype(str)\
               .to_frame(name=name)
    return dtypes

In [22]:
# df.columns = [col.lower().lstrip().rstrip()
#                   for col in df.columns]

In [54]:
fns = os.listdir(fldr_raw)

df_types = [] 
for fn in fns:
    df = pd.read_csv(os.path.join(fldr_raw, fn))
    df_types.append(extract_dtypes(df, fn))
df_types = pd.concat(df_types, axis=1)

In [55]:
df_types

Unnamed: 0,file2.csv,file1.csv,file0.csv
station,object,object,object
date,object,object,object
temperature_avg,int64,float64,int64
extra,object,,float64
is_cloudy,,bool,
temperature_max,,int64,


In [85]:
def pick_dtype(row):
    if row.notnull().all() and (row.nunique() == 1):
        return row[0]
    else:
        types = row[row.notnull()].unique()
        if 'object' in row[row.notnull()].unique():
            return 'object'
        else:
            return 'float'

In [86]:
dtypes_dict = df_types.apply(pick_dtype, axis=1)\
                      .to_dict()
dtypes_dict

{'station': 'object',
 'date': 'object',
 'temperature_avg': 'float64',
 'extra': 'object',
 'is_cloudy': 'float',
 'temperature_max': 'float64'}

In [36]:
np.issubdtype("int8", np.number)

True

In [59]:
def arrange_schema_2(fn, dtypes_dict, fldr_in, fldr_out,
                   dates=None):
    df = pd.read_csv(fn)
    for k, v in dtypes_dict.items():
        if k not in df.columns:
            if v == 'object':
                df[k] = ""
            else:
                df[k] = np.nan
    df = df.astype(dtypes_dict)
    if len(df.columns) != len(dtypes_dict):
        return None
    df = df[dtypes_dict.keys()]
    if dates is not None:
        if isinstance(dates, str):
            df[dates] = pd.to_datetime(df[dates])
        if isinstance(dates, (list, np.ndarray)):
            for k in dates:
                df[k] = pd.to_datetime(df[k])

In [110]:
def arrange_schema(fn, dtypes_dict, fldr_in, fldr_out):
    os.makedirs(fldr_out, exist_ok=True)
    df = pd.read_csv(os.path.join(fldr_in, fn))
    for k, v in dtypes_dict.items():
        if k not in df.columns:
            if v == 'object':
                df[k] = ""
            else:
                df[k] = np.nan
    df = df.astype(dtypes_dict)
    if len(df.columns) != len(dtypes_dict):
        return None
    df = df[dtypes_dict.keys()]
    df.to_parquet(os.path.join(fldr_out, fn.replace(".csv", ".parquet")), index=False)

In [116]:
fldr_out = "data_temp/processed"
for fn in fns:
    df = arrange_schema(fn, dtypes_dict, fldr_raw, fldr_out)

In [112]:
import shutil

In [115]:
shutil.rmtree(fldr_out)

FileNotFoundError: [Errno 2] No such file or directory: 'data_temp/processed'

In [117]:
fns = os.listdir(fldr_out)

df_types = [] 
for fn in fns:
    df = pd.read_parquet(os.path.join(fldr_out, fn))
    df_types.append(extract_dtypes(df, fn))
df_types = pd.concat(df_types, axis=1)

In [118]:
df_types

Unnamed: 0,file0.parquet,file2.parquet,file1.parquet
station,object,object,object
date,object,object,object
temperature_avg,float64,float64,float64
extra,float64,object,object
is_cloudy,float64,float64,float64
temperature_max,float64,float64,float64


In [1]:
import pandas as pd

In [2]:
df = pd.read_parquet("/home/baobob/Downloads/dtypes.parquet")

In [5]:
df.to_dict()['dtypes']

{'sku_cadena': 'object',
 'codigo_local_cadena': 'object',
 'fecha': 'datetime64[ns]',
 'yhat': 'float64',
 'yhat_lower': 'float64',
 'z_mean': 'float64',
 'z_std': 'float64',
 'ts_length': 'int64',
 'dia': 'object',
 'dia_mean': 'float64',
 'dia_std': 'float64',
 'venta_unidades_promedio': 'float64',
 'venta_clp_promedio': 'float64',
 'clust': 'float64',
 'timestamp': 'datetime64[ns]',
 'id_proceso': 'object'}