In [121]:
from datetime import datetime
from typing import List, Mapping, Sequence, Tuple

import numpy as np
import pandas as pd

In [28]:
# Load only the columns relevant for development from the PAT_POP sheet:
# column 0 (used as the index) plus columns 3 through 94.
raw = pd.read_excel(
    "../data/raw/MASTER FILE.xlsx",
    sheet_name="PAT_POP",
    header=0,
    index_col=0,
    usecols=[0, *range(3, 95)],  # will amend later
)

In [33]:
# Write the sheet to the interim CSV; the index is first moved back into a
# regular column so that index=False does not drop it.
raw.reset_index().to_csv(
    "../data/interim/pat_pop.csv",
    index=False,
)

In [176]:
# Reload the interim CSV, using its first column as the index.
df = pd.read_csv(
    "../data/interim/pat_pop.csv",
    index_col=0,
)

In [177]:
# Move the index back into a regular column. Rebinding (instead of
# inplace=True) and guarding on RangeIndex keeps this cell idempotent:
# re-running it no longer inserts a spurious "index" column.
if not isinstance(df.index, pd.RangeIndex):
    df = df.reset_index()

In [252]:
def try_str_to_date(val):
    """Parse a ``"%Y-%m-%d %H:%M:%S"`` timestamp string into a ``date``.

    Args:
        val: Value to convert; may be any type.

    Returns:
        A ``datetime.date`` when ``val`` is a string in exactly that format,
        otherwise ``val`` unchanged.
    """
    try:
        return datetime.strptime(val, "%Y-%m-%d %H:%M:%S").date()
    except (TypeError, ValueError):
        # TypeError: val is not a string (e.g. a NaN float or None).
        # ValueError: val is a string that does not match the format.
        # Only these are caught so that real errors (such as a missing
        # import) surface instead of silently turning this into a no-op.
        return val

In [253]:
def try_str_to_float(val):
    """Convert ``val`` to ``float`` when possible.

    Args:
        val: Value to convert; may be any type.

    Returns:
        ``float(val)`` when the conversion succeeds, otherwise ``val``
        unchanged.
    """
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: non-numeric
        # string; OverflowError: integer too large for a float.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return val

In [254]:
def string_to_blank_save_numeric(col: Sequence) -> Sequence:
    """Replace non-numeric strings with NaN; keep everything else as-is.

    A value is blanked when ``try_str_to_float`` hands it back as a string,
    i.e. it is a string that cannot be parsed as a float. Numeric strings
    and non-string values are kept unchanged (they are not converted).

    Args:
        col: Values to clean, typically a ``pd.Series``.

    Returns:
        A ``pd.Series`` of the cleaned values. When ``col`` is a Series its
        name and index are preserved, so the result stays aligned with the
        input (``Series.compare`` requires identical labels).
    """
    result = [np.nan if isinstance(try_str_to_float(x), str) else x for x in col]
    # Plain sequences have no pandas index/name; fall back to the defaults.
    index = col.index if isinstance(col, pd.Series) else None
    return pd.Series(result, name=getattr(col, "name", None), index=index)

def string_to_blank_save_date(col: Sequence) -> Sequence:
    """Replace non-date strings with NaN; keep everything else as-is.

    A value is blanked when ``try_str_to_date`` hands it back as a string,
    i.e. it is a string that cannot be parsed as a date. Date strings and
    non-string values are kept unchanged (they are not converted).

    Args:
        col: Values to clean, typically a ``pd.Series``.

    Returns:
        A ``pd.Series`` of the cleaned values. When ``col`` is a Series its
        name and index are preserved, so the result stays aligned with the
        input (``Series.compare`` requires identical labels).
    """
    result = [np.nan if isinstance(try_str_to_date(x), str) else x for x in col]
    # Plain sequences have no pandas index/name; fall back to the defaults.
    index = col.index if isinstance(col, pd.Series) else None
    return pd.Series(result, name=getattr(col, "name", None), index=index)

# def comma_split(df: pd.DataFrame, col_name: str, col: Sequence) -> pd.DataFrame:
#     """comma separates values into new columns"""
#     result = pd.DataFrame([x.split(',') for x in df[col_name].tolist()])
#     return result

# def string_to_flag_col(df, col_name):
#     df[f"col_name_notes"] = df.apply(lambda x: 1 if type(x[col_name]) == str else np.nan, axis =1)
#     second_col = [1 for x in df[col_name] if type(x)==str]
#     first_col = string_to_blank(df["col_name"])

# def value_to_notes_col(df, col_name: str, vals_to_notes: List):
#     new_col_name=f"{col_name}_notes"
#     notes_col = [x if x in vals_to_notes else x for x in df[col_name]]
#     amended_col = string_to_blank(df[col_name])
#     return df[col_name], [amended_col, notes_col]

    

In [None]:
# Per-column scrub instructions, keyed by column name.
#   "format"           -- "date" or "numeric": which string values are kept.
#   "string_intention" -- what happens to the remaining strings:
#                           "blank": replaced with NaN, no notes column
#                           "notes": copied to a "<column>_notes" column
#                           "mixed": only the strings listed in
#                                    "vals_to_notes" are copied to notes
#   "vals_to_notes"    -- strings to keep as notes (used with "mixed").
PAT_POP_SCRUB_INSTRUCTIONS = {
    "OCT_DATE": {
        "format": "date",
        "string_intention": "mixed",
        "vals_to_notes": ["NO ATROPHY", "OD CONVERTS TO WET"],
    },
    "Sub-RPE 5mm OD": {"format": "numeric", "string_intention": "notes"},
    "dist_fovea_OD": {"format": "numeric", "string_intention": "blank"},
    "cst_OD": {"format": "numeric", "string_intention": "notes"},
    "Sub-RPE 5mm OS": {"format": "numeric", "string_intention": "notes"},
    "questionable_OS": {"format": "numeric", "string_intention": "blank"},
}

In [255]:
def display_check(result):
    """Show the rows where the first new column differs from the old one.

    Args:
        result: A pair ``(old_col, new_cols)`` where ``old_col`` is a Series
            and ``new_cols`` is a list of Series.

    Returns:
        With a single new column, the ``Series.compare`` frame between the
        old column and that new column. With more than one, the changed rows
        of a frame holding the old column and every new column side by side.
    """
    before, after = result[0], result[1]
    first_new = after[0]
    changed = before.compare(first_new, result_names=(before.name, first_new.name))
    if len(after) <= 1:
        return changed
    # Several new columns: stack them next to the old one and keep only the
    # rows that changed.
    changed_rows = list(changed.index)
    side_by_side = pd.DataFrame([before] + after).T
    return side_by_side.loc[changed_rows]

In [None]:
class PatpopScrub:
    """Scrub one PAT_POP column according to a per-column instruction mapping.

    The instructions must contain ``"format"`` (``"date"`` or ``"numeric"``)
    and ``"string_intention"``:

    * ``"blank"`` -- unwanted strings become NaN; no notes column.
    * ``"notes"`` -- (numeric only) non-numeric strings are copied to a
      ``"<name>_notes"`` column.
    * ``"mixed"`` -- (date only) only the strings listed under
      ``"vals_to_notes"`` are copied to the notes column.

    Attributes set by ``__init__``: ``instructions``, ``format``,
    ``string_intention``, ``old_col`` (a copy of the input named
    ``old_<name>``), ``old_col_name``, ``notes_col_name`` (``None`` for
    ``"blank"``) and ``notes`` (a Series, or ``None`` for ``"blank"``).
    """

    def __init__(self, col: Sequence, instructions: Mapping):
        """Store the column and instructions and build the notes column.

        Args:
            col: The column to scrub; must expose ``.name``, ``.index`` and
                ``.copy()`` (i.e. a ``pd.Series``).
            instructions: Mapping with ``"format"``, ``"string_intention"``
                and, for ``"mixed"``, ``"vals_to_notes"``.

        Raises:
            ValueError: If ``string_intention`` is unknown or is not allowed
                with the given ``format``.
        """
        self.instructions = instructions
        self.format = instructions["format"]
        self.string_intention = instructions["string_intention"]
        # Keep the column under its original name for cleaning; the copy
        # returned to callers is renamed so both can sit side by side.
        self._col = col
        self.old_col_name = f"old_{col.name}"
        self.old_col = col.copy()
        self.old_col.name = self.old_col_name
        self.notes_col_name = f"{col.name}_notes" if self.string_intention != "blank" else None
        self.notes = self._get_notes()

    def _get_notes(self):
        """Build the notes Series, or return ``None`` for ``"blank"``."""
        if self.string_intention == "blank":
            return None
        if self.string_intention == "mixed":
            if self.format != "date":
                raise ValueError(f"Cannot have string_intention 'mixed' with format {self.format}")
            vals_to_notes = self.instructions["vals_to_notes"]
            values = [x if x in vals_to_notes else np.nan for x in self._col]
        elif self.string_intention == "notes":
            if self.format != "numeric":
                raise ValueError(f"Cannot have string_intention 'notes' with format {self.format}")
            # A value is a note when it is a string that is not numeric.
            values = [x if isinstance(try_str_to_float(x), str) else np.nan for x in self._col]
        else:
            raise ValueError(f"Unknown string_intention {self.string_intention!r}")
        # Reuse the input's index so notes stay aligned with the other columns.
        return pd.Series(values, name=self.notes_col_name, index=self._col.index)

    def clean(self) -> Tuple[Sequence, List[Sequence]]:
        """Return ``(old_col, [amended_col])`` or ``(old_col, [amended_col, notes])``.

        Raises:
            ValueError: If ``format`` is neither ``"numeric"`` nor ``"date"``.
        """
        if self.format == "numeric":
            amended_col = string_to_blank_save_numeric(self._col)
        elif self.format == "date":
            amended_col = string_to_blank_save_date(self._col)
        else:
            raise ValueError(f"Unsupported format {self.format!r}")
        new_cols = [amended_col]
        # `is not None` rather than truthiness: bool(Series) raises ValueError.
        if self.notes is not None:
            new_cols.append(self.notes)
        return (self.old_col, new_cols)




In [256]:
def pat_pop_date_str2mixed(col: Sequence, vals_to_notes: List[str]) -> Tuple[Sequence, List[Sequence]]:
    """Scrub a date column: listed strings go to notes, other strings to blank.

    Args:
        col: The date column (a ``pd.Series``).
        vals_to_notes: Values that should be copied to the notes column.

    Returns:
        ``(old_col, [amended_col, notes_col])`` where ``old_col`` is a copy of
        the input named ``old_<name>``, ``amended_col`` is the output of
        ``string_to_blank_save_date`` and ``notes_col`` (named
        ``<name>_notes``) holds the values found in ``vals_to_notes`` and NaN
        elsewhere.
    """
    old_col = col.copy()
    old_col.name = f"old_{col.name}"
    # Reuse the input's index so the notes line up row-for-row with old_col.
    notes_col = pd.Series(
        [x if x in vals_to_notes else np.nan for x in col],
        name=f"{col.name}_notes",
        index=col.index,
    )
    amended_col = string_to_blank_save_date(col)
    return (old_col, [amended_col, notes_col])


In [277]:
def pat_pop_numeric_str2notes(col: Sequence) -> Tuple[Sequence, List[Sequence]]:
    """Scrub a numeric column: non-numeric strings are moved to a notes column.

    Applied to Sub-RPE_5mm_OD, CST_OD, Sub-RPE_5mm_OS.

    Args:
        col: The numeric column (a ``pd.Series``).

    Returns:
        ``(old_col, [amended_col, notes_col])`` where ``old_col`` is a copy of
        the input named ``old_<name>``, ``amended_col`` is the output of
        ``string_to_blank_save_numeric`` and ``notes_col`` (named
        ``<name>_notes``) holds the non-numeric strings and NaN elsewhere.
    """
    old_col = col.copy()
    old_col.name = f"old_{col.name}"
    amended_col = string_to_blank_save_numeric(col)
    # Reuse the input's index so the notes line up row-for-row with old_col.
    notes_col = pd.Series(
        [x if isinstance(try_str_to_float(x), str) else np.nan for x in col],
        name=f"{col.name}_notes",
        index=col.index,
    )
    return (old_col, [pd.Series(amended_col, name=col.name), notes_col])



In [281]:
def pat_pop_numeric_str2blank(col: Sequence) -> Tuple[Sequence, List[Sequence]]:
    """Scrub a numeric column by blanking its non-numeric strings.

    Used for dist_fovea_OD and questionable_OS.

    Returns:
        ``(original, [blanked])``: a copy of the input renamed to
        ``old_<name>``, and a one-element list holding the cleaned column
        under the input's own name.
    """
    original = col.copy()
    original.name = f"old_{col.name}"
    blanked = pd.Series(string_to_blank_save_numeric(col), name=col.name)
    return (original, [blanked])

In [283]:
# Try the blank-only numeric scrub on one column.
# res is (old column, [cleaned column]).
questionable_os = df["questionable_OS"]
res = pat_pop_numeric_str2blank(questionable_os)

In [237]:
# NOTE(review): `cols` is not assigned at top level in any visible cell (it
# only appears as a local inside display_check) -- presumably left over from
# an earlier session; confirm where it comes from before a fresh-kernel run.
len(cols)

1472

In [233]:
# Row labels where the cleaned column differs from the original one.
# display_check performs the same old-vs-new compare on res.
list(display_check(res).index)

[75, 357, 495, 1144, 1149, 1370]

In [204]:
# Index of the rows changed by the scrub (same compare as display_check).
idx = display_check(res).index

In [205]:
# Materialise the changed-row labels as a plain list.
ilist = [*idx]