In [19]:
import pandas as pd
import numpy as np
import re

from typing import Union, Iterable

import data_transformation as dt 

import os

from scipy.stats import iqr, zscore

os.chdir("../data")


df1 = pd.read_csv('insurance.csv')
df2 = pd.read_csv('boston_housing.csv')
df3 = pd.read_csv('mock.csv', index_col=0)



In [20]:
class Trim_DF:
    keys = ['df', 'variable', 'boundaries', 'value', 'scaling_factor', 'z_score', 'ddof']
    # def __init__(self):
    #     self.trimmed: list = []

    def inserted_trimmed_df(
        self,
        df: pd.core.frame.DataFrame,
        variable: str,
        boundaries: str = None,
        value: Union[int, float, tuple] = None,
        scaling_factor: Union[int, float, tuple] = None,
        z_score: Union[str, int, tuple] = None,
        ddof: int = None,
        ) -> pd.core.frame.DataFrame: 


        if value:
            if (scaling_factor is not None) or (z_score is not None) or (ddof is not None):
                raise Exception(f"To properly set the fixed values threshold(s), `scaling_factor`, `z_score`, `ddof`, must be set to None.")

            if isinstance(value, (int, float, complex)):
                lower_threshold, upper_threshold = (value, value)
                
            if isinstance(value, tuple):
                if value[0] > value[1]:
                    raise Exception('The minimum threshold must must be lower than the maximum threshold: `value = (min, max)`.')      
                  
                lower_threshold, upper_threshold = (value[0], value[1]) 
        
        if scaling_factor:

            if (value is not None) or (z_score is not None) or (ddof is not None):
                raise Exception(f"To properly set the IQR threshold(s), `value`, `z_score`, `ddof`, must be set to None.")
            
            if isinstance(scaling_factor, (int, float, complex)):
                lower_scaling_factor, upper_scaling_factor = (scaling_factor, scaling_factor)

            if isinstance(scaling_factor, tuple):
                lower_scaling_factor, upper_scaling_factor = (scaling_factor[0], scaling_factor[1])

            q1 = df[variable].describe()['25%']
            q3 = df[variable].describe()['75%']
            iqr_value = iqr(df[variable].values, nan_policy='omit')

            lower_threshold = q1 - lower_scaling_factor * iqr_value
            upper_threshold = q3 + upper_scaling_factor * iqr_value

        if z_score:
            df = df.copy()
            if (value is not None) or (scaling_factor is not None):    
                raise Exception(f"To properly set the z-score threshold(s), `value`, `scaling_factor`, must be set to None.")
            if ddof is None:
                ddof = 1

            var_name_z = f'{variable}_z_score'
            df[var_name_z] = zscore(df[variable].values, ddof=ddof, nan_policy='omit')
        
            variable = var_name_z

            if isinstance(z_score, (int, float, complex)):
                lower_threshold, upper_threshold = (z_score, z_score)
            if isinstance(z_score, tuple):
                if z_score[0] > z_score[1]:
                    raise Exception('The minimum threshold must must be lower than the maximum threshold.')        

                lower_threshold, upper_threshold = (z_score[0], z_score[1])
        
        if boundaries == 'lower':
            df = df.query(f'{lower_threshold} <= {variable}')
        if boundaries == 'upper':
            df = df.query(f'{variable} <= {upper_threshold}')
        if boundaries == 'both':
            df = df.query(f'{lower_threshold} <= {variable} <= {upper_threshold}')
        
        if z_score:
            df = df.drop(columns=[var_name_z])

        df = df.reset_index(drop=True)

        # self.trimmed.append(df)

        return df.reset_index(drop=True)

In [21]:
class Filter_Data:
    # def __init__(self):
    #     self.filtered: list = [] 

    # def insert_filtered_df(self, query_: str, df: pd.core.frame.DataFrame) -> None:
    #     filtered = df.query(query_)
    #     self.filtered.append(filtered)

    keys = ['df', 'query']

    def filter_data(
            self,
            query: str,
            df: pd.core.frame.DataFrame
            ) -> pd.core.frame.DataFrame: 
        
        filtered = df.query(query).reset_index(drop=True)

        return filtered

In [22]:
from typing import Union, Iterable

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split as split

def split_data(
    df: pd.core.frame.DataFrame,
    rand_state: Union[int, tuple, list],
    proportions: Union[float, Iterable],
    shuffle: Union[bool, int] = True,
    ) -> tuple:

    if isinstance(rand_state, int):
        rand_state_1, rand_state_2 = (rand_state, rand_state)
    elif isinstance(rand_state, (tuple, list)) and len(rand_state)==2:
        rand_state_1 = rand_state[0]
        rand_state_2 = rand_state[1]

    if (shuffle is True) or (shuffle == 1):
        shuffle_1, shuffle_2 = (True, None)
    elif shuffle == 2:   
        shuffle_1, shuffle_2 = (True, True)
    elif (shuffle is False) or (shuffle == 0):
        shuffle_1, shuffle_2 = (None, None)
    elif (
        (isinstance(shuffle, bool)) 
        or shuffle not in [0, 1, 2]
        ):
        raise Exception('`shuffle` only takes booleans (shuffle once or no shuffle), `1` (shuffle once) or `2` (shuffle twice).')

    if isinstance(proportions, float):
        train_size = proportions

    if isinstance(proportions, Iterable):
        if (len(proportions) > 2) or (len(proportions) == 0):
            raise Exception('If an iterable is passed (tuple or list), it can only store one float value - train size, or two float values - train and validation sizes (from which the test size is inferred).')

        if len(proportions) == 2:
            if sum(proportions) < 1:
                train_size = proportions[0]
                validation_size = proportions[1] / (1 - proportions[0])
            if sum(proportions) > 1:
                raise Exception('If proportions are specified for train and validation sets, such values should add up to 1.')
            if (sum(proportions) == 1):
                train_size = proportions[0]

        if (len(proportions) == 1):
            train_size = proportions[0]


    train, validation = split(
        df,
        random_state=rand_state_1,
        train_size=train_size,
        shuffle=shuffle_1,
        )
    
    if (
        isinstance(proportions, float)
        or all([isinstance(proportions, Iterable), (sum(proportions) == 1)])
        or all([isinstance(proportions, Iterable), (len(proportions) == 1)])
    ):
        test = pd.DataFrame()

        return (
            train.reset_index(drop=True),
            validation.reset_index(drop=True),
            test,
        )
    
    else:
        # The previous `validation` will be splitted further into a validation and test sets.
        validation, test = split(
            validation,
            train_size=validation_size,
            random_state=rand_state_2,
            shuffle=shuffle_2,
            )

        return (
            train.reset_index(drop=True),
            validation.reset_index(drop=True),
            test.reset_index(drop=True),
        )

In [23]:
df1.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [24]:
train, validation, test  = split_data(
    df=df1,
    rand_state=(5, 5),
    proportions= (0.6, 0.2,), 
    shuffle= False,
)

data_sets = {'main': df1, 'train': train, 'validation': validation, 'test': test}

In [25]:
trimmer_container = [
    {'df': 'all', 'variable': 'bmi', 'boundaries': 'upper', 'value': 45},
    {'df': None, 'variable': 'charges', 'boundaries': 'both', 'z_score': (-1, 1)},
    {'variable': 'charges', 'boundaries': 'both', 'z_score': (-1, 1)},
    {'df': 'validation', 'variable': 'charges', 'boundaries': 'both', 'z_score': (-1, 1)},
    {'df': ['validation', 'test'], 'variable': 'charges', 'boundaries': 'both', 'z_score': (-1, 1)},
    ]

query_container = [
    {'df': 'all', 'query': 'smoker == "yes"'},
    {'df': 'all', 'query': 'bmi > 30 & smoker == "yes"'},
    {'df': 'all', 'query': 'bmi > 30 & smoker == "yes" & sex == "yes"'},
    ]

In [26]:
query = Filter_Data()

queried = query.filter_data(
    df=data_sets['train'],
    query=query_container[0]['query'],
)

queried.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,40,female,32.775,2,yes,northwest,40003.33225
1,34,male,30.8,0,yes,southwest,35491.64
2,37,female,34.8,2,yes,southwest,39836.519
3,64,female,22.99,0,yes,southeast,27037.9141
4,25,male,24.13,0,yes,northwest,15817.9857
5,29,male,34.4,0,yes,southwest,36197.699
6,31,male,25.9,3,yes,southwest,19199.944
7,43,male,34.96,1,yes,northeast,41034.2214
8,57,male,28.975,0,yes,northeast,27218.43725
9,54,male,40.565,3,yes,northeast,48549.17835


In [1]:
a = 'soma'

if a is True:
    print('ok')

In [8]:
d = {'df': 'train', 'variable': 'bmi', 'boundaries': 'upper', 'value': 45}

', '.join([f'{key}: {d[key]}' for key in d])

'df: train, variable: bmi, boundaries: upper, value: 45'

In [7]:
[f'{key}: {d[key]}' for key in d]

['df: train', 'variable: bmi', 'boundaries: upper', 'value: 45']

In [3]:
idx = [1, 4, 6, 7, 9, 10]


l = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',]

c = all([False for i in idx if i not in range(len(l))])

In [3]:
type((1, 2,))

tuple