In [2]:
%pip install scikit-learn pandas seaborn matplotlib

Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-3.0.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (52 kB)
Collecting numpy>=1.24.1 (from scikit-learn)
  Downloading numpy-2.4.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Colle

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
def get_data(file_name:str) -> pd.DataFrame:
    """
    Loads a file into a DataFrame, picking the pandas reader from the file extension.

    Parameters
    ----------
    file_name : str
        Path of the file to read. The text after the last ``.`` selects the
        reader (case-insensitive).

    Returns
    -------
    pd.DataFrame
        The loaded data. For HTML input this is the first table found in the document.

    Raises
    ------
    TypeError
        If the extension is not one of the supported file types.

    Notes
    -----
    * Excel workbooks are matched by their real extensions (``xlsx`` / ``xls``);
      ``excel`` is kept as a legacy alias.
    * ``sql`` is not offered: ``pd.read_sql`` needs a query and a database
      connection, which a bare file name cannot supply.
    """
    def read_first_html_table(path: str) -> pd.DataFrame:
        # pd.read_html returns a list of DataFrames (one per table); the
        # declared return type of get_data is a single DataFrame.
        return pd.read_html(path)[0]

    readers = {
        'csv': pd.read_csv,
        'xlsx': pd.read_excel,
        'xls': pd.read_excel,
        'excel': pd.read_excel,
        'json': pd.read_json,
        'xml': pd.read_xml,
        'html': read_first_html_table,
    }

    file_type = file_name.rsplit('.', 1)[-1].lower()
    reader = readers.get(file_type)
    if reader is None:
        # Joined outside the f-string so the block also parses on Python < 3.12.
        supported = ', '.join(readers)
        raise TypeError(f'Try using one of the supported file types:\n{supported}')
    return reader(file_name)

def save_data(df: pd.DataFrame, file_name: str, file_type: str = 'csv', index:bool = True) -> None:
    """
    Writes a DataFrame to ``<file_name>.<extension>`` in the requested format.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to save.
    file_name : str
        Target path without the extension; the extension is appended here.
    file_type : str, default='csv'
        Output format (case-insensitive): csv, excel, xlsx, json, xml or html.
    index : bool, default=True
        Forwarded to the pandas writer as its ``index`` argument.

    Raises
    ------
    TypeError
        If ``file_type`` is not one of the supported file types.

    Notes
    -----
    * ``'excel'`` is written with an ``.xlsx`` extension, because pandas picks
      the Excel engine from the file extension and ``.excel`` is not one it knows.
    * ``sql`` is not offered: ``DataFrame.to_sql`` needs a database connection,
      which this signature cannot supply.
    """
    writers = {
        'csv': df.to_csv,
        'excel': df.to_excel,
        'xlsx': df.to_excel,
        'json': df.to_json,
        'xml': df.to_xml,
        'html': df.to_html,
    }

    kind = file_type.lower()
    writer = writers.get(kind)
    if writer is None:
        # Joined outside the f-string so the block also parses on Python < 3.12.
        supported = ', '.join(writers)
        raise TypeError(f'Try using one of the supported file types:\n{supported}')

    # 'excel' names a format, not an extension; every other type doubles as
    # its own extension and is appended exactly as the caller spelled it.
    extension = 'xlsx' if kind == 'excel' else file_type
    writer(f'{file_name}.{extension}', index=index)


In [6]:
# Load seaborn's 'titanic' example dataset as the initial, unmodified frame.
df_init = sns.load_dataset('titanic')

In [19]:
# Summarise only the int64/float64 columns (non-null counts, dtypes, memory usage).
df_init.select_dtypes(include=['int64', 'float64']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   age       714 non-null    float64
 3   sibsp     891 non-null    int64  
 4   parch     891 non-null    int64  
 5   fare      891 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 41.9 KB


In [20]:
# Work on a copy so that df_init keeps the data exactly as loaded.
df = df_init.copy()

In [None]:
def get_columns_by_types(df : pd.DataFrame, types : list, exclude : bool = False) -> pd.DataFrame:
    """
    Selects the columns of a DataFrame whose dtype is (or is not) in ``types``.

    Parameters
    ----------
    df : pd.DataFrame
        Source DataFrame; it is copied before the selection is made.
    types : list[str]
        Dtypes to match, in any form accepted by ``df.select_dtypes()``.
    exclude : bool, default=False
        False keeps the columns matching ``types``; True keeps every other column.

    Returns
    -------
    pd.DataFrame
        A new DataFrame holding only the selected columns.

    Notes (ref. df.select_dtypes())
    -----
    * All *numeric* types: ``np.number`` or ``'number'``
    * Strings: the ``object`` dtype, which returns *all* object dtype columns.
      With ``pd.options.future.infer_string`` enabled, ``"str"`` selects all string columns.
    * See the `numpy dtype hierarchy <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
    * Datetimes: ``np.datetime64``, ``'datetime'`` or ``'datetime64'``
    * Timedeltas: ``np.timedelta64``, ``'timedelta'`` or ``'timedelta64'``
    * Pandas categorical dtypes: ``'category'``
    * Pandas datetimetz dtypes: ``'datetimetz'`` or ``'datetime64[ns, tz]'``
    """
    # select_dtypes takes the dtype list under either keyword; pick the one asked for.
    selector = 'exclude' if exclude else 'include'
    return df.copy().select_dtypes(**{selector: types})

In [None]:
# TODO: finish pipeline: done(category/object) -> encoded -> fill_na -> decoded
def get_encoded_object_columns(df: pd.DataFrame) -> dict:

    encoders = {}
    object_df = pd.DataFrame(index=df.index)

    df_temp = df.copy()
    object_columns = df_temp.select_dtypes(include=['object']).columns
    for column in object_columns:
        series = df_temp[column]
        encoder = LabelEncoder()
        encoder.fit(series.dropna())
        encoders[column] = encoder

        encoded = series.map(lambda x: encoder.transform([x])[0] if pd.notnull(x) else np.nan) # type: ignore
        object_df[column + '_encoded'] = encoded

    return {'encoded': object_df, 'encoders': encoders}

In [43]:
# Encode df's object-dtype columns; the result is a dict with the encoded
# frame under 'encoded' and the fitted encoders under 'encoders'.
test_output = get_encoded_object_columns(df)

In [56]:
def get_decoded_columns(encoded_df : pd.DataFrame, encoders: dict) -> pd.DataFrame :
    """
    Maps label-encoded columns back to their original values, keeping NaN as NaN.

    Parameters
    ----------
    encoded_df : pd.DataFrame
        DataFrame of encoded columns. A trailing ``_encoded`` is stripped from
        each column name to find its encoder.
    encoders : dict
        Maps each original column name to a fitted encoder exposing
        ``inverse_transform``.

    Returns
    -------
    pd.DataFrame
        A DataFrame on ``encoded_df``'s index with one ``<column>_decoded``
        column per input column.

    Raises
    ------
    KeyError
        If a column has no matching entry in ``encoders``.
    """
    # Built on the input index so the result stays aligned with encoded_df
    # even when there are no columns to decode.
    decoded_df = pd.DataFrame(index=encoded_df.index)

    for column in encoded_df.columns:
        series = encoded_df[column]

        original_column_name = column.removesuffix('_encoded')
        encoder = encoders[original_column_name]

        # Decode each distinct code once and look rows up in a dict, instead
        # of calling encoder.inverse_transform() separately for every row.
        codes = series.dropna().astype(int).unique()
        labels = encoder.inverse_transform(codes) if len(codes) else []
        lookup = {int(code): label for code, label in zip(codes, labels)}

        decoded = series.map(lambda x: lookup[int(x)] if pd.notnull(x) else np.nan)
        decoded_df[original_column_name + '_decoded'] = decoded

    return decoded_df

In [57]:
# decoded = encoded.map(lambda x: encoder.inverse_transform([int(x)])[0] if pd.notnull(x) else np.nan)
# df_work[column + '_decoded'] = decoded

In [58]:
# Pull the frame of '<column>_encoded' columns out of the encoder result.
encoded_df = test_output['encoded']

In [60]:
# decode_output was displayed without ever being assigned in the notebook, so
# a fresh kernel raised NameError here. Build it from the encoded frame and
# the encoders fitted on the same columns.
decode_output = get_decoded_columns(encoded_df, test_output['encoders'])
decode_output

Unnamed: 0,sex_decoded,embarked_decoded,who_decoded,embark_town_decoded,alive_decoded
0,male,S,man,Southampton,no
1,female,C,woman,Cherbourg,yes
2,female,S,woman,Southampton,yes
3,female,S,woman,Southampton,yes
4,male,S,man,Southampton,no
...,...,...,...,...,...
886,male,S,man,Southampton,no
887,female,S,woman,Southampton,yes
888,female,S,woman,Southampton,no
889,male,C,man,Cherbourg,yes
