In [None]:
%pip install scikit-learn pandas seaborn matplotlib

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from modules.column import encode_columns, get_columns_by_type, get_columns_with_missing_values
from modules.fillers import fill_na_knn


In [2]:
df_init = sns.load_dataset('titanic')
df = df_init.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

def get_decoded_columns(encoded_df : pd.DataFrame, encoders: dict, embedded=False) -> pd.DataFrame :
    if embedded:
        decoded_df = encoded_df.copy()
    else:
        decoded_df = pd.DataFrame()

    for column in encoders.keys():
        series = encoded_df[column]
        column_encoder = encoders[column]

        original_column_name = column.removesuffix('_encoded')

        column_decoded_array = column_encoder.inverse_transform(series.astype('int64'))
        # decoded_df[original_column_name + '_decoded'] = column_decoded_array
        decoded_df[original_column_name + '_decoded'] = pd.DataFrame(column_decoded_array, columns=[original_column_name + '_decoded'], dtype='category')

       
    return decoded_df

def pipeline_missing_values_category_values(df : pd.DataFrame) -> pd.DataFrame :
    filled_df : pd.DataFrame = df.copy()

    # getting category columns with missing values
    null_columns        : pd.DataFrame  = get_columns_with_missing_values(filled_df, column_types=['category'])
    null_column_names   : list[str]     = list(null_columns.columns)

    # encode all categorical columns with missing values
    encoded_output = encode_columns(filled_df, columns=null_column_names, embedded=True)
    if 'encoded' in encoded_output and 'encoders':
        encoded_df  : pd.DataFrame              = encoded_output['encoded'] 
        encoders    : dict[str, LabelEncoder]   = encoded_output['encoders']
    else:
        raise ValueError('There was an error during the encoding process')
    
    # encoders.keys() -> has only encoded column names
    for column in encoders.keys():
        
        # transforming to numerical value for KNNImputer (Int64 handles NaNs)
        encoded_df[column] = encoded_df[column].astype('Int64')

    # filling in the missing values 
    filled_encoded_df : pd.DataFrame = fill_na_knn(encoded_df, columns=list(encoders.keys()))
    
    decoded_df = get_decoded_columns(filled_encoded_df, encoders, embedded=True)
    # rounding up the values for the inverse_transform


    return decoded_df


In [35]:
test = pipeline_missing_values_category_values(df)

In [9]:
test.info()

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   survived      891 non-null    int64   
 1   pclass        891 non-null    int64   
 2   sex           891 non-null    str     
 3   age           714 non-null    float64 
 4   sibsp         891 non-null    int64   
 5   parch         891 non-null    int64   
 6   fare          891 non-null    float64 
 7   embarked      889 non-null    str     
 8   class         891 non-null    category
 9   who           891 non-null    str     
 10  adult_male    891 non-null    bool    
 11  deck          203 non-null    category
 12  embark_town   889 non-null    str     
 13  alive         891 non-null    str     
 14  alone         891 non-null    bool    
 15  deck_encoded  891 non-null    float64 
dtypes: bool(2), category(2), float64(3), int64(4), str(5)
memory usage: 87.6 KB


In [37]:
test.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,deck_encoded,deck_decoded
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,4.6,E
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2.0,C
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,5.0,F
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,2.0,C
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,3.8,D


In [57]:
# decoded = encoded.map(lambda x: encoder.inverse_transform([int(x)])[0] if pd.notnull(x) else np.nan)
# df_work[column + '_decoded'] = decoded

In [58]:
encoded_df = test_output['encoded']

In [60]:
decode_output

Unnamed: 0,sex_decoded,embarked_decoded,who_decoded,embark_town_decoded,alive_decoded
0,male,S,man,Southampton,no
1,female,C,woman,Cherbourg,yes
2,female,S,woman,Southampton,yes
3,female,S,woman,Southampton,yes
4,male,S,man,Southampton,no
...,...,...,...,...,...
886,male,S,man,Southampton,no
887,female,S,woman,Southampton,yes
888,female,S,woman,Southampton,no
889,male,C,man,Cherbourg,yes
