In [None]:
# %pip install scikit-learn pandas seaborn matplotlib

^C
Note: you may need to restart the kernel to use updated packages.


In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder



In [4]:
def get_histogram(df : pd.DataFrame, column : str) -> None:
    """Draw and display a histogram of one column.

    Args:
        df: Frame holding the data to plot.
        column: Label of the column whose values are binned.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    values = df[column]
    plt.hist(values)
    plt.show()

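# TODO: finish fill_na_column, a dispatcher that picks the fill strategy ('knn' or 'mean') by name; the sketch below is incomplete.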
# def fill_na_column(df, column, approach, **args):
#     approaches= {
#         'knn' : fill_na_knn(df, column, args['n_neighbors']),
#         'mean' : fill_na_mean(df, column)

#     }
#     df[column].fillna(approach)
#     return

def fill_na_mean(df, column):
    """Return ``df[column]`` with missing entries replaced by the column mean.

    The input frame is not modified; a new Series is returned.

    Args:
        df: Frame containing ``column``.
        column: Label of the column to fill.

    Returns:
        A Series equal to ``df[column]`` with missing values filled by its mean.
    """
    target = df[column]
    return target.fillna(target.mean())



# def fill_na_knn(df, n_neighbors=5):
#     df_numerical = df.copy().select_dtypes(include=['int64', 'float64'])
#     imputer = KNNImputer(n_neighbors=n_neighbors)
#     df_array = imputer.fit_transform(df_numerical)
#     df_imputed = pd.DataFrame(df_array, index=df_numerical.index, columns=df_numerical.columns)
#     return df_imputed

In [5]:
def delete_column(df, column: str) -> pd.DataFrame:
    """Return a copy of ``df`` without ``column``; the input frame is left untouched."""
    duplicate = df.copy()
    return duplicate.drop(labels=[column], axis='columns')

def delete_row(df, row_id) -> pd.DataFrame:
    """Return a copy of ``df`` without the row whose index label is ``row_id``."""
    duplicate = df.copy()
    return duplicate.drop(index=[row_id])

def change_cell_value(df: pd.DataFrame, column: str, row_id: int, new_value: str | int | float)-> pd.DataFrame:
    df_changed = df.copy()
    df_changed.loc[column,row_id] = new_value
    return df_changed

def get_rows(df: pd.DataFrame, start_row: int, end_row: int) -> pd.DataFrame:
    """Return the rows at positions ``start_row`` (inclusive) to ``end_row`` (exclusive)."""
    window = slice(start_row, end_row)
    return df.iloc[window]

def get_data(file_name:str) -> pd.DataFrame:
    """Load a table from ``file_name``, choosing the pandas reader by file extension.

    The extension is everything after the last ``.`` and is matched
    case-insensitively.

    Args:
        file_name: Path of the file to read.

    Returns:
        The loaded DataFrame. For HTML input, the first table found in the
        document.

    Raises:
        TypeError: If the extension is not one of the supported types. This
            includes ``sql``: ``pd.read_sql`` needs a database connection and
            cannot be driven by a file name alone.
    """
    readers = {
        'csv': pd.read_csv,
        # 'excel' is kept for backward compatibility; real Excel files end in
        # 'xlsx' or 'xls', which previously never matched.
        'excel': pd.read_excel,
        'xlsx': pd.read_excel,
        'xls': pd.read_excel,
        'json': pd.read_json,
        'xml': pd.read_xml,
        # read_html returns a list of DataFrames; keep the first so the
        # declared return type holds.
        'html': lambda path: pd.read_html(path)[0],
    }

    file_type = file_name.split('.')[-1].lower()
    reader = readers.get(file_type)
    if reader is None:
        # Join outside the f-string: reusing the same quote inside an f-string
        # is a SyntaxError before Python 3.12.
        supported = ', '.join(readers)
        raise TypeError(f'Try using one of the supported file types:\n{supported}')
    return reader(file_name)

def save_data(df: pd.DataFrame, file_name: str, file_type: str = 'csv', index:bool = True) -> None:
    """Write ``df`` to ``<file_name>.<extension>`` using the matching pandas writer.

    Args:
        df: Frame to persist.
        file_name: Output path without the extension.
        file_type: One of the supported types, matched case-insensitively.
            ``'excel'`` is written with an ``.xlsx`` extension; every other
            type uses ``file_type`` itself as the extension.
        index: Forwarded to the writer to control whether the index is written.

    Raises:
        TypeError: If ``file_type`` is not supported. This includes ``sql``:
            ``DataFrame.to_sql`` needs a database connection and cannot write
            to a plain file path.
    """
    writers = {
        'csv': df.to_csv,
        'excel': df.to_excel,
        'xlsx': df.to_excel,
        'json': df.to_json,
        'xml': df.to_xml,
        'html': df.to_html,
    }

    normalized = file_type.lower()
    writer = writers.get(normalized)
    if writer is None:
        # Join outside the f-string: reusing the same quote inside an f-string
        # is a SyntaxError before Python 3.12.
        supported = ', '.join(writers)
        raise TypeError(f'Try using one of the supported file types:\n{supported}')

    # pandas chooses the Excel engine from the file extension and does not
    # recognise '.excel', so map the 'excel' alias onto a real one.
    extension = 'xlsx' if normalized == 'excel' else file_type
    writer(f'{file_name}.{extension}', index=index)


In [6]:
def fill_na_knn(df:pd.DataFrame, column: str | None = None,  n_neighbors:int=5) -> pd.DataFrame:
    """Fill missing numeric values with scikit-learn's ``KNNImputer``.

    The imputer is fitted on the ``int64``/``float64`` columns of ``df`` only.
    Columns of other dtypes are copied through unchanged.

    Args:
        df: Source frame; it is not modified.
        column: If given, only this column is replaced by its imputed version.
            If ``None``, every imputed numeric column is written back.
        n_neighbors: Number of neighbours passed to ``KNNImputer``.

    Returns:
        A copy of ``df`` with the imputed values filled in. A summary of the
        result is printed through ``DataFrame.info()``.

    Raises:
        KeyError: If ``column`` is given but is not an imputable numeric
            column (wrong dtype, absent, or entirely missing).
    """
    df_filled = df.copy()
    df_numerical = df_filled.select_dtypes(include=['int64', 'float64'])
    # KNNImputer drops columns that are entirely missing from its output, which
    # would make the array narrower than the column list used to rebuild the
    # frame below. Exclude such columns up front; they stay untouched.
    df_numerical = df_numerical.dropna(axis='columns', how='all')

    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_array = imputer.fit_transform(df_numerical)
    df_imputed = pd.DataFrame(df_array, index=df_numerical.index, columns=df_numerical.columns)

    # Compare against None so that falsy but valid column labels are honoured.
    if column is not None:
        if column not in df_imputed.columns:
            raise KeyError(f'{column!r} is not a numeric column that can be imputed')
        df_filled[column] = df_imputed[column]
    else:
        df_filled.update(df_imputed)
    # info() prints the summary itself and returns None; wrapping it in print()
    # emitted a stray "None" line.
    df_filled.info()
    return df_filled

In [2]:
# Load seaborn's 'titanic' example dataset as the working DataFrame for the cells below.
df = sns.load_dataset('titanic')

In [None]:

def get_encoded_object_columns(df: pd.DataFrame) -> dict:
    """Label-encode every ``object``-dtype column of ``df``.

    One ``LabelEncoder`` is fitted per column on its non-missing values.
    Missing values are left as ``NaN`` in the encoded output, so a column with
    gaps comes out as float and a complete column as integer codes.

    Args:
        df: Source frame; it is not modified.

    Returns:
        A dict with two entries:
            ``'encoded'``: DataFrame sharing ``df``'s index, holding one
                ``<column>_encoded`` column per object column.
            ``'encoders'``: dict mapping each original column name to its
                fitted ``LabelEncoder``.
    """
    encoders = {}
    object_df = pd.DataFrame(index=df.index)

    # The frame is only read, so no defensive copy is needed.
    for column in df.select_dtypes(include=['object']).columns:
        series = df[column]
        present = series.notna()

        encoder = LabelEncoder()
        encoder.fit(series[present])
        encoders[column] = encoder

        # Encode all present values in one transform call instead of one call
        # per cell.
        if present.all():
            encoded = pd.Series(encoder.transform(series), index=series.index)
        else:
            encoded = pd.Series(np.nan, index=series.index, dtype='float64')
            if present.any():
                encoded.loc[present] = encoder.transform(series[present])
        object_df[column + '_encoded'] = encoded

    return {'encoded': object_df, 'encoders': encoders}

In [43]:
test_output = get_encoded_object_columns(df)

In [44]:
test_output

{'encoded':      sex_encoded  embarked_encoded  who_encoded  embark_town_encoded  \
 0              1               2.0            1                  2.0   
 1              0               0.0            2                  0.0   
 2              0               2.0            2                  2.0   
 3              0               2.0            2                  2.0   
 4              1               2.0            1                  2.0   
 ..           ...               ...          ...                  ...   
 886            1               2.0            1                  2.0   
 887            0               2.0            2                  2.0   
 888            0               2.0            2                  2.0   
 889            1               0.0            1                  0.0   
 890            1               1.0            1                  1.0   
 
      alive_encoded  
 0                0  
 1                1  
 2                1  
 3                1  
 

In [56]:
def get_decoded_columns(encoded_df : pd.DataFrame, encoders: dict) -> pd.DataFrame :
    """Map label-encoded columns back to their original values.

    Args:
        encoded_df: Frame whose columns are named ``<name>_encoded`` and hold
            integer codes, with missing values marking gaps.
        encoders: Maps each original column ``<name>`` to a fitted encoder
            exposing ``inverse_transform``.

    Returns:
        DataFrame sharing ``encoded_df``'s index, with one ``<name>_decoded``
        column per input column. Missing codes stay ``NaN``.

    Raises:
        KeyError: If a column has no matching entry in ``encoders``.
    """
    # Carry the input index over, as the encoding counterpart does.
    decoded_df = pd.DataFrame(index=encoded_df.index)
    for column in encoded_df.columns:
        series = encoded_df[column]

        original_column_name = column.removesuffix('_encoded')
        encoder = encoders[original_column_name]

        # Decode all present codes in one inverse_transform call instead of
        # one call per cell; missing codes stay NaN.
        present = series.notna()
        decoded = pd.Series(np.nan, index=series.index, dtype=object)
        if present.any():
            codes = series[present].astype(int).to_numpy()
            decoded.loc[present] = encoder.inverse_transform(codes)
        decoded_df[original_column_name + '_decoded'] = decoded

    return decoded_df

In [57]:
# decoded = encoded.map(lambda x: encoder.inverse_transform([int(x)])[0] if pd.notnull(x) else np.nan)
# df_work[column + '_decoded'] = decoded

In [58]:
encoded_df = test_output['encoded']

In [59]:
decode_output = get_decoded_columns(test_output['encoded'], test_output['encoders'])

In [60]:
decode_output

Unnamed: 0,sex_decoded,embarked_decoded,who_decoded,embark_town_decoded,alive_decoded
0,male,S,man,Southampton,no
1,female,C,woman,Cherbourg,yes
2,female,S,woman,Southampton,yes
3,female,S,woman,Southampton,yes
4,male,S,man,Southampton,no
...,...,...,...,...,...
886,male,S,man,Southampton,no
887,female,S,woman,Southampton,yes
888,female,S,woman,Southampton,no
889,male,C,man,Cherbourg,yes
