In [20]:
import pandas as pd
import numpy as np
import os
import simplejson
import json
from sklearn.preprocessing import OneHotEncoder

In [2]:
def load_file(path):
    file_path = path
    df = pd.read_csv(file_path)
    #Automatic fixes
    df = df.replace(r'^\s*$', np.nan, regex=True) #replaces empty strings spacess with NaN
    return df


In [42]:
fileObjectArray = [
    {
        'path': '/Users/brandon/Desktop/Stroke-synthetic_testing-w-NaN&Text.csv',
        'storageId': 'Stroke-synthetic_testing-w-NaN&Text.csv',
    },
    {
        'path': '/Users/brandon/Desktop/Stroke-synthetic_training-w-NaN&Text.csv',
        'storageId': 'Stroke-synthetic_training-w-NaN&Text.csv',
    }
]

In [43]:

def analyze_encode_nonnumeric(fileObjectArray, target):
    df_array = []
    for file in fileObjectArray:

        df = load_file(file['path'])   
        df_array.append(df)

    df = pd.concat(df_array)

    print(df.shape)


    valid_data_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']

    valid = df.select_dtypes(include=valid_data_types)

    invalid = df.drop(columns=valid.columns)

    result = []

    for column in invalid.columns:
        col = invalid[column]

        total_count = col.shape[0]
        nan_count = col.isna().sum()

        col = col.dropna()
        not_nan_count = col.shape[0]

        numeric = ~pd.to_numeric(col, errors='coerce').isna()
        numeric_count = numeric.sum()

        if (numeric_count / total_count) > 0.6:
            invalid[column] = pd.to_numeric(col, errors='coerce')
            #proposed_transform
            t = {
                'name': column,
                'method': 'mixed_to_numeric',
                'values' : {
                    'counts': int(numeric_count),
                    'percent': float(round(numeric_count / total_count * 100))
                },
                'items': [
                    {'text': 'Convert column to numeric', 'value': 'mixed_to_numeric'},
                    {'text': 'Remove column', 'value': 'drop'}
                ],
                'selection': 'mixed_to_numeric'
            }
            result.append(t)

        else:
            list_length = len(list(col.unique()))

            print()
            if list_length == 1:


                t = {
                    'name': column,
                    'method': 'drop_single',
                    'unique_values': list(col.unique()),
                    'nan_row_index': list(col[col.isna() == True].index),
                    'items': [
                        {'text': 'Remove column', 'value': 'drop'},
                    ],
                    'selection': 'drop'
                }       

            elif list_length == 2:


                #give option to map values
                value_map = create_binary_map(list(col.unique()))
    
                t = {
                    'name': column,
                    'method': 'one_hot_encode_binary',
                    'unique_values': list(col.unique()),
                    'nan_row_index': list(col[col.isna() == True].index),
                    'items': [
                        {'text': 'Convert each unique value to a seperate binary column', 'value': 'one_hot_encode'},
                        {'text': 'Encode as a single binary column', 'value': 'binary_encode'},
                        {'text': 'Remove column', 'value': 'drop'}
                    ],
                    'selection': 'one_hot_encode',
                    'valueMap': value_map
                }     

            else:    



                t = {
                    'name': column,
                    'method': 'one_hot_encode',
                    'unique_values': list(col.unique()),
                    'nan_row_index': list(col[col.isna() == True].index),
                    'items': [
                        {'text': 'Convert each unique value to a seperate binary column', 'value': 'one_hot_encode'},
                        {'text': 'Remove column', 'value': 'drop'},
                    ],
                    'selection': 'one_hot_encode' if len(list(col.unique())) <=20 else 'drop' #rule to decide if column should be dropped by default
                }
            #ADD LATER
            # if len(t['unique_values']) == 2: #allow for boolean conversion if only two unique values
            #     t['items'].insert(1,{'text': 'Convert to boolean', 'value': 'mixed_to_boolean'}) 
            #     if True in t['unique_values']: #if true is one of the unique values, then use this as default
            #         t['selection'] = 'mixed_to_boolean'

            result.append(t)
    return {'fileAnalysisCombined': result} 


#EFFECT
#none



IndentationError: expected an indented block (546635456.py, line 158)

In [68]:

#TRANSFORM
def transform_encode_nonnumeric(fileObjectArray, target, transform):
    df_array = []
    for file in fileObjectArray:

        df_sub = load_file(file['path'])
        df_sub['storage_id'] = file['storageId']
        df_array.append(df_sub)

    df = pd.concat(df_array) #combine all files into one dataframe

    df.reset_index(inplace=True) #reset index to start at 0

    

    

    for column in transform['data']:




        if column['selection'] == 'mixed_to_numeric':
            print(column['name'])
            df[column['name']] = local_transform_mixed_to_numeric(df[column['name']])

        #ADD LATER
        # elif t['type'] == 'category_to_binary':
        #     df[column] = transform_category_to_binary(df[column], t['map'])
        
        elif column['selection'] == 'one_hot_encode':

            position = df.columns.get_loc(column['name'])     

            new_columns = local_transform_one_hot_encode(df[column['name']])

            
            for new_column in new_columns[0:0]:

                df.insert(position, new_column, new_columns[new_column])


            df = df.drop(column['name'], axis=1)

        elif column['selection'] == 'drop':
            df = df.drop(column['name'], axis=1)

        elif column['selection'] == 'binary_encode':
             #the valueMap property is created on the backend when two unique values exist and maniplated on front end
            df[column['name']] = df[column['name']].astype('str').map(column['valueMap']).astype('int')

        #ensure target remains at end of file


    col_list = list(df.columns)
    i = col_list.index(target)
    reorder_list = col_list[:i] + col_list[i + 1:] + [target]
    df = df[reorder_list]

    
    
    #Split files and save)

    df.set_index('index')

    print(df.columns.to_list())
    result = []

    grouped = df.groupby(df.storage_id)
    for file in fileObjectArray:
        file_index=0
        df = grouped.get_group(file['storageId'])
        df = df.drop('storage_id', axis=1)

        print(file)

        file_index += 1
    
    return result


def local_transform_one_hot_encode(series):
    working = pd.Series(series)
    working = working.dropna()
    index = working.index
    
    enc = OneHotEncoder(handle_unknown='ignore')
    vector = working.values.reshape(-1,1)
    enc.fit(vector)
    trans = enc.transform(vector).toarray()
    output = pd.DataFrame(trans, columns=enc.categories_, index=index).add_prefix(series.name + '_').astype('int')
    output.columns = output.columns.get_level_values(0) #convert multiindex to single index
    
    return output


In [69]:
analysis = analyze_encode_nonnumeric(fileObjectArray, 'stroke')

transform = analysis['fileAnalysisCombined']

transform_encode_nonnumeric(fileObjectArray, 'stroke', {'data': transform})

(4981, 21)

['index', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'gender_Female', 'gender_Male', 'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural', 'Residence_type_Urban', 'smoking_status_Unknown', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'Risk_low', 'Risk_int', 'Risk_high', 'smoking_status_smokes', 'storage_id', 'stroke']
{'path': '/Users/brandon/Desktop/Stroke-synthetic_testing-w-NaN&Text.csv', 'storageId': 'Stroke-synthetic_testing-w-NaN&Text.csv'}
{'path': '/Users/brandon/Desktop/Stroke-synthetic_training-w-NaN&Text.csv', 'storageId': 'Stroke-synthetic_training-w-NaN&Text.csv'}


[]