In [15]:
import pandas as pd
import seaborn as sns
import numpy as np

In [16]:
df = sns.load_dataset('exercise')
df.head()

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind
0,0,1,low fat,85,1 min,rest
1,1,1,low fat,85,15 min,rest
2,2,1,low fat,88,30 min,rest
3,3,2,low fat,90,1 min,rest
4,4,2,low fat,92,15 min,rest


In [17]:
len(df)

90

In [18]:
#id is individual being tested

#diet is the type of food consumed for some period i guess before

#pulse is the avg heartrate over the given time

#time is how long the 'kind' state lasted

#kind is what the individual did during the period

In [19]:
#turn diet from no/low fat to 0/1
def transform_target_var_binary(df,y_col_name,positive_state_str):
    df[y_col_name] = np.where(df[y_col_name] == positive_state_str, 1, 0)
    return df

In [20]:
df = transform_target_var_binary(df,'diet','low fat')
df

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind
0,0,1,1,85,1 min,rest
1,1,1,1,85,15 min,rest
2,2,1,1,88,30 min,rest
3,3,2,1,90,1 min,rest
4,4,2,1,92,15 min,rest
...,...,...,...,...,...,...
85,85,29,0,135,15 min,running
86,86,29,0,130,30 min,running
87,87,30,0,99,1 min,running
88,88,30,0,111,15 min,running


### The setup is that I want to K-Fold Target Encode 'Kind' for predictions about diet

In [9]:
def k_target_encode_getGroups(df,k_groups):
    #divide the data into k equal (as equal as possible) sized datasets
    n_rows = len(df)
    
    smallest_group_size = int(n_rows/k_groups)
    remaining_rows = n_rows - (smallest_group_size * k_groups)
    
    sample_from_df = df.copy(deep=True)
    #print(sample_from_df.shape)
    
    groups = list()
    
    for i in range(k_groups):
        selection = df.sample(n=smallest_group_size,replace=False)
        #selection = selection.drop("Unnamed: 0",axis=1)
        groups.append(selection)
        
        for inv,val in selection.iterrows():
            sample_from_df = sample_from_df[sample_from_df["Unnamed: 0"] != val["Unnamed: 0"]]
            
    for i in range(remaining_rows):
        groups[i] = np.append(groups[i],np.random.choice(sample_from_df.index.values,1,replace=False))
        sample_from_df.drop(groups[i][-1])
        
    #print(groups[0].shape)
    
    return groups

In [10]:
groupsTest = k_target_encode_getGroups(df,3)
print(groupsTest[0])

    Unnamed: 0  id  diet  pulse    time     kind
36          36  13     1     90   1 min  walking
32          32  11     1     84  30 min  walking
85          85  29     0    135  15 min  running
15          15   6     0     83   1 min     rest
19          19   7     0     88  15 min     rest
63          63  22     1     98   1 min  running
67          67  23     1    105  15 min  running
66          66  23     1     98   1 min  running
87          87  30     0     99   1 min  running
74          74  25     1    116  30 min  running
25          25   9     0     99  15 min     rest
30          30  11     1     86   1 min  walking
68          68  23     1     99  30 min  running
52          52  18     0     96  15 min  walking
70          70  24     1    132  15 min  running
0            0   1     1     85   1 min     rest
41          41  14     1    100  30 min  walking
59          59  20     0    103  30 min  walking
34          34  12     1    103  15 min  walking
61          61  21  

In [11]:
def k_target_encode_getMeanByValueByGroup(group,unique_values,encode_col,target_col):
    vallvl = {}
    valCountByGroup = {}
    for uv in unique_values:
        locdf = group.copy(deep=True)
        locdf = locdf[locdf[encode_col] == uv]
        counter = 0
        for index,row in locdf.iterrows():
            if row[target_col] == 1:
                counter = counter + 1
        vallvl[uv] = (counter,len(locdf))


    return vallvl

In [12]:
def k_target_encode_getOverallMeanByGroup(group,target_col):
    count = 0
    total = 0
    for ind,row in group.iterrows():
        count = count + row[target_col]
        total = total + 1
    result = count / total
    return result   

In [13]:
def k_target_encode(df, encode_col, target_col, k_groups, m_overall_mean_weight=2):
    #check to make sure that target column is binary
    if df[target_col].nunique() != 2:
        print('non-binary target column don\'t work')
        return None
    
    #check how many categories there are in the encode_col
    n_unique = df[encode_col].nunique()
    unique_vals = list(df[encode_col].unique())
    
    #divide the data into k equal (as equal as possible) sized datasets
    groups = k_target_encode_getGroups(df,k_groups)
    group_lengths = list()
    group_means = list()
    for group in groups:
        group_lengths.append(len(group))
        group_means.append(k_target_encode_getOverallMeanByGroup(group,target_col))
    
    #get the option means (the means of the unique values) of each group gotten
    option_means = list()
    group_counts = list()
    for group in groups:
        unique_values = list(group[encode_col].unique())
        this_group_counts = k_target_encode_getMeanByValueByGroup(group,unique_values,encode_col,target_col)
        for uv in unique_vals:
            if (uv not in this_group_counts.keys()):
                this_group_counts[uv] = 0
        group_counts.append(this_group_counts)
    
    #get the weighted means for each unique value for each group
    #the formula is N(n_rows_used_for_option_mean) * OM(average of option means for uv from all OTHER groups)
    #    + M(parameter) * FM(overall mean from all OTHER groups)
    #    ALL THAT DIVIDED BY N + M
    weighted_means_all = list()
    for i,group in enumerate(groups):
        #calculate for one group
        counts_for_n_calc = {}
        for v in unique_vals:
            counts_for_n_calc[v] = [0,0]
        
        for j,g in enumerate(groups):
            if i != j:
                for v in unique_vals:
                    counts_for_n_calc[v][0] = counts_for_n_calc[v][0] + group_counts[j][v][0]
                    counts_for_n_calc[v][1] = counts_for_n_calc[v][1] + group_counts[j][v][1] 
        
        option_means = {}
        for v in unique_vals:
            option_means[v] = counts_for_n_calc[v][0]/counts_for_n_calc[v][1]
        
        weighted_means = {}
        for v in unique_vals:
            weighted_means[v] = option_means[v] * counts_for_n_calc[v][1]
            weighted_means[v] = weighted_means[v] + (m_overall_mean_weight * group_means[i])
            weighted_means[v] = weighted_means[v] / (1 + m_overall_mean_weight)
            
        weighted_means_all.append(weighted_means)
    
    encoded_df = pd.DataFrame()
    for i,group in enumerate(groups):
        for index,row in group.iterrows():
            this_row = row
            this_row[encode_col] = weighted_means_all[i][row[encode_col]]
            encoded_df = pd.concat([encoded_df,this_row.to_frame().T])

    return encoded_df
    

In [14]:
groups = k_target_encode(df,'kind','diet',5)
groups

Unnamed: 0.1,Unnamed: 0,id,diet,pulse,time,kind
87,87,30,0,99,1 min,5.851852
25,25,9,0,99,15 min,2.185185
86,86,29,0,130,30 min,5.851852
52,52,18,0,96,15 min,6.518519
11,11,4,1,83,30 min,2.185185
...,...,...,...,...,...,...
48,48,17,0,103,1 min,5.074074
37,37,13,1,92,15 min,5.074074
63,63,22,1,98,1 min,5.407407
21,21,8,0,92,1 min,2.740741
