In [2]:
from datetime import datetime
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd
import json

In [3]:
data = pd.read_csv('myFitnessPal_parsed.csv')
data.head()

Unnamed: 0,user_id,date,sequence,food_ids,total_calories,total_carbs,total_fat,total_protein,total_sodium,total_sugar,goal_calories,goal_carbs,goal_fat,goal_protein,goal_sodium,goal_sugar
0,1,2014-09-15,1,"[1, 2, 3, 4, 4]",2430,96,37.0,50.0,855.0,63.0,1572.0,196.0,52.0,79.0,2300.0,59.0
1,1,2014-09-16,1,"[5, 1, 2, 3, 6, 7]",1862,158,54.0,114.0,2215.0,100.0,1832.0,229.0,61.0,92.0,2300.0,69.0
2,1,2014-09-17,1,"[1, 2, 3, 6, 8, 9, 10]",2251,187,60.0,98.0,1765.0,105.0,1685.0,210.0,56.0,85.0,2300.0,63.0
3,1,2014-09-18,1,"[1, 6, 2, 3, 11, 12]",2001,113,81.0,202.0,1101.0,71.0,1597.0,199.0,53.0,80.0,2300.0,60.0
4,1,2014-09-19,1,"[1, 7, 13, 12, 2, 3, 12, 12]",2158,180,89.0,115.0,1998.0,84.0,1589.0,198.0,53.0,80.0,2300.0,60.0


In [3]:
with open('foods.json') as json_file:
     foods = json.load(json_file)

FileNotFoundError: [Errno 2] No such file or directory: 'foods.json'

In [3]:
# # split date and create separated columns
# data["year"] = data["date"].apply(lambda x: int(x[:4]))
# data["month"] = data["date"].apply(lambda x: int(x[5:7]))
# data["day"] = data["date"].apply(lambda x: int(x[-2:]))
# data = data.drop(columns = 'date')

In [4]:
null_value_frequency = data.isna().sum()
null_value_frequency[null_value_frequency!=0] 

total_fat          877
total_protein      877
total_sodium      1305
total_sugar       1307
goal_calories     1922
goal_carbs        1925
goal_fat         27313
goal_protein     27318
goal_sodium      67720
goal_sugar       67990
dtype: int64

In [5]:
# As we saw there are 10 columns which have sometimes null values so :
#In this cell we drop rows which have null values in more than 3 columns

nullColumns= data.loc[data.isnull().sum(axis=1)>3].index
nullColumns=pd.DataFrame(nullColumns,columns={'index'})
print('Values which have null in more than 3 columns : ',(len(nullColumns)/len(data))*100,'%')

cond = data.index.isin(nullColumns['index']) 
data.drop(data[cond].index, inplace = True)

Values which have null in more than 3 columns :  4.6515073588266755 %


In [6]:
#just check how columns null distribution has changed
null_value_frequency = data.isna().sum()
null_value_frequency[null_value_frequency!=0] 

goal_protein        5
goal_sodium     40407
goal_sugar      40677
dtype: int64

In [7]:
# seems like there are people who have set goal_calories to zero and it should also be the mistake
# so lets drop those ones too

zeros= data[data['goal_calories']==0].index
zeros=pd.DataFrame(zeros,columns={'index'})
print('Examples which have zero in goal_calories : ',(len(zeros)/len(data))*100,'%')

cond = data.index.isin(zeros['index']) 
data.drop(data[cond].index, inplace = True)

Examples which have zero in goal_calories :  0.09484293759477595 %


In [8]:
# create new column based on foods quantity
data['foods_len'] = data["food_ids"].apply(lambda x: len(x[1:-1].split(',')))

In [9]:
# add new feature which shows how many days are logged by each applicants
user_logged_freq = data["user_id"].value_counts()
user_logged_df = pd.DataFrame(data["user_id"].unique(),columns = ['user_id'])
user_logged_df["logged_frequency"] = user_logged_df["user_id"].apply(lambda _id: user_logged_freq[_id])

In [10]:
# this function gets all nutrition values with goals and checks if calories difference is less than percentage of goals
# TODO:
def check_bounds(total_calories, total_carbs, total_fat, total_protein, total_sodium, total_sugar, 
               goal_calories, goal_carbs, goal_fat, goal_protein, goal_sodium, goal_sugar,percent):
    
    return (abs(goal_calories - total_calories) < goal_calories * percent / 100)

In [11]:
# this function checks last days for user and counts number of days when his nutrient was in goal range
def reach_goal(df,user_id,num_days):
    allowed_difference_percentage = 15
    tails = df[df["user_id"]==user_id].tail(num_days)
    tails["reach_goal"] = tails.apply(lambda row: check_bounds(*(row.values[4:16]),allowed_difference_percentage),axis=1)
    return tails["reach_goal"].sum()

In [12]:
# create new feature which shows if user reaches goals in last days 
# and return 1 if the number of days when user reached goal is greater than threshold else 0
# TODO:

number_of_last_days = 5
threshold = 2
user_logged_df["reach_goal"] = user_logged_df['user_id'].apply(lambda x: reach_goal(data,x,number_of_last_days))
user_logged_df["reach_goal"] = user_logged_df["reach_goal"].apply(lambda x: 1 if x>=threshold else 0)

In [17]:
# this function counts whole days between start and end date and calculates missed days for new feature
def days_missed(d1, d2,loggedDays):
    d1 = datetime.strptime(str(d1), "%Y-%m-%d")
    d2 = datetime.strptime(str(d2), "%Y-%m-%d")
    return abs(abs((d2 - d1).days)-loggedDays)

In [18]:
# this function calls days_missed regarding last and first records
def get_missed_days(df,userID,logged_frequency):
    tail = df[df["user_id"]==userID].tail(1)['date'].values[0]
    head = df[df["user_id"]==userID].head(1)['date'].values[0]
    return days_missed(tail,head,logged_frequency)   

In [19]:
# add new feature based on how many days are missed for each user
user_logged_df['days_missed'] = user_logged_df[['user_id','logged_frequency']].apply(lambda x: get_missed_days(data,x.user_id,x.logged_frequency),axis=1)

In [20]:
def getHealtyDistributedValues(value,lower,upper):
    if(value<=lower):
        return lower-value # the difference person lacked 
    if(value >= upper):
        return value-upper # the difference person exceed 
    return 0 # method returns 0 for the values in range

In [21]:
# There is known that for healthy eating, daily carbs should be between 45-65% out of callories, fats between 10-35% and proteins 20-35%
# So this method counts daily norm distribution of person
# 0 for the people in persmissible range
# max value may be 1.3 (when person only took fats)

def healthyDistributed(carbs,fat,protein):
    totalCalories = fat*9+ carbs*4 + protein*4 # convert to calories (1g fat = 9 calories and etc..) and sum
    sum =  getHealtyDistributedValues ( carbs*4 / (totalCalories+0.00000001),0.45,0.65)
    sum += getHealtyDistributedValues ( protein*4 / (totalCalories+0.00000001),0.2,0.35)
    sum += getHealtyDistributedValues ( fat*9 / (totalCalories+0.00000001),0.1,0.35)
    return sum

In [22]:
#new column in data based on healthydistributed method to determine user's behavior
data['healtyDistrib']=data[['total_carbs','total_fat','total_protein']].apply(lambda x: healthyDistributed(x.total_carbs,x.total_fat,x.total_protein),axis=1)

In [23]:
data=data.drop(columns=['date','food_ids','sequence'])

In [24]:
# we need to have the same number of rows for each user so
# this function checks if number of rows for each user are greater than num_rows
# if it's greater, then this function returns the lasts rows 
# if less, then this fills them with -1

def row_padding(x,num_row):
    # get last row because I need same format and same id, others columns replaced by -1
    last_row = x.iloc[-1] 
    last_row[1:] = [-1]*len(last_row[1:])
    
    if np.shape(x)[0] < num_row:
        new_x = pd.DataFrame(x)
        for i in range(np.shape(x)[0],num_row):
            new_x = pd.DataFrame(new_x.append(last_row))
        return new_x
    else:
        return x.tail(num_row)

In [25]:
# create new columns based on goal and total nutritient
data["calories_diff"] = data["goal_calories"]-data["total_calories"]
data["carbs_diff"] = data["goal_carbs"]-data["total_carbs"]
data["fat_diff"] = data["goal_fat"]-data["total_fat"]
data["protein_diff"] = data["goal_protein"]-data["total_protein"]
data["sodium_diff"] = data["goal_sodium"]-data["total_sodium"]
data["sugar_diff"] = data["goal_sugar"]-data["total_sugar"]

In [26]:
# this function flattens all rows for each user which we padded already 
# so creates one vector because we need one input for each user
def flatten_rows(x,cols):
    for i in range(1,x.shape[0]):
        for j in range(1,len(cols)):
            temp_row = x.iloc[i]
            x[cols[j]+"_"+str(i)] = temp_row[j]
    return x.head(1)

In [27]:
data = data.groupby('user_id').apply(row_padding,5).reset_index(drop=True)

In [28]:
cols = data.columns
data = data.groupby('user_id').apply(flatten_rows,cols).reset_index(drop=True)

In [29]:
data

Unnamed: 0,user_id,total_calories,total_carbs,total_fat,total_protein,total_sodium,total_sugar,goal_calories,goal_carbs,goal_fat,...,goal_sodium_4,goal_sugar_4,foods_len_4,healtyDistrib_4,calories_diff_4,carbs_diff_4,fat_diff_4,protein_diff_4,sodium_diff_4,sugar_diff_4
0,1.0,4122.0,464.0,196.0,168.0,5787.0,125.0,4578.0,572.0,153.0,...,2300.0,87.0,6.0,0.081410,1121.0,176.0,34.0,18.0,1162.0,26.0
1,2.0,1548.0,153.0,52.0,44.0,1752.0,100.0,1320.0,165.0,44.0,...,2300.0,50.0,2.0,0.000000,1264.0,154.0,43.0,59.0,2193.0,40.0
2,3.0,1468.0,158.0,41.0,55.0,1588.0,15.0,1486.0,185.0,49.0,...,2300.0,25.0,6.0,0.076138,2.0,-22.0,10.0,26.0,593.0,14.0
3,4.0,645.0,63.0,39.0,27.0,1454.0,192.0,1450.0,181.0,73.0,...,2300.0,3500.0,8.0,0.085167,4.0,-24.0,14.0,4.0,1182.0,1440.0
4,5.0,3754.0,196.0,33.0,124.0,2569.0,90.0,3399.0,212.0,46.0,...,2300.0,90.0,14.0,0.000000,155.0,45.0,12.0,56.0,-844.0,-32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9396,9893.0,1140.0,47.0,49.0,13.0,2000.0,8.0,1932.0,97.0,242.0,...,2300.0,73.0,1.0,0.650000,1122.0,28.0,244.0,62.0,2140.0,73.0
9397,9894.0,1873.0,237.0,75.0,64.0,50.0,29.0,1660.0,208.0,55.0,...,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
9398,9895.0,1755.0,221.0,54.0,78.0,8.0,85.0,2024.0,202.0,56.0,...,44.0,44.0,19.0,0.118536,-132.0,-21.0,-46.0,80.0,27.0,-20.0
9399,9896.0,338.0,20.0,30.0,11.0,8.0,18.0,1290.0,43.0,161.0,...,25.0,48.0,5.0,0.452941,734.0,21.0,139.0,-8.0,14.0,36.0


In [30]:
# merge two new features with main data
data = pd.merge(data, user_logged_df, on=['user_id'])

In [31]:
null_value_frequency = data.isna().sum()
null_value_frequency[null_value_frequency!=0] 

goal_sodium       683
goal_sugar        688
sodium_diff       683
sugar_diff        688
goal_protein_1      1
goal_sodium_1     649
goal_sugar_1      652
protein_diff_1      1
sodium_diff_1     649
sugar_diff_1      652
goal_sodium_2     634
goal_sugar_2      638
sodium_diff_2     634
sugar_diff_2      638
goal_protein_3      1
goal_sodium_3     615
goal_sugar_3      617
protein_diff_3      1
sodium_diff_3     615
sugar_diff_3      617
goal_sodium_4     605
goal_sugar_4      610
sodium_diff_4     605
sugar_diff_4      610
dtype: int64

In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9401 entries, 0 to 9400
Columns: 104 entries, user_id to days_missed
dtypes: float64(101), int64(3)
memory usage: 7.5 MB


In [33]:
# handle missing data with KNNImputer
imputer = KNNImputer()
data[data.columns] = np.round(imputer.fit_transform(data))
data[data.columns] = np.round(imputer.fit_transform(data))

In [34]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [32]:
data.describe()

Unnamed: 0,user_id,total_calories,total_carbs,total_fat,total_protein,total_sodium,total_sugar,goal_calories,goal_carbs,goal_fat,goal_protein,goal_sodium,goal_sugar,foods_len,healtyDistrib,calories_diff,carbs_diff,fat_diff,protein_diff,sodium_diff,sugar_diff,total_calories_1,total_carbs_1,total_fat_1,total_protein_1,total_sodium_1,total_sugar_1,goal_calories_1,goal_carbs_1,goal_fat_1,goal_protein_1,goal_sodium_1,goal_sugar_1,foods_len_1,healtyDistrib_1,calories_diff_1,carbs_diff_1,fat_diff_1,protein_diff_1,sodium_diff_1,sugar_diff_1,total_calories_2,total_carbs_2,total_fat_2,total_protein_2,total_sodium_2,total_sugar_2,goal_calories_2,goal_carbs_2,goal_fat_2,goal_protein_2,goal_sodium_2,goal_sugar_2,foods_len_2,healtyDistrib_2,calories_diff_2,carbs_diff_2,fat_diff_2,protein_diff_2,sodium_diff_2,sugar_diff_2,total_calories_3,total_carbs_3,total_fat_3,total_protein_3,total_sodium_3,total_sugar_3,goal_calories_3,goal_carbs_3,goal_fat_3,goal_protein_3,goal_sodium_3,goal_sugar_3,foods_len_3,healtyDistrib_3,calories_diff_3,carbs_diff_3,fat_diff_3,protein_diff_3,sodium_diff_3,sugar_diff_3,total_calories_4,total_carbs_4,total_fat_4,total_protein_4,total_sodium_4,total_sugar_4,goal_calories_4,goal_carbs_4,goal_fat_4,goal_protein_4,goal_sodium_4,goal_sugar_4,foods_len_4,healtyDistrib_4,calories_diff_4,carbs_diff_4,fat_diff_4,protein_diff_4,sodium_diff_4,sugar_diff_4,logged_frequency,days_missed
count,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0,9401.0
mean,4940.44708,1307.712797,136.98181,64.082332,81.627167,1125.858632,365.255079,1637.956919,190.677907,88.308903,159.571322,1395.28231,476.992873,9.413467,0.146261,330.244123,53.696096,24.226572,77.944155,385.440698,142.846612,1187.55409,125.54792,57.815764,77.184874,966.440911,344.850441,1564.389427,181.588023,84.307946,149.796405,1339.745665,443.171577,8.963621,0.095415,376.792682,55.997447,26.449527,72.695139,385.733113,142.087757,1149.923093,121.927986,56.331029,73.54324,935.517498,338.908733,1520.839272,177.478992,82.241889,147.76577,1292.434422,425.960962,8.618977,0.061483,370.84491,55.479736,25.839592,74.151261,370.459526,128.502819,1084.371556,117.521221,53.555154,68.150941,865.839911,320.984789,1476.699606,170.8364,79.824806,142.569939,1262.074886,377.287948,8.206999,0.037017,392.23306,53.220189,26.174662,74.513456,410.380704,125.058611,879.215509,93.1753,44.405063,57.152643,719.053079,279.843208,1421.514201,164.528667,76.797043,138.365068,1232.18764,352.835443,6.802255,0.028933,542.185512,71.240187,32.2788,81.099245,527.397298,147.565897,59.498138,54.984895
std,2861.680298,5496.419329,511.389537,249.110455,235.274781,9978.05267,790.562314,650.108338,206.311328,232.900462,365.941651,1126.417777,908.005486,5.93294,0.353387,5527.219323,516.284889,222.975696,334.805822,1087.332985,557.110837,702.14408,182.338371,176.305794,212.817717,1287.085629,775.672249,707.34897,204.725837,227.953323,347.650831,1137.808675,885.08574,6.135557,0.414294,775.678923,165.270603,177.68592,313.824691,1064.138009,552.085559,802.718889,178.185946,172.229192,201.758008,1248.833285,822.739465,747.039555,251.142847,226.164454,361.31348,1146.291464,877.539091,6.300398,0.447506,836.851151,223.526089,181.475347,327.795046,1012.191108,631.920548,737.838147,181.996306,167.519819,174.737521,1200.546056,764.854094,772.238405,197.948068,222.253989,342.470061,1149.568911,819.260003,6.437053,0.475027,774.284027,156.142406,168.972949,308.212879,999.220467,564.399959,880.245214,138.915397,157.788477,166.234703,1375.569998,685.867975,777.034282,195.156451,216.514978,340.340534,1155.773727,797.137291,5.990945,0.504462,944.570764,151.546167,166.806322,307.415336,1311.313302,556.272596,54.524395,48.223582
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-529773.0,-46975.0,-9029.0,-6700.0,-22596.0,-5220.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-8800.0,-6303.0,-3744.0,-6087.0,-23902.0,-4626.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-31925.0,-4597.0,-3594.0,-5417.0,-12012.0,-21711.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-11571.0,-3616.0,-4190.0,-3564.0,-22676.0,-6617.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-54130.0,-2996.0,-3940.0,-4224.0,-83090.0,-7522.0,1.0,0.0
25%,2461.0,770.0,57.0,21.0,30.0,25.0,17.0,1302.0,127.0,45.0,67.0,51.0,45.0,5.0,0.0,6.0,-2.0,-4.0,6.0,2.0,1.0,662.0,49.0,18.0,25.0,20.0,14.0,1260.0,113.0,43.0,63.0,43.0,45.0,4.0,0.0,-1.0,-1.0,-2.0,2.0,-1.0,-1.0,575.0,42.0,16.0,22.0,17.0,12.0,1209.0,100.0,41.0,61.0,38.0,45.0,4.0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,477.0,36.0,13.0,18.0,14.0,10.0,1200.0,90.0,40.0,60.0,31.0,40.0,3.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,314.0,25.0,7.0,11.0,9.0,6.0,1200.0,76.0,40.0,60.0,25.0,32.0,2.0,0.0,-1.0,-1.0,-1.0,3.0,-1.0,-1.0,13.0,5.0
50%,4933.0,1256.0,115.0,43.0,58.0,426.0,42.0,1603.0,176.0,57.0,91.0,2300.0,65.0,9.0,0.0,362.0,48.0,16.0,35.0,47.0,26.0,1219.0,111.0,41.0,55.0,310.0,39.0,1581.0,173.0,55.0,88.0,2300.0,62.0,8.0,0.0,322.0,44.0,15.0,33.0,38.0,24.0,1184.0,104.0,39.0,52.0,239.0,37.0,1561.0,169.0,55.0,87.0,2300.0,61.0,8.0,0.0,319.0,40.0,14.0,31.0,30.0,22.0,1130.0,97.0,37.0,49.0,175.0,35.0,1548.0,166.0,54.0,85.0,2300.0,59.0,8.0,0.0,314.0,38.0,14.0,32.0,30.0,21.0,800.0,72.0,27.0,35.0,100.0,27.0,1500.0,163.0,52.0,83.0,2300.0,57.0,6.0,0.0,524.0,58.0,20.0,40.0,44.0,24.0,42.0,48.0
75%,7421.0,1645.0,175.0,66.0,88.0,1751.0,100.0,1962.0,226.0,75.0,130.0,2300.0,100.0,13.0,0.0,883.0,113.0,37.0,69.0,1035.0,58.0,1619.0,170.0,65.0,87.0,1650.0,93.0,1932.0,221.0,74.0,128.0,2300.0,100.0,13.0,0.0,853.0,109.0,37.0,67.0,1018.0,56.0,1591.0,167.0,65.0,84.0,1609.0,89.0,1920.0,220.0,73.0,128.0,2300.0,100.0,12.0,0.0,838.0,107.0,36.0,66.0,987.0,54.0,1548.0,165.0,62.0,81.0,1501.0,85.0,1902.0,216.0,72.0,125.0,2300.0,100.0,12.0,0.0,871.0,108.0,37.0,67.0,1032.0,52.0,1330.0,135.0,51.0,68.0,1190.0,70.0,1850.0,212.0,70.0,122.0,2300.0,93.0,10.0,0.0,1055.0,125.0,42.0,73.0,1281.0,54.0,92.0,95.0
max,9897.0,530000.0,47000.0,11329.0,9000.0,960000.0,7520.0,9000.0,4500.0,4700.0,4700.0,10000.0,5300.0,49.0,1.0,7644.0,4500.0,4700.0,4700.0,7347.0,5044.0,10000.0,9003.0,6044.0,6153.0,26202.0,6926.0,6460.0,4500.0,4700.0,4700.0,10000.0,5300.0,40.0,1.0,5114.0,4042.0,4500.0,4029.0,7871.0,4165.0,33615.0,4889.0,5576.0,5502.0,14312.0,24011.0,6401.0,15065.0,3720.0,10043.0,10000.0,14212.0,62.0,1.0,6091.0,14997.0,3646.0,10011.0,7736.0,14197.0,11893.0,5285.0,4847.0,4972.0,24976.0,8917.0,6424.0,4500.0,3500.0,4700.0,10000.0,8151.0,49.0,1.0,4569.0,3500.0,3500.0,3605.0,7941.0,8146.0,55330.0,4120.0,4668.0,4410.0,85390.0,9822.0,6387.0,3500.0,3500.0,4700.0,10000.0,5300.0,42.0,1.0,6003.0,3265.0,3500.0,4029.0,9599.0,3942.0,187.0,174.0


## data for prediction if user reach goals

In [36]:
# choosing users which have more logs than number_of_logs
number_of_logs = 60
data_2 = data[data["logged_frequency"]>number_of_logs]

# update dataframe

In [1]:
data.to_csv(r'myFitnessPal_parsed.csv',index = False)

NameError: name 'data' is not defined