In [15]:
from datetime import datetime
import numpy as np
import pandas as pd
import json

In [16]:
data = pd.read_csv('myFitnessPal_parsed.csv')
data.head()

Unnamed: 0,user_id,date,sequence,food_ids,total_calories,total_carbs,total_fat,total_protein,total_sodium,total_sugar,goal_calories,goal_carbs,goal_fat,goal_protein,goal_sodium,goal_sugar
0,1,2014-09-15,1,"[1, 2, 3, 4, 4]",2430,96,37.0,50.0,855.0,63.0,1572.0,196.0,52.0,79.0,2300.0,59.0
1,1,2014-09-16,1,"[5, 1, 2, 3, 6, 7]",1862,158,54.0,114.0,2215.0,100.0,1832.0,229.0,61.0,92.0,2300.0,69.0
2,1,2014-09-17,1,"[1, 2, 3, 6, 8, 9, 10]",2251,187,60.0,98.0,1765.0,105.0,1685.0,210.0,56.0,85.0,2300.0,63.0
3,1,2014-09-18,1,"[1, 6, 2, 3, 11, 12]",2001,113,81.0,202.0,1101.0,71.0,1597.0,199.0,53.0,80.0,2300.0,60.0
4,1,2014-09-19,1,"[1, 7, 13, 12, 2, 3, 12, 12]",2158,180,89.0,115.0,1998.0,84.0,1589.0,198.0,53.0,80.0,2300.0,60.0


In [3]:
with open('foods.json') as json_file:
     foods = json.load(json_file)

FileNotFoundError: [Errno 2] No such file or directory: 'foods.json'

In [4]:
# # split date and create separated columns
# data["year"] = data["date"].apply(lambda x: int(x[:4]))
# data["month"] = data["date"].apply(lambda x: int(x[5:7]))
# data["day"] = data["date"].apply(lambda x: int(x[-2:]))
# data = data.drop(columns = 'date')

In [17]:
# create new column based on foods quantity
data['foods_len'] = data["food_ids"].apply(lambda x: len(x[1:-1].split(',')))

In [18]:
# create new columns based on goal and total nutritient
data["calories_diff"] = data["goal_calories"]-data["total_calories"]
data["carbs_diff"] = data["goal_carbs"]-data["total_carbs"]
data["fat_diff"] = data["goal_fat"]-data["total_fat"]
data["protein_diff"] = data["goal_protein"]-data["total_protein"]
data["sodium_diff"] = data["goal_sodium"]-data["total_sodium"]
data["sugar_diff"] = data["goal_sugar"]-data["total_sugar"]

In [19]:
#In this cell we drop rows which have null values in more than 4 columns

nullColumns= data.loc[data.isnull().sum(axis=1)>=5]['user_id'].unique()
nullColumns=pd.DataFrame(nullColumns,columns={'user_id'})

cond = data['user_id'].isin(nullColumns['user_id']) 
data.drop(data[cond].index, inplace = True)

In [20]:
# There are no null values for goal_calories after drop action
data[np.isnan(data['goal_calories'])]['user_id'].value_counts()

Series([], Name: user_id, dtype: int64)

In [21]:
# seems like there are people who have set goal_calories to zero and it should also be the mistake
# so lets drop those ones too

zeros= data[data['goal_calories']==0]['user_id'].unique()
zeros=pd.DataFrame(zeros,columns={'user_id'})

cond = data['user_id'].isin(zeros['user_id']) 
data.drop(data[cond].index, inplace = True)

In [106]:
# add new feature which shows how many days are logged by each applicants
user_logged_freq = data["user_id"].value_counts()
user_logged_df = pd.DataFrame(data["user_id"].unique(),columns = ['user_id'])
user_logged_df["logged_frequency"] = user_logged_df["user_id"].apply(lambda _id: user_logged_freq[_id])

In [107]:
# this function counts whole days between start and end date and calculates missed days for new feature
def days_missed(d1, d2,loggedDays):
    d1 = datetime.strptime(str(d1), "%Y-%m-%d")
    d2 = datetime.strptime(str(d2), "%Y-%m-%d")
    return abs(abs((d2 - d1).days)-loggedDays)

In [108]:
# this function calls days_missed regarding last and first records
def get_missed_days(df,userID,logged_frequency):
    tail = df[df["user_id"]==userID].tail(1)['date'].values[0]
    head = df[df["user_id"]==userID].head(1)['date'].values[0]
    return days_missed(tail,head,logged_frequency)   

In [111]:
# add new feature based on how many days are missed for each user
user_logged_df['days_missed'] = user_logged_df[['user_id','logged_frequency']].apply(lambda x: get_missed_days(data,x.user_id,x.logged_frequency),axis=1)

In [10]:
# There is known that for healthy eating, daily carbs should be between 45-65% out of callories, fats between 10-35% and proteins 20-35%
# So this method checks if persons daily norm is healthy distributed

def healthyDistributed(carbs,fat,protein):
    sum = fat*9+ carbs*4 + protein*4 # convert to calories (1g fat = 9 calories and etc..) and sum
    return (0.45 < (carbs*4 /(sum+0.00001)<0.65)) & (0.10 < (fat*9 /(sum+0.00001))<0.35) & (0.20 < (protein*4 /(sum+0.00001))<0.35)

In [11]:
#new column in data based on healthydistributed method to determine user's behavior
data['healtyDistrib']=data[['total_carbs','total_fat','total_protein']].apply(lambda x: healthyDistributed(x.total_carbs,x.total_fat,x.total_protein),axis=1)

In [12]:
# we need to have the same number of rows for each user so
# this function checks if number of rows for each user are greater than num_rows
# if it's greater, then this function returns the lasts rows 
# if less, then this fills them with -1

def row_padding(x,num_row):
    # get last row because I need same format and same id, others columns replaced by -1
    last_row = x.iloc[-1] 
    last_row[1:] = [-1]*len(last_row[1:])
    
    if np.shape(x)[0] < num_row:
        new_x = pd.DataFrame(x)
        for i in range(np.shape(x)[0],num_row):
            new_x = pd.DataFrame(new_x.append(last_row))
        return new_x
    else:
        return x.tail(num_row)

In [13]:
# this function flattens all rows for each user which we padded already 
# so creates one vector because we need one input for each user
def flatten_rows(x,cols):
    for i in range(1,x.shape[0]):
        for j in range(1,len(cols)):
            temp_row = x.iloc[i]
            x[cols[j]+"_"+str(i)] = temp_row[j]
    return x.head(1)

In [14]:
data = data.groupby('user_id').apply(row_padding,5).reset_index(drop=True)

In [None]:
cols = data.columns
data = data.groupby('user_id').apply(flatten_rows,cols).reset_index(drop=True)

In [None]:
data.head()

# update dataframe

In [1]:
data.to_csv(r'myFitnessPal_parsed.csv',index = False)

NameError: name 'data' is not defined