In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
from datetime import timedelta, datetime

# Feature engineering

In [2]:
column_names = [
        "mood",
        "circumplex.arousal",
        "circumplex.valence",
        "activity",
        "screen",
        "call",
        "sms",
        "appCat.builtin",
        "appCat.communication",
        "appCat.entertainment",
        "appCat.finance",
        "appCat.game",
        "appCat.office",
        "appCat.other",
        "appCat.social",
        "appCat.travel",
        "appCat.unknown",
        "appCat.utilities",
        "appCat.weather",
        "target_mood"
]


def prepare_dataframe():
    return pd.DataFrame(columns=column_names)


# Time values should be added. Opposed to scores that should be averaged.
def is_duration(column_name):
    return column_name in ["screen", "appCat.builtin", "appCat.communication",
                            "appCat.entertainment", "appCat.finance", "appCat.game",
                            "appCat.office", "appCat.other", "appCat.social", "appCat.travel", "appCat.unknown",
                            "appCat.utilities", "appCat.weather", "call", "sms"]



# This makes sure that we will be fine if we try to get a group that does not exis
def safe_get_group(grouped, key):
    try:
        return grouped.get_group(key)
    except KeyError:
        return pd.DataFrame(columns=grouped.obj.columns)

In [3]:
df = pd.read_csv("../../datasets/dataset_mood_smartphone.csv")
# We have to turn time into date time to be able to work with it
df['time']= pd.to_datetime(df['time'])
# We don't care about exact times
df['time'] = df['time'].dt.date


# We will look into what each user did in what time individually
grouped_df = df.groupby(['id', 'time'])

In [4]:
# In a window, we will look into what the user did and extract predictors

def get_aggregated_values(user_id, curr_date, window):

    
    # Get the dates in the frame
    dates = [curr_date - pd.DateOffset(days=x + 1) for x in range(window)]

    # If user did not do something in the frame, consider it 0
    result = {key: 0 for key in column_names}
    
    for date in dates:

        # Get the activities the user did
        group = safe_get_group(grouped_df, (user_id, pd.to_datetime(date).date()))


        # For each possible activity
        for key in column_names:
            
            # If the user did it
            if key in group['variable'].values:

                
                # Get the values for user
                values = group[group['variable'] == key]['value']

                
                # Add it if it should be added
                if is_duration(key):
                    result[key] = values.sum()

                # Average it if it needs to be averages
                else:
                    result[key] = values.mean() if len(values) > 0 else None

    # Return the features
    return result
    

In [5]:
# See what was the average mood of the user in the day
# TODO: Make it categorical by rounding



def get_mood(group, is_categorical):

    mood_records = group[group['variable'] == 'mood']

    mean_mood_date = mood_records['value'].mean()

    return round(mean_mood_date) if is_categorical else mean_mood_date

In [6]:
date_window = 5
categorical = False

# Get empty dataframe
data = prepare_dataframe()


for ((user_id, date), group) in grouped_df:
    
    # For each user in each day
    
    for _, row in group.iterrows():

        
        # If the user had logged mood in a date at least once
        if row['variable'] == 'mood':

            # Get the average of that days moods for that user
            # Consider this the label
                
            date_mood = get_mood(group, categorical)

            # Get aggregated values for the rest of the activities of user in the window.
            # Consider this the features

            result = get_aggregated_values(user_id, date, date_window)
            
            # Put the user and target label values
            # result['user'] = user_id
            result['target_mood'] = date_mood

            new_row = pd.DataFrame([result])

            # Add the created record to the data
            data = pd.concat([data, new_row], ignore_index=True)


            # No need to look into that day anymore
            break


data.to_csv(f"extracted_data/extracted_features_{'classification' if categorical else 'regression'}.csv", index=False)

  data = pd.concat([data, new_row], ignore_index=True)


In [7]:
print("Done")

Done
