In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from statsmodels.graphics.tsaplots import plot_acf
from scipy import stats

# Read in the data
df = pd.read_csv('Datasets/dataset_mood_smartphone.csv')

In [88]:
df['date'] = pd.to_datetime(df['time']).dt.date
df['time'] = pd.to_datetime(df['time']).dt.time

grouped_df = df.groupby(['id', 'date', 'time', 'variable']).sum().reset_index()

#create dataframe of values per id per date per time
df = grouped_df.pivot_table(index=['id', 'date', 'time'], columns='variable', values='value')

In [89]:
# Remove instances that are not in the range of the depicted column
# ATTENTION should be performed before merging the data!!
def range_removal(lower, upper, column_name, keep_nan=True):

    column = df[column_name]

    if keep_nan:
        # If we want to keep the NaN values use this
        filtered_df = df[column.between(lower, upper) | column.isna()]
    else:
        # If we want to remove the NaN values use this
        filtered_df = df[(column >= lower) & (column <= upper)]

    return filtered_df

df = range_removal(1, 10, 'mood')
df = range_removal(-2, 2, 'circumplex.arousal')
df = range_removal(-2, 2, 'circumplex.valence')
df = range_removal(0, 1, 'activity')

In [90]:
# Remove the negative instances for the applications
def remove_negative_values(columns, df):
    for column in columns:
        # Keep the NaN values and replace the negative values with 0
        df[column] = df.loc[~(df[column] < 0), column]

    return df

df = remove_negative_values(['appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'screen', 'call', 'sms'], df)


In [90]:
# For each column remove outliers that are more than 3 standard deviations away from the mean
def remove_outliers(df):
    for column in df.columns:
        # Keep the NaN values and replace the outliers with 0
        df[column] = df.loc[~(np.abs(stats.zscore(df[column])) > 3), column]

    return df

df = remove_outliers(df)
df


In [76]:
# remove outliers from the dataframe
def remove_outliers(dataframe):
    for column in dataframe.columns:
        dataframe['z_score'] = np.abs((dataframe[column] - dataframe[column].mean()) / dataframe[column].std())
        outliers = dataframe[dataframe['z_score'] > 3]
        print(outliers)
        dataframe.drop(outliers.index, inplace=True)
        dataframe.drop('z_score', axis=1, inplace=True)

df = remove_outliers(df)
df


variable                     activity  appCat.builtin  appCat.communication  \
id      date       time                                                       
AS14.01 2014-03-21 14:00:00  0.725000             NaN                   NaN   
                   15:00:00  0.798387             NaN                   NaN   
        2014-03-26 18:00:00  0.822034             NaN                   NaN   
        2014-04-01 11:00:00  0.785124             NaN                   NaN   
        2014-04-26 12:00:00  0.958333             NaN                   NaN   
...                               ...             ...                   ...   
AS14.33 2014-05-13 20:00:00  1.000000             NaN                   NaN   
        2014-05-25 03:00:00  0.697479             NaN                   NaN   
                   04:00:00  0.775000             NaN                   NaN   
                   06:00:00  0.983333             NaN                   NaN   
        2014-05-30 14:00:00  0.731092             Na

KeyError: 'z_score'

In [82]:
# calculate the z-score for each data point
df['z_score'] = np.abs((df['appCat.unknown'] - df['appCat.unknown'].mean()) / df['appCat.unknown'].std())

# identify any data points with a z-score greater than 3 (a common threshold for identifying outliers)
outliers = df[df['z_score'] > 3]
outliers

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,circumplex.arousal,circumplex.valence,mood,screen,sms,z_score
id,date,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AS14.03,2014-04-10,22:18:19.752000,,,,,,,,,,,2239.937,,,,,,,,,18.378363
AS14.07,2014-04-23,23:22:22.224000,,,,,,,,,,,412.628,,,,,,,,,3.07432
AS14.09,2014-04-01,17:35:46.552000,,,,,,,,,,,620.648,,,,,,,,,4.816525
AS14.20,2014-03-20,22:31:20.286000,,,,,,,,,,,545.854,,,,,,,,,4.190111
AS14.20,2014-03-22,14:19:23.934000,,,,,,,,,,,982.955,,,,,,,,,7.850911
AS14.20,2014-03-24,21:53:34.308000,,,,,,,,,,,954.654,,,,,,,,,7.613885
AS14.20,2014-03-30,10:41:11.243000,,,,,,,,,,,446.21,,,,,,,,,3.355575
AS14.23,2014-03-22,20:37:11.877000,,,,,,,,,,,429.499,,,,,,,,,3.215617
AS14.23,2014-03-22,20:46:31.323000,,,,,,,,,,,753.676,,,,,,,,,5.930658
AS14.23,2014-03-27,09:48:49.051000,,,,,,,,,,,697.351,,,,,,,,,5.458926


In [93]:
column_name = 'appCat.unknown'

Q1 = df[column_name].quantile(0.25)
Q3 = df[column_name].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(lower_bound)

outliers = df[df[column_name] > upper_bound]

#outliers = df[(df[column_name] < lower_bound) | (df[column_name] > upper_bound)]
outliers

-54.10074999999999


Unnamed: 0_level_0,Unnamed: 1_level_0,variable,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,circumplex.arousal,circumplex.valence,mood,screen,sms
id,date,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AS14.01,2014-03-25,19:22:27.391000,,,,,,,,,,,155.677,,,,,,,,
AS14.03,2014-04-04,23:29:28.384000,,,,,,,,,,,315.509,,,,,,,,
AS14.03,2014-04-10,22:18:19.752000,,,,,,,,,,,2239.937,,,,,,,,
AS14.03,2014-04-20,15:23:17.914000,,,,,,,,,,,212.949,,,,,,,,
AS14.03,2014-05-05,22:30:32.934000,,,,,,,,,,,140.589,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AS14.30,2014-04-21,09:39:25.503000,,,,,,,,,,,1273.763,,,,,,,,
AS14.30,2014-04-21,22:11:00.549000,,,,,,,,,,,125.554,,,,,,,,
AS14.30,2014-04-22,11:24:09.240000,,,,,,,,,,,504.445,,,,,,,,
AS14.30,2014-04-26,01:25:01.706000,,,,,,,,,,,151.718,,,,,,,,


In [65]:
# Apply different aggregation functions to different variables
agg_dict = {'mood': 'mean', 'circumplex.arousal': 'mean', 'circumplex.valence': 'mean', 'activity': 'mean', 'screen': 'sum', 'call': 'sum', 'sms': 'sum', 'appCat.builtin': 'sum', 'appCat.communication': 'sum', 'appCat.entertainment': 'sum', 'appCat.finance': 'sum', 'appCat.game': 'sum', 'appCat.office': 'sum', 'appCat.other': 'sum', 'appCat.social': 'sum', 'appCat.travel': 'sum', 'appCat.unknown': 'sum', 'appCat.weather': 'sum', 'appCat.utilities': 'sum'}

# Create df where some of the columns values are summed and for some the mean is taken for each day
df = df.groupby(['date', 'id']).agg(agg_dict).reset_index()
df

variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
0,2014-02-17,AS14.01,,,,,0.000000,2.0,0.0,0.000,...,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.0,0.000
1,2014-02-17,AS14.02,,,,,0.000000,4.0,2.0,0.000,...,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.0,0.000
2,2014-02-17,AS14.03,,,,,0.000000,0.0,1.0,0.000,...,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.0,0.000
3,2014-02-17,AS14.06,,,,,0.000000,5.0,0.0,0.000,...,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.0,0.000
4,2014-02-17,AS14.08,,,,,0.000000,0.0,4.0,0.000,...,0.0,0.0,0.0,0.0,0.000,0.000,0.000,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,AS14.24,7.0,0.0,0.80,0.152343,2371.588999,11.0,0.0,1749.277,...,0.0,0.0,0.0,0.0,47.360,171.964,225.737,0.0,0.0,0.000
1969,2014-06-06,AS14.24,7.5,0.5,0.25,0.189476,2733.332000,16.0,0.0,945.898,...,0.0,0.0,0.0,0.0,36.414,0.000,0.000,0.0,0.0,2.072
1970,2014-06-07,AS14.24,8.0,-1.0,0.00,0.081893,5530.377999,0.0,2.0,5027.498,...,0.0,0.0,0.0,0.0,6.214,106.850,0.000,0.0,0.0,0.000
1971,2014-06-08,AS14.24,6.5,-1.5,1.00,0.079510,1131.197999,0.0,0.0,351.389,...,0.0,0.0,0.0,0.0,18.069,190.760,0.000,0.0,0.0,0.000


In [67]:
# Function that removes the starting nan values
def remove_starting_nan_until_n_values(column_name, dataframe, n_param):
    '''
    params: 
        colmun_name -> takes the name of the column as a string
        dataframe -> takes the dataframe object
        n_params -> int that takes the number of real float values before the NaN are removed

    This function removes the starting NaN values before an amount of n float values are detected
    Therefore removing all the instances before any real measurements are performed
    '''

    verbose = False

    unique_users = dataframe['id'].unique()
    new_df = pd.DataFrame()

    for user_i in unique_users:
        df_for_user_i = dataframe[dataframe['id'] == user_i]
        count_nan = 0
        count_float = 0
        count_final = 0

        for value in df_for_user_i[column_name]:
            print('-'*50, '\n', value) if verbose else None
            if math.isnan(value):
                print('NAN', value, type(value), isinstance(value, float)) if verbose else None
                count_nan += 1
                count_float = 0
            if isinstance(value, float):
                print("FLOAT",value, type(value), isinstance(value, float)) if verbose else None
                count_float += 1
            if count_float > n_param:
                n = count_nan 
                added_df = df_for_user_i.iloc[n:]
                added_df
                new_df = new_df.append(added_df)
                break

    return new_df
    
df = remove_starting_nan_until_n_values('mood', df, 4)
        
num_nan_values = df['mood'].isna().sum()
print(num_nan_values)
df


41


variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
645,2014-03-21,AS14.01,6.20,0.2,0.20,0.134050,17978.907000,6.0,0.0,3139.218,...,1007.456,49.544,0.0,172.206,239.751,4508.500,915.445,0.000,0.000,598.754
670,2014-03-22,AS14.01,6.40,0.6,0.40,0.236880,6142.161000,3.0,1.0,731.429,...,93.324,21.076,0.0,0.000,98.143,439.632,37.305,0.000,0.000,117.621
691,2014-03-23,AS14.01,6.80,0.2,0.80,0.142741,6773.832001,0.0,0.0,1286.246,...,94.346,43.403,0.0,0.000,72.823,900.839,0.000,0.000,30.386,30.086
714,2014-03-24,AS14.01,6.00,0.8,0.00,0.078961,15047.351001,10.0,0.0,866.956,...,976.971,34.106,0.0,3.010,66.558,3223.626,419.805,0.000,0.000,178.732
739,2014-03-25,AS14.01,6.75,0.5,0.50,0.098374,21475.354999,0.0,1.0,1032.768,...,68.206,43.054,0.0,0.000,178.819,1919.471,0.000,235.223,0.000,222.893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,AS14.24,7.00,0.0,0.80,0.152343,2371.588999,11.0,0.0,1749.277,...,0.000,0.000,0.0,0.000,47.360,171.964,225.737,0.000,0.000,0.000
1969,2014-06-06,AS14.24,7.50,0.5,0.25,0.189476,2733.332000,16.0,0.0,945.898,...,0.000,0.000,0.0,0.000,36.414,0.000,0.000,0.000,0.000,2.072
1970,2014-06-07,AS14.24,8.00,-1.0,0.00,0.081893,5530.377999,0.0,2.0,5027.498,...,0.000,0.000,0.0,0.000,6.214,106.850,0.000,0.000,0.000,0.000
1971,2014-06-08,AS14.24,6.50,-1.5,1.00,0.079510,1131.197999,0.0,0.0,351.389,...,0.000,0.000,0.0,0.000,18.069,190.760,0.000,0.000,0.000,0.000


In [68]:
# select the dataframe for id = 'AS14.33'
dummy_df = df[df['id'] == 'AS14.33']
dummy_df

variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
1331,2014-04-16,AS14.33,7.0,0.333333,0.333333,0.095028,4944.646001,5.0,1.0,765.403,...,296.785,0.0,0.0,0.0,107.616,4201.894,0.0,27.135,0.0,164.932
1358,2014-04-17,AS14.33,7.2,-0.6,1.4,0.072172,18855.781002,2.0,0.0,2442.832,...,920.187,0.0,0.0,0.0,152.834,6420.129,0.0,0.0,0.0,283.445
1385,2014-04-18,AS14.33,7.4,-0.8,0.8,0.036852,11030.036,3.0,1.0,1555.364,...,675.412,0.0,0.0,0.0,70.61,6281.793,0.0,0.0,0.0,102.369
1412,2014-04-19,AS14.33,7.666667,0.0,1.0,0.259306,10002.624,2.0,0.0,2879.461,...,438.444,0.0,0.0,0.0,55.455,3960.064,0.0,0.0,0.0,83.939
1439,2014-04-20,AS14.33,7.4,0.0,1.2,0.030758,10896.482999,0.0,0.0,669.192,...,1526.508,0.0,0.0,0.0,60.848,7490.158,0.0,0.0,0.0,3.01
1466,2014-04-21,AS14.33,7.2,0.0,0.6,0.039461,10135.543,0.0,0.0,621.054,...,962.34,0.0,0.0,0.0,63.275,9143.261,3.019,0.0,0.0,4.208
1493,2014-04-22,AS14.33,6.2,-0.6,-0.2,0.085497,20483.757999,7.0,0.0,3783.64,...,1181.991,0.0,0.0,0.0,78.571,7711.711,52.436,0.0,0.0,90.132
1520,2014-04-23,AS14.33,5.2,-0.2,-0.6,0.007715,6824.43,1.0,0.0,865.25,...,1206.39,0.0,0.0,0.0,68.263,4779.349,0.0,0.0,0.0,0.0
1547,2014-04-24,AS14.33,7.0,-0.25,1.0,0.134273,8703.779,5.0,3.0,3016.506,...,775.334,0.0,0.0,0.0,67.548,3687.524,50.712,0.0,0.0,90.989
1574,2014-04-25,AS14.33,8.4,0.8,0.8,0.14645,9947.085998,31.0,2.0,4057.256,...,538.707,0.0,0.0,0.0,52.192,2758.26,36.169,0.0,0.0,0.0


In [69]:
# Create a new column that contains the mean of the previous and next value
# NOTE can still change to take mean of previous n and next n values
def mean_between(column_name, dataframe):

    # sort the dataframe by id
    dataframe = dataframe.sort_values(by='date')

    # get the previous and next values
    dataframe['prev_value'] = dataframe.groupby('id')[column_name].shift(1)
    dataframe['next_value'] = dataframe.groupby('id')[column_name].shift(-1)

    # take the mean of previous and next values
    dataframe['mean_value'] = dataframe[['prev_value', 'next_value']].mean(axis=1)

    for index, row in dataframe.iterrows():
        if pd.isna(row[column_name]):
            dataframe.loc[index, column_name] = row['mean_value']

    dataframe.drop('mean_value', axis=1, inplace=True)
    dataframe.drop('prev_value', axis=1, inplace=True)
    dataframe.drop('next_value', axis=1, inplace=True)

    return dataframe

df = mean_between('mood', df)

# dummy_df = df[df['id'] == 'AS14.33']
# dummy_df

variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities,prev_value,next_value,mean_value
1331,2014-04-16,AS14.33,7.0,0.333333,0.333333,0.095028,4944.646001,5.0,1.0,765.403,...,0.0,107.616,4201.894,0.0,27.135,0.0,164.932,,7.2,7.2
1358,2014-04-17,AS14.33,7.2,-0.6,1.4,0.072172,18855.781002,2.0,0.0,2442.832,...,0.0,152.834,6420.129,0.0,0.0,0.0,283.445,7.0,7.4,7.2
1385,2014-04-18,AS14.33,7.4,-0.8,0.8,0.036852,11030.036,3.0,1.0,1555.364,...,0.0,70.61,6281.793,0.0,0.0,0.0,102.369,7.2,7.666667,7.433333
1412,2014-04-19,AS14.33,7.666667,0.0,1.0,0.259306,10002.624,2.0,0.0,2879.461,...,0.0,55.455,3960.064,0.0,0.0,0.0,83.939,7.4,7.4,7.4
1439,2014-04-20,AS14.33,7.4,0.0,1.2,0.030758,10896.482999,0.0,0.0,669.192,...,0.0,60.848,7490.158,0.0,0.0,0.0,3.01,7.666667,7.2,7.433333
1466,2014-04-21,AS14.33,7.2,0.0,0.6,0.039461,10135.543,0.0,0.0,621.054,...,0.0,63.275,9143.261,3.019,0.0,0.0,4.208,7.4,6.2,6.8
1493,2014-04-22,AS14.33,6.2,-0.6,-0.2,0.085497,20483.757999,7.0,0.0,3783.64,...,0.0,78.571,7711.711,52.436,0.0,0.0,90.132,7.2,5.2,6.2
1520,2014-04-23,AS14.33,5.2,-0.2,-0.6,0.007715,6824.43,1.0,0.0,865.25,...,0.0,68.263,4779.349,0.0,0.0,0.0,0.0,6.2,7.0,6.6
1547,2014-04-24,AS14.33,7.0,-0.25,1.0,0.134273,8703.779,5.0,3.0,3016.506,...,0.0,67.548,3687.524,50.712,0.0,0.0,90.989,5.2,8.4,6.8
1574,2014-04-25,AS14.33,8.4,0.8,0.8,0.14645,9947.085998,31.0,2.0,4057.256,...,0.0,52.192,2758.26,36.169,0.0,0.0,0.0,7.0,7.6,7.3


In [28]:
dummy_df = df[df['id'] == 'AS14.33']
dummy_df

variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
1331,2014-04-16,AS14.33,7.0,0.333333,0.333333,0.095028,4944.646001,5.0,1.0,765.403,...,296.785,0.0,0.0,0.0,107.616,4201.894,0.0,27.135,0.0,164.932
1358,2014-04-17,AS14.33,7.2,-0.6,1.4,0.072172,18855.781002,2.0,0.0,2442.832,...,920.187,0.0,0.0,0.0,152.834,6420.129,0.0,0.0,0.0,283.445
1385,2014-04-18,AS14.33,7.4,-0.8,0.8,0.036852,11030.036,3.0,1.0,1555.364,...,675.412,0.0,0.0,0.0,70.61,6281.793,0.0,0.0,0.0,102.369
1412,2014-04-19,AS14.33,7.666667,0.0,1.0,0.259306,10002.624,2.0,0.0,2879.461,...,438.444,0.0,0.0,0.0,55.455,3960.064,0.0,0.0,0.0,83.939
1439,2014-04-20,AS14.33,7.4,0.0,1.2,0.030758,10896.482999,0.0,0.0,669.192,...,1526.508,0.0,0.0,0.0,60.848,7490.158,0.0,0.0,0.0,3.01
1466,2014-04-21,AS14.33,7.2,0.0,0.6,0.039461,10135.543,0.0,0.0,621.054,...,962.34,0.0,0.0,0.0,63.275,9143.261,3.019,0.0,0.0,4.208
1493,2014-04-22,AS14.33,6.2,-0.6,-0.2,0.085497,20483.757999,7.0,0.0,3783.64,...,1181.991,0.0,0.0,0.0,78.571,7711.711,52.436,0.0,0.0,90.132
1520,2014-04-23,AS14.33,5.2,-0.2,-0.6,0.007715,6824.43,1.0,0.0,865.25,...,1206.39,0.0,0.0,0.0,68.263,4779.349,0.0,0.0,0.0,0.0
1547,2014-04-24,AS14.33,7.0,-0.25,1.0,0.134273,8703.779,5.0,3.0,3016.506,...,775.334,0.0,0.0,0.0,67.548,3687.524,50.712,0.0,0.0,90.989
1574,2014-04-25,AS14.33,8.4,0.8,0.8,0.14645,9947.085998,31.0,2.0,4057.256,...,538.707,0.0,0.0,0.0,52.192,2758.26,36.169,0.0,0.0,0.0


In [8]:
def MA_on_missing_values(dataframe, column, n):
    '''
    Performs moving averages on a column for each distinct 'id' in the 'mood' column,
    and creates a new column where NaN values are substituted
    '''
    # Group by 'id' column
    groups = dataframe.groupby('id')

    # Perform moving averages for each group
    dataframe['MA'] = groups[column].ewm(span=n).mean().reset_index(0, drop=True)

    for index, row in dataframe.iterrows():
        if pd.isna(row[column]):
            dataframe.loc[index, column] = row['MA']

    dataframe.drop('MA', axis=1, inplace=True)
    
    return dataframe

selected_df = MA_on_missing_values(df, 'mood', 10)
selected_df = MA_on_missing_values(df, 'circumplex.arousal', 10)
selected_df = MA_on_missing_values(df, 'circumplex.valence', 10)
selected_df = MA_on_missing_values(df, 'activity', 10)

selected_df


variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
645,2014-03-21,AS14.01,6.20000,0.200000,0.200000,0.134050,17978.907000,6.0,0.0,3139.218,...,1007.456,49.544,0.0,172.206,239.751,4508.500,915.445,0.000,0.000,598.754
670,2014-03-22,AS14.01,6.40000,0.600000,0.400000,0.236880,6142.161000,3.0,1.0,731.429,...,93.324,21.076,0.0,0.000,98.143,439.632,37.305,0.000,0.000,117.621
691,2014-03-23,AS14.01,6.80000,0.200000,0.800000,0.142741,6773.832001,0.0,0.0,1286.246,...,94.346,43.403,0.0,0.000,72.823,900.839,0.000,0.000,30.386,30.086
714,2014-03-24,AS14.01,6.00000,0.800000,0.000000,0.078961,15047.351001,10.0,0.0,866.956,...,976.971,34.106,0.0,3.010,66.558,3223.626,419.805,0.000,0.000,178.732
739,2014-03-25,AS14.01,6.75000,0.500000,0.500000,0.098374,21475.354999,0.0,1.0,1032.768,...,68.206,43.054,0.0,0.000,178.819,1919.471,0.000,235.223,0.000,222.893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,AS14.24,7.00000,0.000000,0.800000,0.152343,2371.588999,11.0,0.0,1749.277,...,0.000,0.000,0.0,0.000,47.360,171.964,225.737,0.000,0.000,0.000
1969,2014-06-06,AS14.24,7.50000,0.500000,0.250000,0.189476,2733.332000,16.0,0.0,945.898,...,0.000,0.000,0.0,0.000,36.414,0.000,0.000,0.000,0.000,2.072
1970,2014-06-07,AS14.24,8.00000,-1.000000,0.000000,0.081893,5530.377999,0.0,2.0,5027.498,...,0.000,0.000,0.0,0.000,6.214,106.850,0.000,0.000,0.000,0.000
1971,2014-06-08,AS14.24,6.50000,-1.500000,1.000000,0.079510,1131.197999,0.0,0.0,351.389,...,0.000,0.000,0.0,0.000,18.069,190.760,0.000,0.000,0.000,0.000


In [None]:
def select_imputation_technique(mean_values):

    if mean_values = True:
        df = mean_between('mood', df)
        df = mean_between('circumplex.arousal', df)
        df = mean_between('circumplex.valence', df)
        df = mean_between('activity', df)
    else:
        df = MA_on_missing_values(df, 'mood', 10)
        df = MA_on_missing_values(df, 'circumplex.arousal', 10)
        df = MA_on_missing_values(df, 'circumplex.valence', 10)
        df = MA_on_missing_values(df, 'activity', 10)

    return df

df = select_imputation_technique(True)