In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm


# Show every column and every row when a DataFrame is displayed.
# The fully qualified "display.*" keys are required: set_option treats its key as a
# pattern, and the short forms "max_columns"/"max_rows" also match the
# "styler.render.*" options on newer pandas, which raises OptionError.
pd.set_option("display.max_columns", None)

pd.set_option("display.max_rows", None)

In [None]:
# Load the data
# Separate time into date and time
# NOTE(review): absolute path (looks like a Colab upload location) — adjust when
# running the notebook elsewhere.
DATA_PATH = "/content/dataset_mood_smartphone.csv"

df = pd.read_csv(DATA_PATH, engine="python")
# "Unnamed: 0" is presumably a row index saved with the CSV — TODO confirm.
df = df.drop(columns="Unnamed: 0")

# Parse the timestamp column once and derive both parts from the same result
# instead of running pd.to_datetime over the whole column twice.
timestamps = pd.to_datetime(df["time"])
df["Date"] = timestamps.dt.date
df["Time"] = timestamps.dt.time

In [None]:
# Total number of rows in the raw long-format table
df.shape[0]

In [None]:
# Date range covered by the data: latest date first, then earliest
dates = df["Date"]
for boundary in (dates.max(), dates.min()):
    print(boundary)

In [None]:
# Reshape to one row per (id, Date) with one column per variable.
# pivot_table's default aggregation (the mean) combines multiple values that share
# the same id, Date and variable.
res = pd.pivot_table(
    df,
    values="value",
    index=["id", "Date"],
    columns="variable",
).reset_index()

In [None]:
# Count missing values in each column of the pivoted table
res.isnull().sum()

In [None]:
# Fill missing values in the feature columns with k-nearest-neighbour imputation
from sklearn.impute import KNNImputer

# Columns to impute, listed once. "mood" is not in this list, so it is left as-is.
feature_columns = [
    "activity",
    "appCat.builtin",
    "appCat.communication",
    "appCat.entertainment",
    "appCat.finance",
    "appCat.game",
    "appCat.office",
    "appCat.other",
    "appCat.social",
    "appCat.travel",
    "appCat.unknown",
    "appCat.utilities",
    "appCat.weather",
    "call",
    "circumplex.arousal",
    "circumplex.valence",
    "screen",
    "sms",
]

# NOTE(review): the features are imputed on their raw scales; check whether they
# should be standardised before the neighbour distances are computed.
knn_imputer = KNNImputer(n_neighbors=10)
res[feature_columns] = knn_imputer.fit_transform(res[feature_columns])

# Remove every row that still contains a missing value in any column
res = res.dropna()

In [None]:
# check data dimension: (number of rows, number of columns) of `res`
res.shape

In [None]:
# Preview the first five rows (head() defaults to 5)
res.head()

In [None]:
# Number of rows per id, showing the 33 largest counts
rows_per_id = res["id"].value_counts()
rows_per_id.nlargest(33)

# DATA EXPLORATION

Explore the data for the participant (id) with the largest number of observations.


In [None]:
# Time-series line plot of every feature and the label (mood) for id AS14.26
res_plot = res[res["id"] == "AS14.26"].drop(["id"], axis=1).set_index("Date")

# Keep only the non-object (numeric) columns. Filtering rows does not change a
# column's dtype, so checking res_plot gives the same answer as checking res.
numeric_features = [
    column for column in res_plot.columns if res_plot[column].dtype != "object"
]

# One wide figure per column, titled with the column name
for column in numeric_features:
    fig, ax = plt.subplots(figsize=(20, 5))
    ax.set_title(column)
    sns.lineplot(data=res_plot[column], ax=ax)

In [None]:
# Plot each participant's mood together with its moving average.
# The window is 7 rows (not 7 calendar days), so the first six values of
# "moving_average" are NaN for every participant.
for participant_id in res["id"].unique():
    # assign() returns a new frame, so no column is written onto a filtered slice
    # of `res` (the original chained assignment raised SettingWithCopyWarning).
    res_ma = res[res["id"] == participant_id].assign(
        moving_average=lambda frame: frame["mood"].rolling(7).mean()
    )
    fig = px.line(
        res_ma,
        x="Date",
        y=["mood", "moving_average"],
        title=participant_id,
        template="plotly_dark",
    )
    fig.show()

In [None]:
# Augmented Dickey-Fuller stationarity test on the mood series of id AS14.26
from pandas import read_csv
from statsmodels.tsa.stattools import adfuller

data_adf = res.loc[res["id"] == "AS14.26", "mood"].values
result = adfuller(data_adf)

# Positions used from the adfuller result: 0 = test statistic, 1 = p-value,
# 4 = mapping of critical values
adf_statistic, p_value, critical_values = result[0], result[1], result[4]

print("ADF Statistic: %f" % adf_statistic)
print("p-value: %f" % p_value)
print("Critical Values:")
for level, threshold in critical_values.items():
    print("\t%s: %.3f" % (level, threshold))

In [None]:
# Repeat the ADF test on the natural log of the same mood series
from numpy import log

mood_as14_26 = res[res["id"] == "AS14.26"]["mood"]
# NOTE(review): log() needs strictly positive input — assumes mood is always > 0.
data_adf = log(mood_as14_26.values)
result = adfuller(data_adf)

# First two entries are the test statistic and the p-value; entry 4 holds the
# critical values
statistic, p_value, *_ = result
critical_values = result[4]

print("ADF Statistic: %f" % statistic)
print("p-value: %f" % p_value)
print("Critical Values:")
for level, threshold in critical_values.items():
    print("\t%s: %.3f" % (level, threshold))

In [None]:
# List the column labels of `res`
res.columns

In [None]:
# Per-date mean of every feature column, taken over all rows of `res` for that date.
# NOTE(review): despite the name `avg_mood`, "mood" is not among the averaged
# columns — confirm this is intended.
feature_columns = [
    "activity",
    "appCat.builtin",
    "appCat.communication",
    "appCat.entertainment",
    "appCat.finance",
    "appCat.game",
    "appCat.office",
    "appCat.other",
    "appCat.social",
    "appCat.travel",
    "appCat.unknown",
    "appCat.utilities",
    "appCat.weather",
    "call",
    "circumplex.arousal",
    "circumplex.valence",
    "screen",
    "sms",
]

# Select the grouped columns with a list: indexing a GroupBy with a bare tuple of
# names (groupby(...)["a", "b"]) is rejected by pandas 2.0 and later.
avg_mood = res.groupby("Date")[feature_columns].mean().reset_index()

In [None]:
# Run AutoViz on the per-date averages held in `avg_mood`
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()
# The first positional argument is an empty string; the data is passed as a
# DataFrame through `dfte`.
AV.AutoViz("", dfte=avg_mood)

In [None]:
# Run AutoViz on the rows of id AS14.26, passing "mood" as depVar
AV = AutoViz_Class()
participant_rows = res[res["id"] == "AS14.26"]
AV.AutoViz("", dfte=participant_rows, depVar="mood")

# TESTS FOR TIME SERIES CORRELATIONS/SIMILARITIES FOR MOODS AMONG ALL PARTICIPANTS

*   The output will help decide whether a single model is suitable for all of the time series



In [None]:
# Conduct ANOVA tests to check for similarities among the means for the different moods
import scipy.stats as stats

# Participant columns used for both the imputation and the test. The list is written
# once so every participant enters the ANOVA exactly once (the earlier hand-written
# argument list passed "AS14.12" twice).
participant_ids = [
    "AS14.01", "AS14.02", "AS14.03", "AS14.05", "AS14.06", "AS14.07",
    "AS14.08", "AS14.09", "AS14.12", "AS14.13", "AS14.14", "AS14.15",
    "AS14.16", "AS14.17", "AS14.19", "AS14.20", "AS14.23", "AS14.24",
    "AS14.25", "AS14.26", "AS14.27", "AS14.28", "AS14.29", "AS14.30",
    "AS14.31", "AS14.32", "AS14.33",
]

# Wide table: one row per Date, one column per id, values are mood.
# reset_index(drop=True) discards the Date index, leaving only participant columns.
res_corr = res[["Date", "id", "mood"]].pivot(
    index="Date", columns="id", values="mood"
).reset_index(drop=True)

# Fill dates on which a participant has no mood value, reusing the imputer object
# created earlier in the notebook (fit_transform refits it on this table).
# NOTE(review): the ANOVA below therefore runs on partly imputed values.
res_corr[participant_ids] = knn_imputer.fit_transform(res_corr[participant_ids])

# stats f_oneway functions takes the groups as input and returns ANOVA F and p value
fvalue, pvalue = stats.f_oneway(*(res_corr[pid] for pid in participant_ids))
print(fvalue, pvalue)

In [None]:
# Rebuild the wide mood table (one row per Date, one column per id), this time
# keeping "Date" as a regular column, and impute the gaps per participant column.
participant_ids = [
    "AS14.01", "AS14.02", "AS14.03", "AS14.05", "AS14.06", "AS14.07",
    "AS14.08", "AS14.09", "AS14.12", "AS14.13", "AS14.14", "AS14.15",
    "AS14.16", "AS14.17", "AS14.19", "AS14.20", "AS14.23", "AS14.24",
    "AS14.25", "AS14.26", "AS14.27", "AS14.28", "AS14.29", "AS14.30",
    "AS14.31", "AS14.32", "AS14.33",
]

mood_long = res[["Date", "id", "mood"]]
res_corr = mood_long.pivot(index="Date", columns="id", values="mood").reset_index()

# Reuses the imputer object created earlier; fit_transform refits it on this table
res_corr[participant_ids] = knn_imputer.fit_transform(res_corr[participant_ids])
res_corr.head(1)

In [None]:
# Mean mood per Date across all ids, drawn as a single line.
# Note: this rebinds `avg_mood`, which an earlier cell used for the per-date
# feature averages.
avg_mood = res.groupby("Date")["mood"].mean().reset_index()

fig = px.line(
    avg_mood,
    x="Date",
    y=["mood"],
    title="Daily Average Mood Movement Across All Users",
    template="presentation",
)
fig.show()

In [None]:
# Pearson Correlation
import scipy.stats as stats

# Correlate the participant mood columns only. `res_corr` may still carry the
# object-dtype "Date" column; pandas >= 2.0 no longer skips non-numeric columns in
# DataFrame.corr() and raises instead. errors="ignore" keeps the cell working when
# "Date" has already been dropped.
mood_by_participant = res_corr.drop(columns="Date", errors="ignore")
pairwise_r = mood_by_participant.corr()

# Average the entries strictly above the diagonal: each participant pair is counted
# once and the self-correlations on the diagonal are excluded.
overall_pearson_r = pairwise_r.values[
    np.triu_indices_from(pairwise_r.values, 1)
].mean()
print(f"Pandas computed Pearson r: {overall_pearson_r}")

In [None]:
# Plot the centred 30-row rolling median of each participant's mood; the title
# reports the overall Pearson r computed earlier in the notebook.
f, ax = plt.subplots(figsize=(50, 30))

# Drop the object-dtype "Date" column first: pandas >= 2.0 raises on non-numeric
# columns in rolling aggregations instead of silently skipping them.
mood_by_participant = res_corr.drop(columns="Date", errors="ignore")
mood_by_participant.rolling(window=30, center=True).median().plot(ax=ax)

# The y-axis shows smoothed mood values, not correlation coefficients, so it is
# labelled as such (it was previously labelled "Pearson r").
ax.set(xlabel="Time", ylabel="Mood (30-row rolling median)")
ax.set(title=f"Overall Pearson r = {np.round(overall_pearson_r,2)}");