data_schema = {
    "time": {
        "description": "Timestamp in the format 'YYYY-MM-DD HH:MM:SS.FFF'",
        "type": "string",
        "pattern": "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}$"
    },
    "mood": {
        "description": "The mood scored by the user on a scale of 1-10",
        "type": "int",
        "range": (1, 10)
    },
    "circumplex.arousal": {
        "description": "The arousal scored by the user, on a scale between -2 to 2",
        "type": "int",
        "range": (-2, 2)
    },
    "circumplex.valence": {
        "description": "The valence scored by the user, on a scale between -2 to 2",
        "type": "int",
        "range": (-2, 2)
    },
    "activity": {
        "description": "Activity score of the user (number between 0 and 1)",
        "type": "float",
        "range": (0, 1)
    },
    "screen": {
        "description": "Duration of screen activity (time)",
        "type": "float",
        "range": None
    },
    "call": {
        "description": "Call made (indicated by a 1)",
        "type": "int",
        "range": (0, 1)
    },
    "sms": {
        "description": "SMS sent (indicated by a 1)",
        "type": "int",
        "range": (0, 1)
    },
    "appCat.builtin": {
        "description": "Duration of usage of builtin apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.communication": {
        "description": "Duration of usage of communication apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.entertainment": {
        "description": "Duration of usage of entertainment apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.finance": {
        "description": "Duration of usage of finance apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.game": {
        "description": "Duration of usage of game apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.office": {
        "description": "Duration of usage of office apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.other": {
        "description": "Duration of usage of other apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.social": {
        "description": "Duration of usage of social apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.travel": {
        "description": "Duration of usage of travel apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.unknown": {
        "description": "Duration of usage of unknown apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.utilities": {
        "description": "Duration of usage of utilities apps (time)",
        "type": "float",
        "range": None
    },
    "appCat.weather": {
        "description": "Duration of usage of weather apps (time)",
        "type": "float",
        "range": None
    }
}


import json
# 创建一个新的json对象，其中只有"time"键
new_data_schema = {"time": data_schema["time"]}

# 创建一个新的"variable"对象，其中包含除"time"之外的所有键值对
new_data_schema["variable"] = {key: value for key, value in data_schema.items() if key != "time"}

# new_data_schema现在是修改后的json
# 将new_data_schema保存为JSON
with open("../Assignment1/threshold.json", 'w') as f:
    json.dump(new_data_schema, f)

In [2]:
import pandas as pd
# 数据
data=pd.read_csv('../Assignment1//dataset_mood_smartphone.csv')
data.set_index(data.columns[0],inplace=True)
data.reset_index(inplace=True)
data.index += 1
data.head(5)

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
1,1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
2,2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
3,3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
4,4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
5,5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


In [3]:
import numpy as np
print(data['value'].iloc[5708])
print(data['value'].iloc[5708] is np.NaN)

nan
False


In [4]:
import re

# 记录不符合规定的行索引

data_schema=pd.read_json('../Assignment1//threshold.json')
# 遍历每一行
def clean_raw(data, data_schema):
    invalid_rows = []
    error_types = []
    for index, row in data.iterrows():
    # 检查时间是否符合规定
        if not re.match(data_schema['time']['pattern'], row['time']):
            invalid_rows.append(index)
            error_types.append('wrong_time')
            continue

    # 检查变量是否符合规定
        variable = row['variable']
        value = row['value']
        if variable in data_schema['variable']:
            var_schema = data_schema['variable'][variable]
            if var_schema['range'] is not None:
                min_value, max_value = var_schema['range']
                if not min_value <= value <= max_value:
                    if pd.isnull(value):
                        invalid_rows.append(index)
                        error_types.append('missing_value')
                    else:
                        invalid_rows.append(index)
                        error_types.append('out_of_range')
        else:
            invalid_rows.append(index)
            error_types.append('missing_variable')

    print(len(invalid_rows))
    print("invalid row id:", invalid_rows)
    print("error types:", error_types)


import pandas as pd
from io import StringIO

data_str = """
"","id","time","variable","value"
"5639","AS14.33","2014-99-30 19:00:00.000","mood",8
"5640","AS14.33","2014-05-30 20:00:00.000","NA",6
"5641","AS14.33","2014-05-31 12:00:00.000","mood",110
"5642","AS14.01","2014-02-26 13:00:00.000","circumplex.arousal",NA
"5643","AS14.01","206 15:00:00.000","circumplex.arousal",-1
"5644","AS14.01","2014-02-26 18:00:00.000","circumplex.arousal",2000
"""

data = pd.read_csv(StringIO(data_str), quotechar='"', skipinitialspace=True)
print(data)

In [5]:
clean_raw(data, data_schema)

202
invalid row id: [5709, 5731, 5773, 5797, 5836, 6325, 6379, 6434, 6668, 6793, 7037, 7256, 7262, 7320, 7348, 7450, 8193, 8202, 8350, 8357, 8362, 8383, 8404, 8461, 8467, 8643, 9332, 9390, 9394, 9399, 9443, 9478, 9503, 9519, 9646, 9919, 10189, 10241, 10248, 10262, 10283, 10292, 10293, 10329, 10334, 11256, 11300, 11352, 11379, 11380, 11382, 11415, 11416, 11419, 11479, 11480, 11488, 11497, 11968, 12022, 12067, 12077, 12311, 12324, 12436, 12680, 12774, 12899, 12924, 13037, 13047, 13051, 13052, 13061, 13179, 13183, 13187, 13203, 13749, 13799, 13820, 13823, 13826, 13843, 13845, 13853, 13860, 13887, 13897, 13928, 13945, 13959, 13961, 13965, 13974, 13977, 13992, 13993, 14000, 14003, 14008, 14019, 14022, 14026, 14028, 14035, 14038, 14044, 14047, 14054, 14055, 14056, 14063, 14070, 14071, 14085, 14089, 14097, 14101, 14104, 14105, 14106, 14110, 14113, 14286, 14312, 14313, 14315, 14324, 14329, 14330, 14336, 14339, 14935, 14939, 14972, 14975, 14982, 14984, 15025, 15030, 15032, 15033, 15037, 15039, 

KNN
------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Convert 'time' column to datetime format
data['time'] = pd.to_datetime(data['time'])

# Checking the unique variables in the dataset
unique_variables = data['variable'].unique()

unique_variables


array(['mood', 'circumplex.arousal', 'circumplex.valence', 'activity',
       'screen', 'call', 'sms', 'appCat.builtin', 'appCat.communication',
       'appCat.entertainment', 'appCat.finance', 'appCat.game',
       'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel',
       'appCat.unknown', 'appCat.utilities', 'appCat.weather'],
      dtype=object)

In [7]:
from sklearn.impute import KNNImputer

# Filtering the data for 'mood' variable
mood_data = data[data['variable'] == 'mood'].copy()

# Sorting the data by time to maintain time series order
mood_data.sort_values(by='time', inplace=True)

# We need to reset the index because KNNImputer relies on numerical indices to find nearest neighbors
mood_data.reset_index(drop=True, inplace=True)

# Selecting columns for imputation (using 'value' column only here as example)
values_for_imputation = mood_data[['value']]

# Setting up the KNN imputer, choosing 5 neighbors for simplicity
imputer = KNNImputer(n_neighbors=5)

# Performing the imputation
mood_data['value_imputed'] = imputer.fit_transform(values_for_imputation)

# Showing the original and imputed values to compare
mood_data[mood_data['value'].isnull()][['time', 'value', 'value_imputed']]


Unnamed: 0,time,value,value_imputed


In [8]:
# Counting missing values by variable type
missing_by_variable = data.groupby('variable')['value'].apply(lambda x: x.isnull().sum())
missing_by_variable_sorted = missing_by_variable.sort_values(ascending=False)

missing_by_variable_sorted

variable
circumplex.valence      156
circumplex.arousal       46
activity                  0
appCat.unknown            0
screen                    0
mood                      0
call                      0
appCat.weather            0
appCat.utilities          0
appCat.travel             0
appCat.builtin            0
appCat.social             0
appCat.other              0
appCat.office             0
appCat.game               0
appCat.finance            0
appCat.entertainment      0
appCat.communication      0
sms                       0
Name: value, dtype: int64

In [10]:
# Filtering the data for 'circumplex.valence' variable
valence_data = data[data['variable'] == 'circumplex.arousal'].copy()

# Sorting the data by time to maintain time series order
valence_data.sort_values(by='time', inplace=True)

# Reset the index because KNNImputer relies on numerical indices to find nearest neighbors
valence_data.reset_index(drop=True, inplace=True)

# Selecting columns for imputation (using 'value' column only here as example)
valence_values_for_imputation = valence_data[['value']]

# Setting up the KNN imputer, choosing 5 neighbors for simplicity
valence_imputer = KNNImputer(n_neighbors=5)

# Performing the imputation
valence_data['value_imputed'] = valence_imputer.fit_transform(valence_values_for_imputation)

# Showing the original and imputed values to compare
valence_data[valence_data['value'].isnull()][['time', 'value', 'value_imputed']].head()


Unnamed: 0,time,value,value_imputed
1247,2014-04-01 15:00:00,,-0.098624
1268,2014-04-01 19:00:00,,-0.098624
1361,2014-04-02 18:00:00,,-0.098624
1744,2014-04-06 12:00:00,,-0.098624
1825,2014-04-06 21:00:00,,-0.098624


随机森林

In [11]:
# Step 1: Remove the unnecessary 'Unnamed: 0' column
data = data.drop(columns=['Unnamed: 0'])

# Step 2: Convert 'time' column to datetime format
data['time'] = pd.to_datetime(data['time'])

# Step 3: Analyze missing data by 'variable' type
missing_data_analysis = data.groupby('variable').apply(lambda x: x['value'].isnull().sum())

# Display the missing data analysis by variable
missing_data_analysis


  missing_data_analysis = data.groupby('variable').apply(lambda x: x['value'].isnull().sum())


variable
activity                  0
appCat.builtin            0
appCat.communication      0
appCat.entertainment      0
appCat.finance            0
appCat.game               0
appCat.office             0
appCat.other              0
appCat.social             0
appCat.travel             0
appCat.unknown            0
appCat.utilities          0
appCat.weather            0
call                      0
circumplex.arousal       46
circumplex.valence      156
mood                      0
screen                    0
sms                       0
dtype: int64

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Label encoding for categorical data
label_encoder_id = LabelEncoder()
data['id_encoded'] = label_encoder_id.fit_transform(data['id'])

label_encoder_variable = LabelEncoder()
data['variable_encoded'] = label_encoder_variable.fit_transform(data['variable'])

# Function to prepare data for filling missing values with RandomForest
def prepare_data_for_rf(variable_name):
    # Selecting rows related to the variable
    variable_data = data[data['variable'] == variable_name]

    # Separating into data with and without missing values
    known_data = variable_data[variable_data['value'].notna()]
    unknown_data = variable_data[variable_data['value'].isna()]

    # Selecting features and target
    features = known_data[['id_encoded', 'variable_encoded', 'time']]
    target = known_data['value']

    # One-hot encoding for time features
    features = pd.get_dummies(features, columns=['time'])

    # Splitting the data for training
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, unknown_data, features.columns

# Prepare data for 'circumplex.arousal'
X_train_arousal, X_test_arousal, y_train_arousal, y_test_arousal, unknown_data_arousal, feature_cols_arousal = prepare_data_for_rf('circumplex.arousal')

# Initialize and train the RandomForest model
model_arousal = RandomForestRegressor(n_estimators=100, random_state=42)
model_arousal.fit(X_train_arousal, y_train_arousal)

# Predict the missing values
unknown_data_arousal_features = pd.get_dummies(unknown_data_arousal[['id_encoded', 'variable_encoded', 'time']], columns=['time'])
unknown_data_arousal_features = unknown_data_arousal_features.reindex(columns=feature_cols_arousal, fill_value=0)
predicted_values_arousal = model_arousal.predict(unknown_data_arousal_features)

# Display predictions
predicted_values_arousal


array([-0.67, -0.56, -0.43, -0.62, -0.65, -0.55, -0.17, -0.4 , -0.98,
       -0.3 , -0.98, -0.01,  0.  ,  0.01,  0.  ,  0.  ,  1.  ,  1.  ,
        0.67,  0.82,  0.59,  1.22,  0.82,  0.82,  0.82,  1.  ,  0.  ,
        0.01,  0.14,  0.16,  0.14,  0.14,  0.14,  0.14, -0.01,  0.06,
       -0.28,  0.98, -0.29, -0.14, -0.28,  0.48, -0.3 , -0.28, -0.28,
       -0.36])

In [13]:
# Prepare data for 'circumplex.valence'
X_train_valence, X_test_valence, y_train_valence, y_test_valence, unknown_data_valence, feature_cols_valence = prepare_data_for_rf('circumplex.valence')

# Initialize and train the RandomForest model for 'circumplex.valence'
model_valence = RandomForestRegressor(n_estimators=100, random_state=42)
model_valence.fit(X_train_valence, y_train_valence)

# Predict the missing values for 'circumplex.valence'
unknown_data_valence_features = pd.get_dummies(unknown_data_valence[['id_encoded', 'variable_encoded', 'time']], columns=['time'])
unknown_data_valence_features = unknown_data_valence_features.reindex(columns=feature_cols_valence, fill_value=0)
predicted_values_valence = model_valence.predict(unknown_data_valence_features)

# Display predictions for 'circumplex.valence'
predicted_values_valence


array([ 1.  ,  0.8 ,  1.  ,  0.99,  0.9 ,  1.  ,  1.  ,  0.82,  1.  ,
        1.  ,  1.01,  1.  ,  1.  ,  1.  ,  1.  ,  0.97,  1.  ,  1.  ,
        1.  ,  0.82,  1.  ,  0.95,  0.92,  1.  ,  1.  ,  1.  ,  0.52,
        1.  ,  1.  ,  0.84,  1.  ,  1.  ,  1.  ,  1.16,  1.  ,  1.  ,
        0.98,  1.  ,  1.  ,  1.  ,  0.92,  1.  ,  0.41,  0.79,  1.  ,
        1.  ,  0.76,  1.  ,  1.  ,  1.  ,  1.  ,  0.45,  0.92,  1.  ,
        1.  ,  1.  ,  1.  ,  1.  ,  0.99,  1.  ,  0.94, -0.31,  0.99,
        1.  ,  0.8 ,  0.84,  1.  ,  1.  ,  1.  ,  0.5 , -0.12,  1.  ,
        0.96,  1.  ,  0.41,  0.37,  0.28,  1.  ,  1.  ,  1.  ,  0.47,
        0.96,  1.  ,  1.  ,  0.87,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,
        0.99,  0.98,  0.9 ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,
        1.  ,  1.  ,  1.  ,  0.98,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,
        1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  0.46,  1.  ,  1.  ,
        1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  1.  ,  0.86,  1.53,  1.  ,
        0.94,  0.59,