In [1]:
import pandas as pd
import numpy as np
import time
import importlib.machinery
es = importlib.machinery.SourceFileLoader('extrasense','/home/sac086/extrasensory/extrasense/extrasense.py').load_module()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [104]:
# Run configuration consumed by the loading/cleaning cells further down.
data_type = "activity"    # forwarded to clean_data(data_type=...)
labeled_only = True       # forwarded to clean_data(labeled_only=...)
leave_users_out = None    # None => the loading loop processes every id in es.user_ids

In [156]:
def clean_labels_new(labels_df, include_label_source=True):
    """Collapse one-hot ``label:*`` columns into one label per row (vectorised).

    For each row, the name of the column holding the largest value is taken
    and the ``"label:"`` text is removed from it. Rows whose label columns are
    all NaN yield NaN.

    Parameters
    ----------
    labels_df : pandas.DataFrame
        Label indicator columns, optionally accompanied by a ``label_source``
        column (which is never treated as a label candidate).
    include_label_source : bool, default True
        When True, ``labels_df`` must contain ``label_source`` and it is
        returned next to the labels.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        An unnamed Series of labels when ``include_label_source`` is False;
        otherwise a DataFrame with columns ``[0, 'label_source']`` (the label
        column is named ``0`` because the Series is unnamed; callers rename it).

    Raises
    ------
    KeyError
        If ``include_label_source`` is True and ``label_source`` is missing.
    """
    # Exclude the source column by name instead of assuming it is the last
    # column; a positional slice would silently drop a real label column
    # whenever 'label_source' is absent or not last.
    score_cols = labels_df.loc[:, labels_df.columns != "label_source"]

    # idxmax on all-NaN rows is deprecated (and an error in newer pandas), so
    # only rows with at least one value are passed to it; the rest stay NaN.
    has_score = score_cols.notna().any(axis=1).values
    winners = pd.Series(np.nan, index=score_cols.index, dtype=object)
    if has_score.any():
        winners.iloc[has_score] = score_cols.iloc[has_score].idxmax(axis=1).values

    # Literal (non-regex) replacement keeps the result independent of the
    # pandas version's default for ``regex``.
    clean_labels_df = winners.str.replace("label:", "", regex=False)
    if include_label_source:
        clean_labels_df = pd.concat((clean_labels_df, labels_df['label_source']), axis=1)
    return clean_labels_df

In [None]:
def _strongest_label(scores):
    """Return the cleaned name of the highest-valued entry of ``scores`` (NaN if all are NaN)."""
    if scores.isna().all():
        return np.nan
    # idxmax returns the index *label*; np.argmax on a Series returns an
    # integer position in current pandas, which broke the ':' handling below.
    winner = scores.idxmax()
    if isinstance(winner, str) and ":" in winner:
        return winner.split(":")[1]
    return winner


def clean_labels(labels_df, include_label_source=True):
    """Collapse one-hot label columns into one label per row (row-by-row version).

    Parameters
    ----------
    labels_df : pandas.DataFrame
        Label indicator columns. Must contain ``label_source`` when
        ``include_label_source`` is True.
    include_label_source : bool, default True
        False: every column is a label candidate and a Series is returned.
        True: ``label_source`` is excluded from the candidates and returned
        alongside the label; rows whose ``label_source`` equals -1.0 get a
        label of None.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Series of labels, or a DataFrame with columns ``label`` and
        ``label_source``. Both carry a fresh 0..n-1 index, not the index of
        ``labels_df``.
    """
    if not include_label_source:
        return pd.Series([_strongest_label(row) for _, row in labels_df.iterrows()])

    records = []
    for _, row in labels_df.iterrows():
        source = row['label_source']
        if source != -1.0:
            # The source column must not compete with the label columns.
            label = _strongest_label(row.drop('label_source'))
        else:
            label = None
        records.append({"label": label, "label_source": source})
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(records, columns=["label", "label_source"])

In [159]:
def clean_data(df, data_type="activity", labeled_only=False):
    """Build a cleaned feature/label frame from one user's raw DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw per-user data. The "activity" path reads a ``timestamp`` column
        plus the columns listed in ``es.acc_col_name`` / ``es.acc_labels_name``.
    data_type : str, default "activity"
        "activity" uses the ``es.acc_*`` column lists and ``clean_labels_new``;
        any other value uses ``label_col_name`` and ``clean_labels``.
    labeled_only : bool, default False
        When True, rows whose ``label`` is null are dropped.

    Returns
    -------
    pandas.DataFrame
        Features, a ``label`` column and (depending on the path)
        ``label_source`` / ``timestamp``, with rows lacking feature values
        removed.
    """
    start = time.time()
    # Compare by value: ``is`` tests object identity, which is not guaranteed
    # for equal strings and triggers a SyntaxWarning on Python 3.8+.
    if data_type == "activity":
        features_df = df[df.columns.intersection(es.acc_col_name)]
        labels_df = clean_labels_new(df[df.columns.intersection(es.acc_labels_name)])
        print("\tTime for splitting and CLEANING labels : %.3f" % (time.time() - start))
        df = pd.concat((features_df,labels_df,df['timestamp']), axis=1)
        print("\tTime to concatenate : %.3f" % (time.time() - start))

        # clean_labels_new leaves the label column named 0.
        df = df.rename(columns={0:'label'})
        df = df.dropna(subset=es.acc_col_name)
        print("\tTime to drop NaN rows : %.3f" % (time.time() - start))
    else:
        # NOTE(review): ``label_col_name`` is not defined in the visible cells;
        # presumably a global or an ``es`` attribute - confirm before using this path.
        feature_columns = [col for col in df.columns if col not in label_col_name]
        features_df = df[df.columns.intersection(feature_columns)]
        labels_df = clean_labels(df[df.columns.intersection(label_col_name)])
        df = pd.concat((features_df,labels_df), axis=1)
        df = df.rename(columns={0:'label'})
        df = df.dropna(subset=feature_columns)
    if labeled_only:
        df = df[df.label.notnull()]
        print("\tTime to drop unlabeled stuff: %.3f" % (time.time() - start))
    return df

In [160]:
# Load and clean each user's CSV, collecting the per-user frames in user_dfs.
user_dfs = []
for u_num, uid in enumerate(es.user_ids):
    print("user #%s/%s" % (u_num, len(es.user_ids)))
    # NOTE(review): when leave_users_out is set, only the users *in* that
    # collection are loaded - confirm this matches the intent of the name.
    if (leave_users_out is None) or (uid in leave_users_out):
        start = time.time()
        df = pd.read_csv(es.data_dir+uid+".features_labels.csv")
        print("\tTime to load dataframe from csv: %.3f" % (time.time() - start))
        user_df = clean_data(df, data_type=data_type, labeled_only=labeled_only)
        print("\tTime to finish cleaning data: %.3f" % (time.time() - start))
        # clean_data may return a boolean-filtered frame; assign() builds a new
        # frame instead of writing into that result (avoids SettingWithCopy issues).
        user_df = user_df.assign(user_id=uid)
        print("\tTime to finally add user_id column: %.3f" % (time.time() - start))
        user_dfs.append(user_df)

user #0/60
	Time to load dataframe from csv: 0.232
	Time for splitting and CLEANING labels : 0.017
	Time to concatenate : 0.019
	Time to drop NaN rows : 0.022
	Time to drop unlabeled stuff: 0.023
	Time to finish cleaning data: 0.255
	Time to finally add user_id column: 0.291
user #1/60
	Time to load dataframe from csv: 0.143
	Time for splitting and CLEANING labels : 0.014
	Time to concatenate : 0.015
	Time to drop NaN rows : 0.018
	Time to drop unlabeled stuff: 0.019
	Time to finish cleaning data: 0.162
	Time to finally add user_id column: 0.197
user #2/60
	Time to load dataframe from csv: 0.119
	Time for splitting and CLEANING labels : 0.009
	Time to concatenate : 0.010
	Time to drop NaN rows : 0.013
	Time to drop unlabeled stuff: 0.014
	Time to finish cleaning data: 0.133
	Time to finally add user_id column: 0.169
user #3/60
	Time to load dataframe from csv: 0.262
	Time for splitting and CLEANING labels : 0.022
	Time to concatenate : 0.023
	Time to drop NaN rows : 0.026
	Time to drop

	Time to load dataframe from csv: 0.323
	Time for splitting and CLEANING labels : 0.023
	Time to concatenate : 0.024
	Time to drop NaN rows : 0.028
	Time to drop unlabeled stuff: 0.029
	Time to finish cleaning data: 0.352
	Time to finally add user_id column: 0.388
user #32/60
	Time to load dataframe from csv: 0.324
	Time for splitting and CLEANING labels : 0.025
	Time to concatenate : 0.027
	Time to drop NaN rows : 0.030
	Time to drop unlabeled stuff: 0.032
	Time to finish cleaning data: 0.355
	Time to finally add user_id column: 0.391
user #33/60
	Time to load dataframe from csv: 0.222
	Time for splitting and CLEANING labels : 0.017
	Time to concatenate : 0.019
	Time to drop NaN rows : 0.022
	Time to drop unlabeled stuff: 0.023
	Time to finish cleaning data: 0.245
	Time to finally add user_id column: 0.281
user #34/60
	Time to load dataframe from csv: 0.323
	Time for splitting and CLEANING labels : 0.025
	Time to concatenate : 0.027
	Time to drop NaN rows : 0.030
	Time to drop unlabel

In [161]:
# Stack the per-user frames collected in user_dfs into a single DataFrame.
# Each frame keeps its own row index, so index values may repeat across users.
impersonal_df = pd.concat(user_dfs)

In [162]:
# Sanity check: number of rows that still contain at least one null value.
impersonal_df.isnull().any(axis=1).sum()

0

# Finding the fastest row-wise argmax over the label columns of a DataFrame

In [135]:
# Load the raw CSV of the first user in es.user_ids as a sample for the experiments below.
uid = es.user_ids[0]
df = pd.read_csv(es.data_dir+uid+".features_labels.csv")

In [136]:
# Split the sample frame into feature and label columns, keeping only the
# columns of df that appear in the extrasense module's name lists.
features_df = df[df.columns.intersection(es.acc_col_name)]
labels_df = df[df.columns.intersection(es.acc_labels_name)]

In [150]:
# Per row: name of the highest-valued column (last column skipped), with the
# "label:" text removed; display the distinct results.
(
    labels_df
    .iloc[:, :-1]
    .idxmax(axis=1)
    .str.replace("label:", "")
    .unique()
)

array(['SITTING', 'FIX_walking', 'LYING_DOWN', nan, 'FIX_running',
       'BICYCLING'], dtype=object)

In [144]:
# Preview the first rows of every column up to and including 'label_source'
# (label-based .loc slices include the end label).
labels_df.loc[:,:"label_source"].head()

Unnamed: 0,label:LYING_DOWN,label:SITTING,label:FIX_walking,label:FIX_running,label:BICYCLING,label:STAIRS_-_GOING_UP,label:STAIRS_-_GOING_DOWN,label_source
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
