# Description
This notebook includes code for creating a JSON text file with the exact training sets and sample indices used by the training, sampling, and validation pipeline of this study.  The purpose is to allow additional sampling or classification techniques to be evaluated on independent runs without needing to run the pipeline from scratch.

The resulting JSON file would look like this:

[{  "minimum_personal_samples" : 40, # minimum number of labels from user in order to be used in validation
    "k-folds" : k, # k, for the k-fold cross-validation done on personal data
    "validation users" : [
                        {"user_id" : user_id,
                         "labeled personal samples" : labeled_personal_samples, # all the samples available
                         "impersonal_training_set" : {"data_set_version" : "1", # "1" if WISDM v1.1, "2" if WISDM 2.0
                                                       "data_set_technique" : method_name, # method used for selecting users or samples
                                                       "training users" : [user_id1, user_id2, ...], # if set,
                                                                                         # the users included
                                                                                         # in the training set
                                                       "training samples" : [row_id1, row_id2, ...], # if set,
                                                                                         # the row_ids included
                                                       "sample weights" : [sw_1, sw_2,...]}, # if set, the sample weights
                         "random_personal_samples" : [[sample1, sample2, ...],[sample1, sample2, ...],...],
                         "least_certain_personal_samples" : [[sample1, sample2, ...],[sample1, sample2, ...],...],
                         "margin_certain_personal_samples" : {"margin" : margin, # margin by which samples are chosen for labeling
                                                              "samples" : [[sample1, sample2, ...],[sample1, sample2, ...],...]}
                        },
                        ...
                       ]
}]

In [1]:
from wisdm import wisdm

In [3]:
# Select data set version "1" (WISDM v1.1 according to this notebook's description).
wisdm.set_data(version="1")

In [None]:
def random_personal_sampling(personal_set, impersonal_set, number_of_samples):
    """Draw random positional indices into ``personal_set``.

    Args:
        personal_set: Sized collection of one user's samples; only its
            length is used here.
        impersonal_set: Not used by this function; kept so the signature
            stays unchanged for existing callers.
        number_of_samples: How many indices to draw.

    Returns:
        A numpy array of ``number_of_samples`` integers, each in
        ``[0, len(personal_set))``.
    """
    # The sample count must come from the ``number_of_samples`` parameter;
    # the previous code passed an undefined name, which raised NameError.
    # NOTE(review): np.random.choice draws WITH replacement by default, so
    # the same index can appear more than once -- confirm this is intended.
    random_active_indeces = np.random.choice(len(personal_set), number_of_samples)
    return random_active_indeces

In [None]:
def pipeline1(version, output_path, user_ids, training_set_techniques=None, k=10, minimum_personal_samples=40, make_compatible=True):
    """Run the leave-one-user-out sampling experiments and pickle the results.

    For every user in ``user_ids`` that has at least
    ``minimum_personal_samples`` rows left after ``remove_all_nan``:

    1. an impersonal model is trained on the rows of all *other* users
       (scaled with ``StandardScaler``, fitted with ``weka_RF`` and wrapped
       in a sigmoid ``CalibratedClassifierCV``),
    2. the scaled impersonal rows are clustered with ``KMeans``,
    3. the user's own rows are split into ``k`` stratified folds and
       ``sample_experiments`` is run once per fold,
    4. the per-fold frames are concatenated and written to
       ``output_path + user_id + ".pickle"``.

    Args:
        version: Data set version, forwarded to ``set_data``.
        output_path: String prefix of the per-user pickle files.
        user_ids: Iterable of user ids; each is held out in turn.
        training_set_techniques: Optional iterable of training-set selection
            techniques. Currently only iterated as a placeholder.
        k: Number of stratified folds for the personal data.
        minimum_personal_samples: Users with fewer rows are skipped.
        make_compatible: Forwarded to ``set_data``.

    Returns:
        None. Results are written to disk as a side effect.
    """
    # Avoid a mutable default argument shared between calls.
    if training_set_techniques is None:
        training_set_techniques = []

    # initialize pipeline variables
    random_sample_iterations = 5
    training_sizes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    # Skeleton of the JSON record described at the top of this notebook.
    # NOTE(review): it is built but never filled in or written out yet.
    sampling_data = [{"minimum_personal_samples" : minimum_personal_samples,
                      "k-folds" : k,
                      "validation_users" : []}]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Load the requested data set version
        set_data(version=version, make_compatible=make_compatible)

        # iterate through the users holding one out for testing
        for ind, user_id in enumerate(user_ids):
            user_results = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = get_user_set(user_id)
            personal_set = remove_all_nan(personal_set)

            print("%s personal samples" % len(personal_set))

            if len(personal_set) < minimum_personal_samples:
                print("User %s has less than %s labeled samples..." % (user_id, minimum_personal_samples))
                continue

            # ``.values`` replaces ``as_matrix``, which was removed in pandas 1.0.
            # Features are every column except the first and the last one.
            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].values])
            personal_features = personal_set[personal_set.columns[1:-1]].values

            # get impersonal data
            impersonal_set = data_df[data_df['user'] != user_id]
            impersonal_set = remove_all_nan(impersonal_set)
            impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_set['class'].values])
            impersonal_features = impersonal_set[impersonal_set.columns[1:-1]].values

            # NOTE(review): per-user record for the JSON output; not yet
            # appended to ``sampling_data``.
            validation_user_data = {"user_id" : user_id,
                                    "model training technique" : []}

            for training_set_technique in training_set_techniques:
                # TODO: this loop had no body (a SyntaxError); the techniques
                # are not applied anywhere yet.
                pass

            # train an impersonal model
            impersonal_scaler = StandardScaler().fit(impersonal_features)
            scaled_train_x = impersonal_scaler.transform(impersonal_features)

            rfc_clf = weka_RF()
            rfc_clf.fit(scaled_train_x, impersonal_labels)

            # calibrated for probability estimation
            prob_cal_cv_generator = StratifiedKFold(n_splits=3).split(impersonal_features, impersonal_labels)
            prob_cal_clf = CalibratedClassifierCV(rfc_clf, cv=prob_cal_cv_generator, method='sigmoid')
            prob_cal_clf.fit(scaled_train_x, impersonal_labels)

            # create clusters
            number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

            KM = KMeans(n_clusters=number_of_clusters)
            clusters = KM.fit_predict(scaled_train_x)

            # split personal data into training (potentially) and test
            skf = StratifiedKFold(n_splits=k)

            for k_run, (active_index, test_index) in enumerate(skf.split(personal_features, personal_labels)):
                print("\tRunning Fold #%s\n" % k_run)
                # data set available for active labeling from the individual
                all_active_features = personal_features[active_index]
                all_active_labels = personal_labels[active_index]

                # held out test set from individual
                test_features = personal_features[test_index]
                test_labels = personal_labels[test_index]

                k_run_df = sample_experiments(user_id, k_run,
                                              impersonal_features, impersonal_labels,
                                              all_active_features, all_active_labels,
                                              test_features, test_labels,
                                              training_sizes,
                                              random_sample_iterations,
                                              impersonal_model=prob_cal_clf, impersonal_scaler=impersonal_scaler,
                                              KM=KM, clusters=clusters)
                user_results.append(k_run_df)

            user_scores_df = pd.concat(user_results)
            user_scores_df.to_pickle(output_path+user_id+".pickle")