In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
import time

In [2]:
train_df = pd.read_csv('saved_train_df_idx.csv')
print(f"Shape of the train df: {train_df.shape}")
train_df.head()

Shape of the train df: (595038, 1855)


Unnamed: 0,zone,session_id,tag_id,Unnamed: 3,0,1,2,3,4,5,...,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850
0,Floor 1,216441698810,ad24cd,17,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
1,Floor 1,216441698810,ad24cd,18,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
2,Floor 1,216441698810,ad24cd,19,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
3,Floor 1,216441698810,ad24cd,20,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
4,Floor 1,216441698810,ad24cd,21,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698


In [3]:
val_df = pd.read_csv('saved_val_df_idx.csv')
print(f"Shape of the validation matrix: {val_df.shape}")
val_df.head()

Shape of the validation matrix: (290184, 1855)


Unnamed: 0,zone,session_id,tag_id,Unnamed: 3,0,1,2,3,4,5,...,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850
0,Floor 1,216441698810,ad24cd,17,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
1,Floor 1,216441698810,ad24cd,18,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
2,Floor 1,216441698810,ad24cd,19,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
3,Floor 1,216441698810,ad24cd,20,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698
4,Floor 1,216441698810,ad24cd,21,-0.391508,-0.322429,-0.335741,-0.39144,-0.424744,-0.442341,...,-0.265351,-0.264938,-0.332012,-0.323045,-0.285949,-0.280206,-0.310537,-0.255797,-0.310412,-0.290698


In [4]:
logit = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier(criterion='gini')
rf = RandomForestClassifier(n_estimators=100)
ada = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)

In [5]:
from sw_rtls_algo_util.preprocessing.pipes import (
    cleaned_df_to_lstm_data
)

from sw_rtls_algo_util.preprocessing.transforms import (
    generate_lstm_data,
    generate_lookback,
    lookback_numpy
)

from sw_rtls_algo_util.encoder_io import load_encoder

sensor_encoder = load_encoder('model/sensor_encoder.pkl')
zone_encoder = load_encoder('model/zone_encoder.pkl')
rssi_standardizer = load_encoder('model/rssi_standardizer.pkl')

train_data, train_labels = generate_lstm_data(train_df, zone_encoder, lookback=1)


In [6]:
data = generate_lookback(val_df, 1, inference_version=False)
zone = data.reset_index()['zone']
tag = data.reset_index()['tag_id']
labels = []
for zone, feature, tag in zip(zone,data.values, tag):
    labels = labels+[zone]*len(feature)

features = data.values
for i in range(len(features)):
    if len(features[i].shape) < 2:
        features[i] = features[i].reshape(1, -1)
val_data = np.concatenate(features)

converted_zone = np.array(labels).reshape(-1,1)
enc_y = zone_encoder.transform(converted_zone)
val_labels = enc_y.toarray()

In [7]:
val_labels = val_labels[:290184]

In [8]:
train_data = train_data[:,4:]
val_data = val_data[:,4:]

In [36]:
import math

# Train a model and report its performance
def train_and_eval(model, name, features, labels, test_features, test_labels):
    # Transform the labels into a 1d array
    if name in ['logit', 'svc', 'ada', 'xgb']:
        labels = np.argmax(labels, 1)
        test_labels = np.argmax(test_labels, 1)

    if name == 'xgb':
        dtrain = xgb.DMatrix(features[:int(0.8*features.shape[0])], labels[:int(0.8*features.shape[0])])
        param = {
            'max_depth': 10,
            'eta': 0.5,
            'objective': 'multi:softmax',
            'num_class': 4
            }
        dtest = xgb.DMatrix(features[int(0.8*features.shape[0]):], labels[int(0.8*features.shape[0]):])
        evallist = [(dtest, 'eval'), (dtrain, 'train')]
        num_round = 30
        start = time.perf_counter()
        bst = xgb.train(param, dtrain, num_round, evallist)
        fit_time = time.perf_counter()-start
        dval = xgb.DMatrix(test_features)

        start = time.perf_counter()
        preds = bst.predict(dval)
        inf_time = time.perf_counter()-start
        acc = sum(preds == test_labels)

        bst.save_model('xgb_multifloor.model')
        
        print("Accuracy: ", acc/test_labels.shape[0])
        return {
            "Score": acc/test_labels.shape[0],
            "Fit time": fit_time,
            "Predict time": inf_time,
            "Name": name
        }


    start = time.perf_counter()
    model.fit(features, labels)
    print("The model is fit")
    fit_time = time.perf_counter()-start
    print(f"Fit time is {fit_time}")
    
    start = time.perf_counter()
    score = model.score(test_features, test_labels)
    inf_time = time.perf_counter() - start
    print(f"Time to predict the score: {inf_time}")
    print(f"Model {model.__name__} has score: {score}")
    return {
        "Score": score,
        "Fit time": fit_time,
        "Predict time": inf_time,
        "Name": name
    }

In [11]:
# Generate proportional but trimmed datasets

value_counts = train_df['zone'].value_counts()
v1 = value_counts['Floor 1']
v2 = value_counts['Floor 2']
v3 = value_counts['Floor 3']
v4 = value_counts['Floor 4']

# Extract a subset of training data that is proportional to what existed
f1_prop = v1/train_df.shape[0]
f2_prop = v2/train_df.shape[0]
f3_prop = v3/train_df.shape[0]
f4_prop = v4/train_df.shape[0]

sample_total = 80000 # There wiill be 80000 rows in final training data
trimmed_train_data = train_data[0:int(math.floor(f1_prop*sample_total))]
trimmed_train_data = np.vstack((trimmed_train_data, train_data[v1:v1+int(math.floor(f2_prop*sample_total))]))
trimmed_train_data = np.vstack((trimmed_train_data, train_data[v1+v2:v1+v2+int(math.floor(f3_prop*sample_total))]))
trimmed_train_data = np.vstack((trimmed_train_data, train_data[v1+v2+v3:v1+v2+v3+int(math.floor(f4_prop*sample_total))]))

trimmed_train_labels = train_labels[0:int(math.floor(f1_prop*sample_total))]
trimmed_train_labels = np.vstack((trimmed_train_labels, train_labels[v1:v1+int(math.floor(f2_prop*sample_total))]))
trimmed_train_labels = np.vstack((trimmed_train_labels, train_labels[v1+v2:v1+v2+int(math.floor(f3_prop*sample_total))]))
trimmed_train_labels = np.vstack((trimmed_train_labels, train_labels[v1+v2+v3:v1+v2+v3+int(math.floor(f4_prop*sample_total))]))

np.save('trimmed_train_features', trimmed_train_data, allow_pickle=True)
np.save('trimmed_train_labels', trimmed_train_labels, allow_pickle=True)

value_counts = val_df['zone'].value_counts()
v1 = value_counts['Floor 1']
v2 = value_counts['Floor 2']
v3 = value_counts['Floor 3']
v4 = value_counts['Floor 4']

# Extract a subset of training data that is proportional to what existed
f1_prop = v1/val_df.shape[0]
f2_prop = v2/val_df.shape[0]
f3_prop = v3/val_df.shape[0]
f4_prop = v4/val_df.shape[0]

sample_total = 20000 # There wiill be 80000 rows in final training data
trimmed_val_data = val_data[0:int(math.floor(f1_prop*sample_total))]
trimmed_val_data = np.vstack((trimmed_val_data, val_data[v1:v1+int(math.floor(f2_prop*sample_total))]))
trimmed_val_data = np.vstack((trimmed_val_data, val_data[v1+v2:v1+v2+int(math.floor(f3_prop*sample_total))]))
trimmed_val_data = np.vstack((trimmed_val_data, val_data[v1+v2+v3:v1+v2+v3+int(math.floor(f4_prop*sample_total))]))

trimmed_val_labels = val_labels[0:int(math.floor(f1_prop*sample_total))]
trimmed_val_labels = np.vstack((trimmed_val_labels, val_labels[v1:v1+int(math.floor(f2_prop*sample_total))]))
trimmed_val_labels = np.vstack((trimmed_val_labels, val_labels[v1+v2:v1+v2+int(math.floor(f3_prop*sample_total))]))
trimmed_val_labels = np.vstack((trimmed_val_labels, val_labels[v1+v2+v3:v1+v2+v3+int(math.floor(f4_prop*sample_total))]))

np.save('trimmed_val_features', trimmed_val_data, allow_pickle=True)
np.save('trimmed_val_labels', trimmed_val_labels, allow_pickle=True)

In [56]:
train_data.reshape((-1,2*1851)).shape

(297519, 3702)

In [52]:
train_labels[::12].shape

(24182, 4)

In [58]:
train_data = lookback_numpy(train_data, 3, 1851)

In [61]:
train_labels = train_labels[2:]
val_data = lookback_numpy(val_data, 3, 1851)
val_labels = val_labels[2:]

In [60]:
train_data.reshape((-1, train_data.shape[1]*train_data.shape[2])).shape

(595036, 5553)

In [62]:
# scores = []
# scores.append(train_and_eval(rf, "rf", trimmed_train_data, trimmed_train_labels, trimmed_val_data, trimmed_val_labels))
# scores.append(train_and_eval(rf, 'rf', train_data, train_labels, val_data, val_labels))
scores.append(train_and_eval(rf, 'rf', train_data.reshape((-1, train_data.shape[1]*train_data.shape[2])), train_labels, val_data.reshape((-1, val_data.shape[1]*val_data.shape[2])), val_labels))
print(scores)

In [37]:
scores.append(train_and_eval(None, 'xgb', train_data, train_labels, val_data, val_labels))



[0]	eval-mlogloss:1.53741	train-mlogloss:0.63646
[1]	eval-mlogloss:1.74025	train-mlogloss:0.36879
[2]	eval-mlogloss:2.05382	train-mlogloss:0.22679
[3]	eval-mlogloss:2.32502	train-mlogloss:0.14334
[4]	eval-mlogloss:2.59674	train-mlogloss:0.09534
[5]	eval-mlogloss:2.92757	train-mlogloss:0.06451
[6]	eval-mlogloss:3.23519	train-mlogloss:0.04477
[7]	eval-mlogloss:3.49323	train-mlogloss:0.03049
[8]	eval-mlogloss:3.81132	train-mlogloss:0.02065
[9]	eval-mlogloss:4.08903	train-mlogloss:0.01402
[10]	eval-mlogloss:4.36876	train-mlogloss:0.00991
[11]	eval-mlogloss:4.62688	train-mlogloss:0.00713
[12]	eval-mlogloss:4.85881	train-mlogloss:0.00524
[13]	eval-mlogloss:5.12761	train-mlogloss:0.00397
[14]	eval-mlogloss:5.39171	train-mlogloss:0.00296
[15]	eval-mlogloss:5.61442	train-mlogloss:0.00234
[16]	eval-mlogloss:5.89261	train-mlogloss:0.00177
[17]	eval-mlogloss:6.16174	train-mlogloss:0.00140
[18]	eval-mlogloss:6.42658	train-mlogloss:0.00112
[19]	eval-mlogloss:6.70526	train-mlogloss:0.00090
[20]	eval-

In [32]:
# scores.append(train_and_eval(dt, "dt", trimmed_train_data, trimmed_train_labels, trimmed_val_data, trimmed_val_labels))
scores.append(train_and_eval(dt, 'dt', train_data, train_labels, val_data, val_labels))

The model is fit
Fit time is 290.44706937499905
Time to predict the score: 22.539450459000363


In [17]:
# scores.append(train_and_eval(ada, "ada", trimmed_train_data, trimmed_train_labels, trimmed_val_data, trimmed_val_labels))
scores.append(train_and_eval(ada, 'ada', train_data, train_labels, val_data, val_labels))

The model is fit
Fit time is 3781.4371871669996
Time to predict the score: 943.61152425


In [16]:
# scores.append(train_and_eval(logit, "logit", trimmed_train_data, trimmed_train_labels, trimmed_val_data, trimmed_val_labels))
scores.append(train_and_eval(logit, 'logit', train_data, train_labels, val_data, val_labels))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


The model is fit
Fit time is 216.215461
Time to predict the score: 29.07787237499997


In [18]:
# scores.append(train_and_eval(svc, "svc", trimmed_train_data, trimmed_train_labels, trimmed_val_data, trimmed_val_labels))
scores.append(train_and_eval(svc, 'svc', train_data, train_labels, val_data, val_labels))

The model is fit
Fit time is 8571.964375042
Time to predict the score: 3026.7402966669997


In [33]:
print(scores)

[{'Score': 0.8426756816364789, 'Fit time': 550.6219452500009, 'Predict time': 0.19181791700066242, 'Name': 'xgb'}, {'Score': 0.9480605408981887, 'Fit time': 290.44706937499905, 'Predict time': 22.539450459000363, 'Name': 'dt'}]


In [34]:
def save_ml_model(model, filename):
    import pickle
    pickle.dump(model, open(filename, 'wb'))
    
def load_ml_model(filename):
    import pickle
    return pickle.load(open(filename, 'rb'))


In [35]:
save_ml_model(dt, 'dt_multifloor.pkl')

In [None]:
# --------------------------- Keras Model ---------------------------- #

In [26]:
from tensorflow import keras
train_features = lookback_numpy(train_data, lookback=20, num_sensors=train_data.shape[1])
val_features = lookback_numpy(val_data, lookback=20, num_sensors=val_data.shape[1])

In [27]:
train_labels_lstm = train_labels[19:]
train_labels_lstm.shape

(595019, 4)

In [30]:
model = keras.models.Sequential()
model.add(keras.layers.Dropout(0.25, input_shape=(train_features.shape[1], train_features.shape[2])))
model.add(keras.layers.LSTM(200))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(train_labels_lstm.shape[1]))

learning_rate_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=5e-4,
        decay_steps=15000,
        decay_rate=0.95
    )
optimizer = keras.optimizers.Adam(learning_rate=learning_rate_schedule)

model.compile(optimizer, loss='categorical_crossentropy')

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2022-07-27 11:25:34.437138: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-27 11:25:34.437852: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [31]:
train_features = np.asarray(train_features).astype('float32')
train_labels = np.asarray(train_labels).astype('float32')
model.fit(train_features, train_labels_lstm, batch_size=64, epochs=10, verbose=1)

: 

: 

In [39]:
from sklearn.model_selection import GridSearchCV
rf_parameters = {
    "n_estimators": [50, 100, 150],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 3, 5],
    "max_features": ["sqrt", "log2", None],
    "n_jobs": [-1]
}
rf2 = RandomForestClassifier()
clf = GridSearchCV(rf2, rf_parameters)
clf.fit(train_data, train_labels)

KeyboardInterrupt: 

In [None]:
lr_parameters = {
    "penalty": ['l1', 'l2', 'elasticnet', 'none'],
    "tol": [1e-3, 1e-4, 1e-5],
    "C": [0.1, 1.0, 10],
    "class_weight": [None, 'balanced'],
    "max_iter": [50, 100, 150],
    "n_jobs": [-1]
}
lr = LogisticRegression()
lr_grid_search = GridSearchCV(lr, lr_parameters)
lr_grid_search.fit(train_data, train_labels)