## Imports

In [1]:
# Pytorch
import torch
from torch.nn import CrossEntropyLoss, Conv2d, Linear, ReLU, Sequential, Sigmoid, Tanh, Softmax, MSELoss
from torch.optim import SGD, Adam

# helper
from os.path import join
import numpy as np
from math import sqrt

# Visualization
import pandas as pd

## Data acquisition

In [2]:
DATA_FOLDER = 'datasets/'
UJI_INDOOR_LOC_FOLDER = join(DATA_FOLDER, 'UjiIndoorLoc')
TRAIN_PATH = join(UJI_INDOOR_LOC_FOLDER, 'TrainingData.csv')
TEST_PATH = join(UJI_INDOOR_LOC_FOLDER, 'ValidationData.csv')

In [15]:
train_data_raw = pd.read_csv(TRAIN_PATH)
test_data_raw = pd.read_csv(TEST_PATH)

## Data exploration and cleaning

In [16]:
train_data_raw.describe()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
count,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,...,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0
mean,99.823644,99.820936,100.0,100.0,99.613733,97.130461,94.733661,93.820234,94.693936,99.163766,...,100.0,-7464.275947,4864871.0,1.674575,1.21282,148.429954,1.833024,9.068014,13.021869,1371421000.0
std,5.866842,5.798156,0.0,0.0,8.615657,22.93189,30.541335,33.010404,30.305084,12.634045,...,0.0,123.40201,66.93318,1.223078,0.833139,58.342106,0.372964,4.98872,5.36241,557205.4
min,-97.0,-90.0,100.0,100.0,-97.0,-98.0,-99.0,-98.0,-98.0,-99.0,...,100.0,-7691.3384,4864746.0,0.0,0.0,1.0,1.0,1.0,1.0,1369909000.0
25%,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,-7594.737,4864821.0,1.0,0.0,110.0,2.0,5.0,8.0,1371056000.0
50%,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,-7423.0609,4864852.0,2.0,1.0,129.0,2.0,11.0,13.0,1371716000.0
75%,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,-7359.193,4864930.0,3.0,2.0,207.0,2.0,13.0,14.0,1371721000.0
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,-7300.81899,4865017.0,4.0,2.0,254.0,2.0,18.0,24.0,1371738000.0


Some columns only have unrepresented values (100 means unrepresented)

In [17]:
unrepresented_columns = train_data_raw.loc[:, (train_data == 100).all()].columns
unrepresented_columns

Index(['WAP003', 'WAP004', 'WAP092', 'WAP093', 'WAP094', 'WAP095', 'WAP152',
       'WAP158', 'WAP159', 'WAP160', 'WAP215', 'WAP217', 'WAP226', 'WAP227',
       'WAP238', 'WAP239', 'WAP240', 'WAP241', 'WAP242', 'WAP243', 'WAP244',
       'WAP245', 'WAP246', 'WAP247', 'WAP254', 'WAP293', 'WAP296', 'WAP301',
       'WAP303', 'WAP304', 'WAP307', 'WAP333', 'WAP349', 'WAP353', 'WAP360',
       'WAP365', 'WAP416', 'WAP419', 'WAP423', 'WAP429', 'WAP433', 'WAP438',
       'WAP441', 'WAP442', 'WAP444', 'WAP445', 'WAP451', 'WAP458', 'WAP482',
       'WAP485', 'WAP487', 'WAP488', 'WAP491', 'WAP497', 'WAP520'],
      dtype='object')

In [25]:
train_data_relevant = train_data_raw.loc[:, ~train_data_raw.columns.isin(unrepresented_columns)]
test_data_relevant = test_data_raw.loc[:, ~train_data_raw.columns.isin(unrepresented_columns)]

In [29]:
train_data_relevant.head()

Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,WAP519,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,100,100,100,100,...,100,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,100,100,100,100,100,100,100,100,100,100,...,100,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,100,100,100,100,100,-97,100,100,100,100,...,100,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095
3,100,100,100,100,100,100,100,100,100,100,...,100,-7524.5704,4864934.0,2,1,102,2,2,23,1371713807
4,100,100,100,100,100,100,100,100,100,100,...,100,-7632.1436,4864982.0,0,0,122,2,11,13,1369909710


## Data processing

In [6]:
N = len(data)
split_ratio = 0.8
split_index = int(split_ratio * N)

In [7]:
data_torch = torch.tensor(data)
data_torch

tensor([[-64, -56, -61,  ..., -82, -81,   1],
        [-68, -57, -61,  ..., -85, -85,   1],
        [-63, -60, -60,  ..., -85, -84,   1],
        ...,
        [-62, -59, -46,  ..., -87, -88,   4],
        [-62, -58, -52,  ..., -90, -85,   4],
        [-59, -50, -45,  ..., -88, -87,   4]])

In [8]:
# generate random indices to split the data
random_indices = np.random.permutation(2000)
train_indices = random_indices[:split_index]
test_indices = random_indices[split_index:]

# split the data into train and test samples
train_data = data_torch[train_indices]
test_data = data_torch[test_indices]

# separate the input columns from the target column
train_input = train_data[:, :7].float()
train_target = train_data[:, 7] - 1
test_input = test_data[:, :7].float()
test_target = test_data[:, 7] - 1

## Train and evaluation functions

In [9]:
criterion = CrossEntropyLoss()
batch_size = 10
nb_epochs = 25

In [10]:
def train_model(model, train_input, train_target, nb_epochs=nb_epochs, batch_size=batch_size):
    """Train a model"""
    
    optimizer = Adam(model.parameters())

    for _ in range(nb_epochs):
        for b in range(0, train_input.size(0), batch_size):
            output = model(train_input.narrow(0, b, batch_size))
            loss = criterion(output, train_target.narrow(0, b, batch_size))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [11]:
def compute_nb_errors(model, data_input, data_target):
    """Compute the number of mismatched predictions"""
    
    nb_errors = 0
    for b in range(0, data_input.size(0), batch_size):
        output = model(data_input.narrow(0, b, batch_size))
        _, predictions = output.max(1)
        nb_errors += (predictions != data_target.narrow(0, b, batch_size)).sum()
        
    return int(nb_errors)

In [12]:
def weight_reset(m):
    """Reinitilize every linear and convolutional layer parameters in the model"""
    if isinstance(m, Conv2d) or isinstance(m, Linear):
        m.reset_parameters()

In [13]:
def train_and_evaluate(model, nb_trials=10):
    p_errs = torch.zeros(nb_trials)
    
    for i in range(nb_trials):
        model.apply(weight_reset)
        
        # train
        train_model(model, train_input, train_target)

        # evaluate
        n_err = compute_nb_errors(model, test_input, test_target)
        p_err = n_err / N
        p_errs[i] = p_err
    
    p_err_mean = p_errs.mean().item()
    std = p_errs.std().item()
    ci_95 = 1.96 * std / sqrt(nb_trials)
    
    return {'error_rate': p_err_mean,
            'std': std,
            'confidence_interval_95': ci_95
           }

In [14]:
model = Sequential(
    Linear(7, 4)
)

In [15]:
train_and_evaluate(model)

{'error_rate': 0.011749999597668648,
 'std': 0.0016372401732951403,
 'confidence_interval_95': 0.0010147719727709198}