In [1]:
# needed libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Importing & Preprocessing Data

In [2]:
# import data
training = pd.read_csv('trainingData.csv', header = 0)
validation = pd.read_csv('validationData.csv', header = 0)
print('Training Data Set Dimensions: \t' + str(np.shape(training)))
print('Validation Data Set Dimensions: ' + str(np.shape(validation)))
print('\n Sample of Data:')
training.head(3)

Training Data Set Dimensions: 	(19937, 529)
Validation Data Set Dimensions: (1111, 529)

 Sample of Data:


Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP520,LONGITUDE,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP
0,100,100,100,100,100,100,100,100,100,100,...,100,-7541.2643,4864921.0,2,1,106,2,2,23,1371713733
1,100,100,100,100,100,100,100,100,100,100,...,100,-7536.6212,4864934.0,2,1,106,2,2,23,1371713691
2,100,100,100,100,100,100,100,-97,100,100,...,100,-7519.1524,4864950.0,2,1,103,2,2,23,1371714095


In [3]:
# drop unnecessary columns
drop_col = [520,521,524,525,526,527,528]
training = training.drop(training.columns[drop_col], axis = 1)
validation = validation.drop(validation.columns[drop_col], axis = 1)

In [4]:
# top 3 rows of new training set after drop
training.head(3)

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,FLOOR,BUILDINGID
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,2,1
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,2,1
2,100,100,100,100,100,100,100,-97,100,100,...,100,100,100,100,100,100,100,100,2,1


In [5]:
# create a new column combining building and floor locations
training['build_floor'] = training.apply(lambda row : str(row['BUILDINGID']) + '.' + str(row['FLOOR']), axis=1)
validation['build_floor'] = validation.apply(lambda row : str(row['BUILDINGID']) + '.' + str(row['FLOOR']), axis=1)
training.head(3)

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,FLOOR,BUILDINGID,build_floor
0,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,2,1,1.2
1,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,2,1,1.2
2,100,100,100,100,100,100,100,-97,100,100,...,100,100,100,100,100,100,100,2,1,1.2


In [6]:
# adjust WAP values formula
# 0 equals to no signal
# 105 equals to max signal
training.iloc[:,0:520] = training.iloc[:,0:520].replace(100, -105)
training.iloc[:,0:520] += 105
validation.iloc[:,0:520] = validation.iloc[:,0:520].replace(100, -105)
validation.iloc[:,0:520] += 105

In [7]:
training.describe()

Unnamed: 0,WAP001,WAP002,WAP003,WAP004,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,...,WAP513,WAP514,WAP515,WAP516,WAP517,WAP518,WAP519,WAP520,FLOOR,BUILDINGID
count,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,...,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0,19937.0
mean,0.008727,0.016301,0.0,0.0,0.025029,0.297437,0.676882,0.781411,0.811958,0.058334,...,0.14601,0.128906,0.053468,4.943572,6.602699,0.014847,0.001354,0.0,1.674575,1.21282
std,0.292228,0.531403,0.0,0.0,0.566261,2.590441,4.377768,4.533137,4.95545,0.915116,...,1.903943,1.662281,1.00398,12.978733,13.448082,0.46278,0.11175,0.0,1.223078,0.833139
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0
max,12.0,19.0,0.0,0.0,16.0,47.0,58.0,44.0,55.0,21.0,...,35.0,38.0,23.0,105.0,105.0,18.0,11.0,0.0,4.0,2.0


### Feature Engineering

In [8]:
# Principal Component Analysis on Data
pca = PCA(random_state = 14)
t_pca = pca.fit(training.iloc[:,0:520])
v_pca = pca.fit(validation.iloc[:,0:520])
print(str(round(t_pca.explained_variance_ratio_[0:110].sum(),2)) + ' of variance explained by first 110 PC')
print(str(round(t_pca.explained_variance_ratio_[0:221].sum(),2)) + ' of variance explained by first 221 PC')

0.95 of variance explained by first 110 PC
0.99 of variance explained by first 221 PC


In [9]:
# transforming PCA into a DataFrame
WAP = ['WAP_%i' % i for i in range(1,521)]
training_pca = pd.DataFrame(t_pca.transform(training.iloc[:,0:520]), 
                            columns=WAP,
                            index = training.iloc[:,0:520].index)
validation_pca = pd.DataFrame(v_pca.transform(validation.iloc[:,0:520]), 
                            columns=WAP,
                            index = validation.iloc[:,0:520].index)

In [10]:
# adding label (build_floor) to pca data sets
training_pca['build_floor'] = training['build_floor']
validation_pca['build_floor'] = validation['build_floor']
print(np.shape(training_pca))
training_pca.head(1)

(19937, 521)


Unnamed: 0,WAP_1,WAP_2,WAP_3,WAP_4,WAP_5,WAP_6,WAP_7,WAP_8,WAP_9,WAP_10,...,WAP_512,WAP_513,WAP_514,WAP_515,WAP_516,WAP_517,WAP_518,WAP_519,WAP_520,build_floor
0,-6.481547,-17.019789,-53.727879,73.513014,8.875953,-22.581842,-11.996387,0.657891,5.506258,0.735435,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.2673769999999999e-48,-7.015439e-15,1.2


In [11]:
# sampling training_pca set
training_pca_2000 = training_pca.sample(n = 2000, random_state = 14)
training_pca_4000 = training_pca.sample(n = 4000, random_state = 14)
training_pca_8000 = training_pca.sample(n = 8000, random_state = 14)

# Building & Evaluating Models

In [15]:
# multiprocessing
import multiprocessing
print(multiprocessing.cpu_count())

# import classifiers
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans

# import GridSearchCV
from sklearn.model_selection import GridSearchCV

# import performance metrics
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
import time

# ignore warnings 
import warnings
warnings.filterwarnings("ignore")

4


In [16]:
# Random Forest Classifier Fitting
rf = RandomForestClassifier(random_state = 14)
rf_params = {'n_estimators': [250]}
rf_cv = GridSearchCV(rf, rf_params, cv=10)
rf_start_time = time.time()
rf_fit = rf_cv.fit(training_pca.iloc[:,0:110], training_pca['build_floor'])
rf_end_time = time.time()

In [22]:
# Random Forest Classifier Performance
rf_fit_time = rf_end_time - rf_start_time
print('Fit Time: '+str(round(rf_fit_time/60,2))+' minutes\n')
print(rf_fit.best_estimator_)
print('\nTraining Accuracy Score: ' + str(round(rf_fit.best_score_,4)))
rf_predictions = rf_fit.predict(validation_pca.iloc[:,0:110])
print('\nTesting Accuracy Score: ' + str(round(accuracy_score(validation_pca['build_floor'], rf_predictions),4)))
print('Testing Kappa Score:\t' + str(round(cohen_kappa_score(validation_pca['build_floor'], rf_predictions),4)))
confusion_matrix(validation_pca['build_floor'], rf_predictions)

Fit Time: 8.92 minutes

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=14, verbose=0, warm_start=False)

Training Accuracy Score: 0.9847

Testing Accuracy Score: 0.9379
Testing Kappa Score:	0.9303


array([[ 72,   4,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  8, 198,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   4, 160,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,  84,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,  20,   8,   2,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   2, 129,  11,   1,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   1,  86,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   1,   4,  42,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,  22,   2,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0, 110,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  48,   6,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  39,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,  32]])

In [23]:
# K Neighbors Classifier Fitting
knn = KNeighborsClassifier()
knn_params = {'n_neighbors': [1],
              'p':[1,2],
              'n_jobs': [-1]}
knn_cv = GridSearchCV(knn, knn_params, cv=10)
knn_start_time = time.time()
knn_fit = knn_cv.fit(training_pca.iloc[:,0:50], training_pca['build_floor'])
knn_end_time = time.time()

In [24]:
# K Neighbors Classifier Performance
knn_fit_time = knn_end_time - knn_start_time
print('Fit Time: '+str(round(knn_fit_time/60,2))+' minutes\n')
print(knn_fit.best_estimator_)
print('\nTraining Accuracy Score: ' + str(round(knn_fit.best_score_,4)))
knn_predictions = knn_fit.predict(validation_pca.iloc[:,0:50])
print('\nTesting Accuracy Score: ' + str(round(accuracy_score(validation_pca['build_floor'], knn_predictions),4)))
print('Testing Kappa Score:\t' + str(round(cohen_kappa_score(validation_pca['build_floor'], knn_predictions),4)))
confusion_matrix(validation_pca['build_floor'], knn_predictions)

Fit Time: 0.21 minutes

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
           weights='uniform')

Training Accuracy Score: 0.9862

Testing Accuracy Score: 0.9136
Testing Kappa Score:	0.9033


array([[ 72,   3,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  4, 204,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1,   3, 158,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   1,  84,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,  21,   4,   5,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   7,  96,  37,   3,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   2,  83,   2,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   5,  42,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,  22,   2,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   1, 110,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  51,   3,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  39,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   0,   5,  33]])

In [25]:
# Support Vector Machines Classifier
svc = SVC(random_state = 14)
svc_params = {'C': [0.1],
              'kernel': ['poly'],
              'degree': [2],
              'gamma': ['auto']}
svc_cv = GridSearchCV(svc, svc_params, cv=10)
svc_start_time = time.time()
svc_fit = svc_cv.fit(training_pca_8000.iloc[:,0:325], training_pca_8000['build_floor'])
svc_end_time = time.time()

In [28]:
# Support Vector Machines Performance
svc_fit_time = svc_end_time - svc_start_time
print('Fit Time: '+str(round(svc_fit_time/60,2))+' minutes\n')
print(svc_fit.best_estimator_)
print('\nTraining Accuracy Score: ' + str(round(svc_fit.best_score_,4)))
svc_predictions = svc_fit.predict(validation_pca.iloc[:,0:325])
print('\nTesting Accuracy Score: ' + str(round(accuracy_score(validation_pca['build_floor'], svc_predictions),4)))
print('Testing Kappa Score:\t' + str(round(cohen_kappa_score(validation_pca['build_floor'], svc_predictions),4)))
confusion_matrix(validation_pca['build_floor'], svc_predictions)

Fit Time: 1.02 minutes

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=14, shrinking=True,
  tol=0.001, verbose=False)

Training Accuracy Score: 0.9915

Testing Accuracy Score: 0.9064
Testing Kappa Score:	0.8953


array([[ 72,   3,   2,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0],
       [  3, 201,   2,   0,   0,   0,   0,   2,   0,   0,   0,   0,   0],
       [  0,   4, 158,   0,   0,   0,   0,   2,   0,   1,   0,   0,   0],
       [  0,   0,   1,  83,   0,   0,   0,   1,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,  22,   4,   3,   1,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   8,  94,  33,   6,   2,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   1,  79,   7,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   3,  44,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,  21,   3,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   1, 108,   2,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  52,   1,   1],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  38,   2],
       [  0,   0,   0,   0,   0,   0,   0,   1,   1,   0,   0,   2,  35]])