## Overview

This notebook demonstrates how to use [the unified Wi-Fi dataset](https://www.kaggle.com/kokitanisaka/indoorunifiedwifids).<br>
The neural network model is not optimized, so there is plenty of room to improve the score. 

In this notebook, I refer to these two excellent notebooks.
* [wifi features with lightgbm/KFold](https://www.kaggle.com/hiro5299834/wifi-features-with-lightgbm-kfold) by [@hiro5299834](https://www.kaggle.com/hiro5299834/)<br>
 I took some code fragments from his notebook.
* [Simple 👌 99% Accurate Floor Model 💯](https://www.kaggle.com/nigelhenry/simple-99-accurate-floor-model) by [@nigelhenry](https://www.kaggle.com/nigelhenry/)<br>
 I use his excellent work, the "floor" prediction.

Training takes a very long time to finish. <br>
Even with the GPU enabled, it does not get faster. <br>
If anybody knows how to improve this, please leave a comment. <br>

Thank you!

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import glob
import pickle
import random
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

### options
We can change the way it learns with these options. <br>
In particular, **NUM_FEATS** is one of the most important options. <br>
It determines how many features are used in the training. <br>
We have 100 Wi-Fi features in the dataset, but the 100th Wi-Fi signal does not sound important, right? <br>
So we can use only the top Wi-Fi signals if we think that is enough. 

In [None]:
# Notebook-wide settings.

# Base directory used to build every input path below.
base_path = '/kaggle'

# Number of Wi-Fi features to use. The dataset offers 100 of them, but we
# don't need to use all of them.
NUM_FEATS = 20

In [None]:
# Directory that holds the unified Wi-Fi feature files.
feature_dir = f"{base_path}/input/indoorunifiedwifids"

# Collect the train/test CSV paths; sorting keeps the order deterministic.
train_pattern = os.path.join(feature_dir, '*_train.csv')
test_pattern = os.path.join(feature_dir, '*_test.csv')
train_files = sorted(glob.glob(train_pattern))
test_files = sorted(glob.glob(test_pattern))

# Sample submission, indexed by its first column.
sample_submission_path = f'{base_path}/input/indoor-location-navigation/sample_submission.csv'
subm = pd.read_csv(sample_submission_path, index_col=0)

In [None]:
# Load the pickled train and test tables from the feature directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
with open(f'{feature_dir}/train_all.pkl', 'rb') as train_file:
    data = pickle.load(train_file)

with open(f'{feature_dir}/test_all.pkl', 'rb') as test_file:
    test_data = pickle.load(test_file)

In [None]:
# Column names of the first NUM_FEATS BSSID / RSSI features
# ('bssid_0', 'bssid_1', ... and 'rssi_0', 'rssi_1', ...).
BSSID_FEATS = ['bssid_' + str(idx) for idx in range(NUM_FEATS)]
RSSI_FEATS = ['rssi_' + str(idx) for idx in range(NUM_FEATS)]

In [None]:
# get numbers of bssids to embed them in a layer
#
# The first 100 columns of each table are scanned for BSSID values. The
# distinct values are collected per table, reported, and then merged.

train_bssids = set()
for i in range(100):
    train_bssids.update(data.iloc[:, i].values.tolist())
wifi_bssids = list(train_bssids)

wifi_bssids_size = len(wifi_bssids)
print(f'BSSID TYPES: {wifi_bssids_size}')

test_bssids = set()
for i in range(100):
    test_bssids.update(test_data.iloc[:, i].values.tolist())
wifi_bssids_test = list(test_bssids)

wifi_bssids_size = len(wifi_bssids_test)
print(f'BSSID TYPES: {wifi_bssids_size}')

# Merge with a set union rather than list.extend: a BSSID that appears in
# both tables must be counted once, otherwise wifi_bssids_size is inflated by
# every shared BSSID.
wifi_bssids = list(train_bssids | test_bssids)
wifi_bssids_size = len(wifi_bssids)

In [None]:
# preprocess: fit the encoders and the scaler (nothing is transformed here)

# Label encoder for BSSID values, fitted on the collected wifi_bssids list.
le = LabelEncoder().fit(wifi_bssids)
# Label encoder for site ids, fitted on the training table's site_id column.
le_site = LabelEncoder().fit(data['site_id'])

# Standard scaler for the RSSI columns, fitted on the training table.
ss = StandardScaler()
ss.fit(data.loc[:, RSSI_FEATS])

In [None]:
data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])#
for i in BSSID_FEATS:
    data.loc[:,i] = le.transform(data.loc[:,i])
    data.loc[:,i] = data.loc[:,i] + 1
    
data.loc[:, 'site_id'] = le_site.transform(data.loc[:, 'site_id'])

data.loc[:,RSSI_FEATS] = ss.transform(data.loc[:,RSSI_FEATS])

In [None]:
test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])
for i in BSSID_FEATS:
    test_data.loc[:,i] = le.transform(test_data.loc[:,i])
    test_data.loc[:,i] = test_data.loc[:,i] + 1
    
test_data.loc[:, 'site_id'] = le_site.transform(test_data.loc[:, 'site_id'])

test_data.loc[:,RSSI_FEATS] = ss.transform(test_data.loc[:,RSSI_FEATS])

In [None]:
from sklearn import preprocessing

# Build the feature matrix X and target matrix Y from the training table.
np.random.seed(10)  # fixed seed so the row shuffle is reproducible
dataset = data.values
np.random.shuffle(dataset)  # shuffles the rows of `dataset` in place

# Use the first NUM_FEATS columns instead of a hard-coded 20, so the slice
# always matches the columns that were label-encoded via BSSID_FEATS.
X = dataset[:, 0:NUM_FEATS]
# Columns 200-202 are the regression targets
# (presumably x, y and floor -- TODO confirm against the dataset schema).
Y = dataset[:, 200:203]
# Column 204 is appended as one extra feature
# (presumably the encoded site_id -- TODO confirm).
X_site = dataset[:, 204:205]
X = np.hstack((X, X_site))

# Standardize every feature column to zero mean and unit variance.
X = preprocessing.scale(X)
print(X)
print(Y)
X.shape[1]


In [None]:
# Hold-out split: the first 200000 rows are used for training and the
# remaining rows for evaluation.
X_train, X_test = X[:200000], X[200000:]
Y_train, Y_test = Y[:200000], Y[200000:]

In [None]:
# Failed earlier attempt, kept for reference only: everything between the
# triple quotes below is a bare string literal, so none of it is executed.
'''
import numpy as np
import pandas as pd
import keras
import keras.backend as kb
import tensorflow as tf

model = keras.Sequential([
    keras.layers.Dense(32, activation=tf.nn.relu, input_shape=(X.shape[-1],)),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(3)
  ])

optimizer = tf.keras.optimizers.RMSprop(0.0099)
#optimizer = tf.keras.optimizers.RMSprop(0.9)
model.compile(loss='mean_squared_error',optimizer=optimizer)
#model.summary()
X_train= np.asarray(X_train).astype('float32')
Y_train= np.asarray(Y_train).astype('float32')
model.fit(X_train,Y_train,epochs=3)
#model.evaluate(X_test, Y_test, verbose=20)
'''

In [None]:
# Keras expects float32 inputs, so cast both training arrays up front.
X_train = np.asarray(X_train).astype('float32')
Y_train = np.asarray(Y_train).astype('float32')

# Small fully connected network: two hidden ReLU layers of 32 units and a
# linear output layer with 3 units.
n_inputs = X_train.shape[1]
model = Sequential([
    Dense(32, input_shape=(n_inputs,), activation="relu"),
    Dense(32, activation="relu"),
    Dense(3),
])
model.compile(loss="mse", optimizer="adam", metrics=["mae"])

# Train silently (verbose=0), then store the learned weights on disk.
model.fit(X_train, Y_train, epochs=50, batch_size=16, verbose=0)
model.save_weights('lg.h5')

In [None]:
# Cast the held-out arrays to float32 and score the model on them.
X_test, Y_test = (
    np.asarray(held_out).astype('float32') for held_out in (X_test, Y_test)
)
# NOTE: despite its name, `accuracy` simply holds whatever model.evaluate
# returns for the compiled loss and metrics.
accuracy = model.evaluate(X_test, Y_test, verbose=2)

In [None]:
# Predict on the held-out rows and print the raw predictions.
X_test = np.asarray(X_test).astype(np.float32)
Y_test = np.asarray(Y_test).astype(np.float32)
Y_pred = model.predict(x=X_test, batch_size=10, verbose=0)
print(Y_pred)


In [None]:
# Build the feature matrix for the submission rows from the test table.
dataset = test_data.values
# First NUM_FEATS columns (instead of a hard-coded 20) plus column 201
# (presumably the encoded site_id of the test table -- TODO confirm).
sub_X = dataset[:, 0:NUM_FEATS]
sub_X_site = dataset[:, 201:202]
sub_X = np.hstack((sub_X, sub_X_site))
print(sub_X_site[0])

# Standardize with the TRAINING statistics, not the test set's own mean/std:
# the model was trained on features scaled by the training distribution, so
# scaling the test rows independently would put them on a different scale.
# The training features are rebuilt from the same columns used for X
# (first NUM_FEATS columns plus column 204); row order does not affect the
# fitted mean/std.
train_values = data.values
train_X = np.hstack((train_values[:, 0:NUM_FEATS], train_values[:, 204:205]))
sub_X = StandardScaler().fit(train_X).transform(sub_X)
sub_X.shape[1]

In [None]:
# Cast the submission features to float32, predict, and print the result.
sub_X = np.asarray(sub_X).astype(np.float32)
pre_Y = model.predict(x=sub_X, batch_size=10, verbose=0)
print(pre_Y)

In [None]:
# Snap the third prediction column (index 2) to the nearest integer, in place.
# np.rint rounds halves to even exactly like the built-in round(), but works
# on the whole column at once and does not raise on NaN/inf predictions.
pre_Y[:, 2] = np.rint(pre_Y[:, 2])

In [None]:
# Wrap the predictions in a DataFrame and write them to result.csv
# (default options, so the index column is written as well).
pre_Y = pd.DataFrame(data=pre_Y)
pre_Y.to_csv(path_or_buf='result.csv')

In [None]:
# NOTE(review): this is not a Python statement; it presumably relies on
# IPython's automagic `cd` to change into the `working` directory -- confirm.
cd working