In [1]:
# Import Necessary Libraries
import numpy as np
import pandas as pd
from pathlib import Path
import os

# Mount Drive
# Google Drive is mounted at /content/drive; every path used later in this notebook
# (the raw data file and ROOT) lives under /content/drive/MyDrive.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load the raw sensor log (a space-separated text file without a header row) into a DataFrame.
# For more information for the dataset visit: https://www.kaggle.com/datasets/divyansh22/intel-berkeley-research-lab-sensor-data?resource=download
data = pd.read_csv('/content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/data.txt', sep  =  " ", header = None)

# Name the 8 columns after the fields of the dataset
data.columns = ["date:yyyy-mm-dd", "time:hh:mm:ss.xxx", "epoch:int", "moteid:int", "temperature:real", "humidity:real", "light:real", "voltage:real" ]

# In one chain: drop the columns that are not needed, remove the first row,
# and discard observations whose temperature value is missing
data = (
  data
  .drop(columns = ["humidity:real", "light:real", "voltage:real"])
  .iloc[1: , :]
  .dropna(subset = ['temperature:real'])
)

data.head()

Unnamed: 0,date:yyyy-mm-dd,time:hh:mm:ss.xxx,epoch:int,moteid:int,temperature:real
1,2004-02-28,00:59:16.02785,3,1.0,19.9884
2,2004-02-28,01:03:16.33393,11,1.0,19.3024
3,2004-02-28,01:06:16.013453,17,1.0,19.1652
4,2004-02-28,01:06:46.778088,18,1.0,19.175
5,2004-02-28,01:08:45.992524,22,1.0,19.1456


In [3]:
# General Information Regarding dataset
# This file includes a log of about 2.3 million readings collected from the 54 sensors installed in the lab. There are 8 columns in this txt file which could be converted to a CSV file with ease.
# The columns included are date:yyyy-mm-dd, time:hh:mm:ss.xxx, epoch:int, moteid:int, temperature:real, humidity:real, light:real, voltage:real.
# Epoch is a monotonically increasing sequence number from each mote. Two readings from the same epoch number were produced from different motes at the same time.
# There are some missing epochs in this data set. Moteids range from 1-54; data from some motes may be missing or truncated.
# Temperature is in degrees Celsius. Humidity is temperature corrected relative humidity, ranging from 0-100%.
# Light is in Lux (a value of 1 Lux corresponds to moonlight, 400 Lux to a bright office, and 100,000 Lux to full sunlight.)
# Voltage is expressed in volts, ranging from 2-3; the batteries, in this case, were lithium-ion cells that maintain a fairly constant voltage over their lifetime
# Note that variations in voltage are highly correlated with temperature.

In [4]:
# Collect the readings of each sensor (mote) into a list of DataFrames, one entry per sensor.
# Mote ids 1 to 54 are scanned (range(1, 55) stops before 55) and motes 5, 8, 12, 15 and 50 are
# skipped, which leaves 49 entries. TODO: document why these five motes are excluded (ask shoaib bhai).
# Note: each entry holds one sensor's temperature readings at different moments in time. According to the
# original author the minimum number of readings is 28131 and the maximum is 65689.
EXCLUDED_MOTES = {5, 8, 12, 15, 50}

# reset_index returns a new frame, so its result must be kept; drop = True discards the old
# row labels instead of adding them as an extra 'index' column, giving every sensor a clean 0..n-1 index.
lst = [
  data.loc[data['moteid:int'] == float(mote_id)].reset_index(drop = True)
  for mote_id in range(1, 55)
  if mote_id not in EXCLUDED_MOTES
]

# Kept for backward compatibility: the number of sensors collected
num = len(lst)
print('len(lst)', len(lst))

len(lst) 49


In [5]:
# Global root directory for the generated data, created up front if it does not exist yet
ROOT = Path('/content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/')
ROOT.mkdir(parents = True, exist_ok = True)

# Hyperparameter grids: one dataset is generated for every (q, sigma) pair
# q     -> fraction handed to sample_data (see that function for how it is used)
# sigma -> noise level handed to np.random.normal as its scale
q_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
sigma_list = [0.0, 1.0, 3.0, 5.0, 7.0, 10.0]

# The file structure is as follows

# data/ <- overall dataset folder
#     Q is 10.0%/ <- Sampling Rate
#         Noise Variance 0.0/ <- Noise Variance
#             groundtruth/ <- Groundtruth numpy arrays
#               train/ <- numpy arrays for training
#                 L_mat_MC_train0.npy
#                 L_mat_MC_train1.npy
#                 ...
#               test/ <- numpy arrays for inference
#                 L_mat_MC_test0.npy
#                 L_mat_MC_test1.npy
#             prediction/ <- Will eventually be populated by our proposed ConvMC-Net predictions.
#               train/ <- numpy arrays for prediction on train dataset
#                 L_mat_MCinfer_train0.npy
#                 L_mat_MCinfer_train1.npy
#                 ...
#               test/ <- numpy arrays for prediction on test dataset
#                 L_mat_MCinfer_test0.npy
#                 L_mat_MCinfer_test1.npy
#         ...
#         Noise Variance 10.0/
#           ...
#     ...
#     Q is 90.0%/ <- Sampling Rate
#         Noise Variance 10.0/ <- Noise Variance
#             groundtruth/ <- Groundtruth numpy arrays
#               train/ <- numpy arrays for training
#                 L_mat_MC_train0.npy
#                 L_mat_MC_train1.npy
#                 ...
#               test/ <- numpy arrays for inference
#                 L_mat_MC_test0.npy
#                 L_mat_MC_test1.npy
#             prediction/ <- Will eventually be populated by our proposed ConvMC-Net predictions.
#               train/ <- numpy arrays for prediction on train dataset
#                 L_mat_MCinfer_train0.npy
#                 L_mat_MCinfer_train1.npy
#                 ...
#               test/ <- numpy arrays for prediction on test dataset
#                 L_mat_MCinfer_test0.npy
#                 L_mat_MCinfer_test1.npy

In [6]:
# Generating various types of data with varying sampling rate (Q) and noise (sigma)

# We will use the list made above to intialize a numpy array of zeros of shape (49, 60) i.e. we are interested in 60 readings for each sensor.

# Each of these arrays holds 60 readings for each of the 49 sensors. Matrix i takes, from every sensor, the readings at row positions 60*i to 60*i + 59,
# so all sensors contribute the same positional window. This treats equal row positions as (approximately) equal times, which missing epochs can violate.
# The 468 windows need 468 * 60 = 28,080 readings per sensor, so the index range is chosen such that it is present in every sensor's entry of the list.

# We will have 468 such numpy arrays out of which 400 will be used for training and 68 for testing. Therefore the training data will represent 400 low rank matrices of overall shape (400, 49, 60), and for
# test (68, 49, 60). Depending on Q and sigma, we will appropriately remove Q% of entries and add sigma noise to the training/test data. To ensure reproducibility we will set a random seed.

# Set a random seed for reproducibility
# This seeds NumPy's global generator once, when this cell runs. The helpers defined below draw from
# that same global state (np.random.shuffle, np.random.normal, np.random.choice), so the generated
# data depends on the number and order of calls made after seeding; re-run this cell before regenerating.
np.random.seed(42)

# Defining a function 'sample_data' which takes an array and a fraction q (between 0 and 1), sets a randomly chosen fraction q of the entries to 0 and leaves the rest unchanged

def sample_data(arr, q):
  """Zero out a randomly chosen fraction q of the entries of arr.

  Args:
    arr: NumPy array of any shape; it is not modified.
    q: fraction (between 0 and 1) of the entries to set to 0. The number of
      zeroed entries is int(q * arr.size), i.e. it is truncated toward zero.

  Returns:
    A new array with the shape of arr in which the selected entries are 0 and
    all other entries are unchanged.

  The positions are drawn with np.random.shuffle, so the result depends on the
  state of NumPy's global random generator.
  """
  n_entries = arr.size
  n_dropped = int(q * n_entries)

  # Flat 0/1 mask: n_dropped zeros followed by ones for every entry that is kept
  keep_mask = np.repeat([0.0, 1.0], [n_dropped, n_entries - n_dropped])

  # Scatter the zeros over random positions, then give the mask the shape of the input
  np.random.shuffle(keep_mask)
  keep_mask = keep_mask.reshape(arr.shape)

  # Multiplying by the mask simulates the missing entries
  return arr * keep_mask

# Defining a function which, given q and sigma, builds the data batch, adds noise and zeroes entries accordingly, and returns the train and test counterparts

def get_train_and_test_data(q, sigma):
  """Build the noisy, masked batch of sensor matrices and split it into train and test sets.

  Matrix i of the batch stacks, for each of the 49 sensors in the global list
  `lst`, the temperature readings at row positions 60*i to 60*i + 59.

  Args:
    q: fraction of entries set to 0 by sample_data. The mask is drawn over the
      whole (468, 49, 60) batch, so the fraction holds for the batch as a whole
      and not exactly for every single matrix.
    sigma: noise level, passed to np.random.normal as `scale`.
      NOTE(review): `scale` is a standard deviation, while the output folders
      are named 'Noise Variance' - confirm which one is intended.

  Returns:
    (train_arr, test_arr): arrays of shape (400, 49, 60) and (68, 49, 60).
    The 400 training matrices are picked at random without replacement and the
    remaining 68 form the test set.

  All randomness comes from NumPy's global random generator.
  """
  num_matrices, num_sensors, num_readings = 468, 49, 60
  num_train = 400

  # Get data batch: window i of every sensor becomes one row of matrix i
  arr = np.zeros((num_matrices, num_sensors, num_readings), dtype = float)
  for i in range(num_matrices):
    start = num_readings * i
    for j in range(num_sensors):
      # reshape fails loudly if a sensor has fewer readings than this window requires
      arr[i, j, :] = lst[j]['temperature:real'].iloc[start:(start + num_readings)].to_numpy().reshape(num_readings)

  # Add the noise exactly once, after the whole batch has been filled. Doing this inside the
  # loop above would add a fresh draw to the entire batch on every iteration, so early
  # matrices would accumulate hundreds of noise draws instead of a single one.
  arr = arr + np.random.normal(loc = 0.0, scale = float(sigma), size = arr.shape)

  # Now randomly set a fraction q of the data to 0, rest remain unchanged. Use the helper function 'sample_data'
  sampled_arr = sample_data(arr, q)

  # Randomly select 400 matrices for training; the 68 that are left are used for testing
  chosen_indices = np.random.choice(num_matrices, size = num_train, replace = False)
  remaining_indices = np.setdiff1d(np.arange(num_matrices), chosen_indices)

  train_arr = sampled_arr[chosen_indices]
  test_arr = sampled_arr[remaining_indices]

  # Return train and test arrays
  return train_arr, test_arr

# Defining a function which makes the respective directories corresponding to the hyperparameters q and sigma.
def make_dir(q, sigma, ground_or_pred):
  """Create the output folders for one (q, sigma) setting and return the leaf folders.

  The layout is ROOT / 'Q is <q * 100>%' / 'Noise Variance <sigma>' / <ground_or_pred> / {train, test},
  for example 'Q is 10.0%/Noise Variance 0.0/groundtruth/train' for q = 0.1 and sigma = 0.0.
  Folders that already exist are left untouched.

  Args:
    q: value whose percentage (q * 100) names the first folder level.
    sigma: value that names the second folder level.
    ground_or_pred: name of the third folder level, e.g. 'groundtruth'.

  Returns:
    (train_dir, test_dir) as Path objects.
  """
  base_dir = ROOT / f'Q is {q * 100}%' / f'Noise Variance {sigma}' / ground_or_pred

  # Creating the two leaf folders also creates every missing parent level
  split_dirs = tuple(base_dir / split for split in ('train', 'test'))
  for split_dir in split_dirs:
    os.makedirs(split_dir, exist_ok = True)

  # Only the train and test directories are needed by the caller
  return split_dirs

# Defining a function which generates the train and test arrays for a given q and sigma and stores them in their respective directories
def save_data(q, sigma):
  """Generate the data for one (q, sigma) setting and write it to disk.

  The arrays come from get_train_and_test_data and the target folders from
  make_dir(q, sigma, 'groundtruth'). Every matrix along the first axis is
  saved as its own .npy file: L_mat_MC_train<i>.npy in the train folder and
  L_mat_MC_test<i>.npy in the test folder, numbered from 0.

  Args:
    q: forwarded to get_train_and_test_data and make_dir.
    sigma: forwarded to get_train_and_test_data and make_dir.
  """
  # Get data
  train_arr, test_arr = get_train_and_test_data(q, sigma)

  # Get train and test dir (the parent folders are made along the way)
  train_dir, test_dir = make_dir(q, sigma, 'groundtruth')

  # Iterating over the first axis yields one matrix per file
  for idx, matrix in enumerate(train_arr):
    np.save(str(train_dir) + '/L_mat_MC_train' + str(idx) + '.npy', matrix)

  for idx, matrix in enumerate(test_arr):
    np.save(str(test_dir) + '/L_mat_MC_test' + str(idx) + '.npy', matrix)

In [None]:
# Generate and save one dataset for every (q, sigma) combination,
# visiting all sigma values for a given q before moving on to the next q
param_grid = [(q, sigma) for q in q_list for sigma in sigma_list]

for q, sigma in param_grid:
  print(f'Saving Data for Sampling Rate: {q * 100}% and Noise Variance {sigma}')
  save_data(q, sigma)

In [9]:
# Checking if Data is stored and structured correctly: count the files saved for every
# (q, sigma, split) combination. A complete run shows 400 files per train folder and 68 per test folder.

for q in q_list:
  for sigma in sigma_list:
    for split in ['train', 'test']:
      # Build the path from ROOT with the same folder names make_dir uses, so this check
      # cannot drift away from where the data is actually written
      dir_path = ROOT / f'Q is {q * 100}%' / f'Noise Variance {sigma}' / 'groundtruth' / split

      # Count the number of files (excluding directories)
      num_files = sum(1 for entry in dir_path.iterdir() if entry.is_file())

      print(f"Number of files in the directory: {dir_path} are {num_files}")


Number of files in the directory: /content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/Q is 10.0%/Noise Variance 0.0/groundtruth/train are 400
Number of files in the directory: /content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/Q is 10.0%/Noise Variance 0.0/groundtruth/test are 68
Number of files in the directory: /content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/Q is 10.0%/Noise Variance 1.0/groundtruth/train are 400
Number of files in the directory: /content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/Q is 10.0%/Noise Variance 1.0/groundtruth/test are 68
Number of files in the directory: /content/drive/MyDrive/DUPA - RCPA/Technology transfer deep unfolding/SPROJ-ConvMC-Net/Sensor Data/Q is 10.0%/Noise Variance 3.0/groundtruth/train are 400
Number of files in the directory: /content/drive/MyDrive/DU

In [None]:
# Perfect!!