In [18]:
import os
import pandas as pd
from datetime import datetime
import math

class DataDiggestor:
	"""Collect essay and humidity spreadsheets from a folder into DataFrames.

	Files whose name starts with ``E`` are read as essay recordings. Files whose
	name starts with ``U`` are read as humidity sheets and are used to label the
	essay row recorded closest to the minute encoded in the file name.
	"""

	# Accepted textual layouts of a timestamp, tried in this order.
	_TIME_FORMATS = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S')
	# The value paired with a '-' cell is read this many columns to its right.
	_DASH_VALUE_OFFSET = 2
	# Characters dropped from a '*' cell (after spaces are removed) before the
	# remainder is parsed as a float.
	# NOTE(review): presumably the length of a fixed label prefix - confirm
	# against a real humidity sheet.
	_STAR_PREFIX_LENGTH = 9
	# Position in a humidity file name where the minute count starts, and the
	# marker that ends it.
	_MINUTES_START = 5
	_MINUTES_END_MARKER = '_min'

	def _get_list_of_humidities(self, df: pd.DataFrame):
		"""Extract one humidity reading per matching column of a humidity sheet.

		For every column, in order:
		* if some row holds exactly ``'-'``, the reading is taken from the first
		  such row, two columns to the right (the column is skipped when that
		  position does not exist);
		* otherwise, if some cell contains ``'*'``, the first such cell has its
		  spaces removed and everything after its first nine characters is
		  parsed as the reading.

		Args:
			df: sheet content; cells are compared as text.

		Returns:
			List of floats, in column order (empty when nothing matched).

		Raises:
			ValueError: if a selected cell cannot be parsed as a float.
		"""
		humidities = []
		column_count = len(df.columns)
		# Columns are addressed by position so duplicated headers cannot be mixed up.
		for position in range(column_count):
			# Text view of the column: the string tests below must not fail on
			# numeric or empty columns.
			text = df.iloc[:, position].astype(str)

			dash_rows = df[text == '-']
			if len(dash_rows) != 0:
				value_position = position + self._DASH_VALUE_OFFSET
				if value_position > column_count - 1:
					continue
				humidities.append(float(dash_rows.iloc[0, value_position]))
				continue

			# Literal search: '*' is a regex metacharacter, so regex matching is disabled.
			star_rows = df[text.str.contains('*', regex=False, na=False)]
			if len(star_rows) == 0:
				continue
			raw_value = str(star_rows.iloc[0, position])
			humidities.append(float(raw_value.replace(' ', '')[self._STAR_PREFIX_LENGTH:]))

		return humidities

	def _get_humidity_levels(self, df: pd.DataFrame):
		"""Return the mean of the sheet's humidity readings, rounded to 6 decimals.

		Returns:
			The rounded mean, or ``None`` when the sheet yields no reading.
		"""
		humidities = self._get_list_of_humidities(df)
		if not humidities:
			return None
		return round(math.fsum(humidities) / len(humidities), 6)

	def _get_time_in_seconds(self, time_object: datetime):
		"""Return the time of day of ``time_object`` as seconds since midnight."""
		return time_object.hour * 3600 + time_object.minute * 60 + time_object.second + time_object.microsecond / 1e6

	def _parse_time(self, value):
		"""Parse ``str(value)`` with the first matching entry of ``_TIME_FORMATS``.

		Raises:
			ValueError: if no format matches.
		"""
		text = str(value)
		for time_format in self._TIME_FORMATS:
			try:
				return datetime.strptime(text, time_format)
			except ValueError:
				continue
		raise ValueError(f'time value {text!r} does not match any of {self._TIME_FORMATS}')

	def _convert_time_column_to_relative_seconds(self, df:pd.DataFrame, time_column_name='Time'):
		"""Return a copy of ``df`` whose time column holds seconds since its earliest entry.

		Offsets are computed from the full timestamps (date included), so a
		recording that runs past midnight keeps increasing instead of wrapping.

		Args:
			df: frame with a column of timestamps written as
				``YYYY-MM-DD HH:MM:SS`` with optional fractional seconds.
			time_column_name: name of that column.

		Returns:
			A new DataFrame; ``df`` itself is left untouched.

		Raises:
			ValueError: if an entry matches none of the accepted formats.
		"""
		moments = [self._parse_time(value) for value in df[time_column_name]]
		if moments:
			start = min(moments)
			seconds = [(moment - start).total_seconds() for moment in moments]
		else:
			seconds = []
		response = df.copy()
		# Assigned positionally (plain array) so the result does not depend on df's index labels.
		response[time_column_name] = pd.Series(seconds, dtype=float).to_numpy()
		return response

	def _get_closest_time_line(self, target_in_seconds:float, df:pd.DataFrame, time_in_seconds_column_name='Time'):
		"""Return the index label of the row whose time is nearest to the target.

		Raises:
			ValueError: if ``df`` has no rows.
		"""
		if len(df) == 0:
			raise ValueError('cannot look up the closest time in an empty DataFrame')
		return (df[time_in_seconds_column_name] - target_in_seconds).abs().idxmin()

	def get_essay_number_from(self, filename:str):
		"""Return the essay number encoded in a file name.

		The number is read from the two characters that follow the first ``E``
		of the last path component. Both ``/`` and ``\\`` are accepted as path
		separators, whatever the platform.

		Raises:
			IndexError: if the file name contains no ``E``.
			ValueError: if the two characters are not an integer.
		"""
		base_name = filename.replace('\\', '/').rsplit('/', 1)[-1]
		return int(base_name.split('E')[1][0:2])

	def _get_humidity_time_in_seconds(self, dir_filename:str):
		"""Return the collection time encoded in a humidity file name, in seconds.

		The minute count is the text between position 5 and the ``_min`` marker.

		Raises:
			ValueError: if the marker is missing or the text is not an integer.
		"""
		end = dir_filename.find(self._MINUTES_END_MARKER)
		if end == -1:
			raise ValueError(f'humidity file name {dir_filename!r} has no {self._MINUTES_END_MARKER!r} marker')
		return int(dir_filename[self._MINUTES_START:end]) * 60

	def _list_data_files(self, data_path:str, prefix:str):
		"""Return ``(name, path)`` pairs of the regular files starting with ``prefix``.

		The listing is sorted because ``os.listdir`` returns entries in arbitrary order.
		"""
		found = []
		for name in sorted(os.listdir(data_path)):
			path = os.path.join(data_path, name)
			if name.startswith(prefix) and os.path.isfile(path):
				found.append((name, path))
		return found

	def _load_essay(self, filename:str):
		"""Read one essay file, convert its time column and tag it with its essay number."""
		df = pd.read_excel(filename)
		df = self._convert_time_column_to_relative_seconds(df)
		df['Ensaio'] = self.get_essay_number_from(filename)
		return df

	def get_labeled_essays(self, data_path:str):
		"""Return one essay row per humidity file, labeled with the measured humidity.

		All essay files are loaded first, so the result does not depend on the
		order in which the directory lists its entries. For each humidity file,
		the row of the matching essay whose time is closest to the file's
		collection time is copied, and ``'Umidade Produto [%]'`` is set to the
		mean humidity read from the file.

		Args:
			data_path: folder holding the ``E...`` and ``U...`` files; other
				files are ignored.

		Returns:
			DataFrame with one row per humidity file (empty when there is none).

		Raises:
			ValueError: if a humidity file refers to an essay with no essay file.
		"""
		essays_df = self.get_essays(data_path)
		labeled_rows = []

		for dir_filename, filename in self._list_data_files(data_path, 'U'):
			essay_number = self.get_essay_number_from(filename)
			humidity_collected_time_in_seconds = self._get_humidity_time_in_seconds(dir_filename)

			if 'Ensaio' in essays_df.columns:
				current_essay = essays_df[essays_df['Ensaio'] == essay_number]
			else:
				current_essay = essays_df
			if len(current_essay) == 0:
				raise ValueError(
					f'humidity file {dir_filename!r} refers to essay {essay_number}, '
					f'but no essay file for it was found in {data_path!r}'
				)
			closest_line = self._get_closest_time_line(humidity_collected_time_in_seconds, current_essay)

			humidity = self._get_humidity_levels(pd.read_excel(filename, dtype=str))

			# The row is taken from the combined frame so it carries every essay column.
			current_humidity_essay_line = dict(essays_df.loc[closest_line])
			current_humidity_essay_line['Umidade Produto [%]'] = humidity
			current_humidity_essay_line['Ensaio'] = essay_number
			labeled_rows.append(current_humidity_essay_line)

		# Built once from the collected rows instead of concatenating inside the loop.
		return pd.DataFrame(labeled_rows)

	def get_essays(self, data_path:str):
		"""Return every essay file of ``data_path`` stacked into a single DataFrame.

		Each file contributes its rows with the time column converted to seconds
		since the file's earliest entry and an ``'Ensaio'`` column holding the
		essay number. Files are processed in sorted name order; files that do
		not start with ``E`` are ignored.

		Returns:
			The combined DataFrame with a fresh 0..n-1 index (empty when no
			essay file exists).
		"""
		frames = [self._load_essay(filename) for _, filename in self._list_data_files(data_path, 'E')]
		if not frames:
			return pd.DataFrame()
		return pd.concat(frames).reset_index(drop=True)

In [19]:
# Folder scanned by DataDiggestor; the path is relative, so it depends on the
# notebook's working directory.
data_path = '../data/collected_data'

diggestor = DataDiggestor()
# Show the table returned by get_labeled_essays: one essay row per humidity
# file, with the measured value in 'Umidade Produto [%]'.
display(diggestor.get_labeled_essays(data_path))

Unnamed: 0,Time,PT100 1 [ºC],PT100 2 [ºC],Temp. TH 1 [ºC],Umidade 1 [%],Temp. TH 2 [ºC],Umidade 2 [%],Ensaio,PT100 3 [ºC],PT100 4 [ºC],Vel. do Ar [m/s],Umidade Produto [%]
0,0.0,23.681101,23.62614,23.174812,57.70434,24.255182,73.746678,0,,,,55.223333
1,899.564,27.150231,27.212689,24.787157,47.790284,28.432172,87.234795,0,,,,48.006667
2,1798.938,26.887455,26.675792,26.097392,44.201039,26.974166,92.332055,0,,,,41.003333
3,2700.298,26.897853,26.742584,27.033274,42.788991,26.88222,93.655657,0,,,,7.083333
4,3601.595,43.864988,39.788933,27.676899,40.073266,36.175378,36.28046,0,,,,2.283333
5,0.0,26.356301,26.398715,25.890513,49.829562,26.698327,68.761479,1,1326.058821,1326.005777,1.075032,61.353333
6,898.476,26.933613,26.909098,27.158059,44.335677,27.230302,91.89852,1,1326.058821,1326.005777,1.075032,51.775
7,1801.147,26.88462,26.862624,28.579943,40.601962,27.07268,92.834563,1,-244.98909,1326.005777,1.075032,35.256667
8,2701.283,30.078009,28.68758,29.167743,38.766306,26.816544,93.0579,1,1326.058821,1326.005777,1.075032,4.863333
9,3600.386,47.169319,46.821808,29.404176,37.866541,43.074684,18.242541,1,1326.058821,1326.005777,1.075032,2.13


In [20]:
# Folder scanned by DataDiggestor; the path is relative, so it depends on the
# notebook's working directory.
data_path = '../data/collected_data'

diggestor = DataDiggestor()
# Show every essay file stacked into one table; 'Ensaio' holds the essay number
# and 'Time' the seconds elapsed since each file's earliest entry.
display(diggestor.get_essays(data_path))

Unnamed: 0,Time,PT100 1 [ºC],PT100 2 [ºC],Temp. TH 1 [ºC],Umidade 1 [%],Temp. TH 2 [ºC],Umidade 2 [%],Ensaio,PT100 3 [ºC],PT100 4 [ºC],Vel. do Ar [m/s]
0,0.000,23.681101,23.626140,23.174812,57.704340,24.255182,73.746678,0,,,
1,3.253,23.681888,23.623464,22.944946,57.779871,24.288020,73.914167,0,,,
2,6.655,23.675593,23.618114,23.158393,57.628810,24.333993,74.577554,0,,,
3,9.884,23.666780,23.609773,23.164961,57.326689,24.383250,75.809093,0,,,
4,13.184,23.649784,23.598442,22.987635,56.486007,24.376682,77.267243,0,,,
...,...,...,...,...,...,...,...,...,...,...,...
10920,3589.997,45.175779,43.714931,24.550723,56.548401,39.968184,26.399501,9,1326.058821,1326.005777,1.075032
10921,3593.550,45.181483,43.728711,24.448926,56.476155,39.932062,26.120379,9,1326.058821,1326.005777,1.075032
10922,3597.008,45.192734,43.747875,24.547440,56.466303,40.027293,26.035000,9,-244.989090,-244.989090,1.075032
10923,3600.524,45.204301,43.769892,24.544156,56.508994,39.971468,26.113811,9,-244.989090,1326.005777,1.075032
