In [14]:
import os
import pandas as pd
from datetime import datetime
import math

class DataDiggestor:
	def _get_list_of_humidities(self, df: pd.DataFrame):
		humidities = []
		columns = list(df.columns)
		for col in columns:
				current_line = (df[df[col] == '-']).reset_index(drop=True)
				if (len(current_line) != 0): 
					idx = columns.index(col) + 2
					if (idx > len(columns) - 1): continue
					humidities.append(float(current_line.iloc[0, idx]))
				else:
					current_line = (df[df[col].str.contains('\*') == True]).reset_index(drop=True)
					if (len(current_line) == 0): continue
					value = current_line.iloc[0, columns.index(col)]
					humidity = float(value.replace(' ', '')[9:])
					humidities.append(humidity)			

		return humidities

	def _get_humidity_levels(self, df: pd.DataFrame):
		humidities = self._get_list_of_humidities(df)
		h_len = len(humidities)
		if (h_len > 0):
			total = math.fsum(humidities)
			return round(total/h_len, 6)

	def _get_time_in_seconds(self, time_object: datetime):
		return time_object.hour * 3600 + time_object.minute * 60 + time_object.second + time_object.microsecond / 1e6

	def _convert_time_column_to_relative_seconds(self, df:pd.DataFrame, time_column_name='Time'):
		time_col = df[time_column_name].copy()
		for i, time in enumerate(time_col):
			try: time_obj = datetime.strptime(str(time), '%Y-%m-%d %H:%M:%S.%f')
			except:	time_obj = datetime.strptime(str(time), '%Y-%m-%d %H:%M:%S')
			time_col[i] = float(self._get_time_in_seconds(time_obj))
		time_col = time_col - time_col.min()
		response = df.copy()
		response[time_column_name] = pd.to_numeric(time_col)
		return response

	def _get_closest_line(self, target_in_seconds:float, df:pd.DataFrame, time_in_seconds_column_name='Time'):
		abs_diff = abs(df[time_in_seconds_column_name] - target_in_seconds)
		min_index = abs_diff.idxmin()
		return min_index

	def diggest_files_into_single_dataframe(self, data_path:str):
		data_df = pd.DataFrame()
		final_df = pd.DataFrame()

		for filename in os.listdir(data_path):
			f = os.path.join(data_path, filename)
			if os.path.isfile(f):
				essay_number = f.split('\\')[1].split('E')[1][0:2]
				if (filename.startswith('E')):
					df = pd.read_excel(f)
					df = self._convert_time_column_to_relative_seconds(df)
					data_df = df.copy()

				if (filename.startswith('U')):
					time = int(filename[5:filename.find('_min')]) * 60
					closest_line = self._get_closest_line(time, data_df)

					df = pd.read_excel(f, dtype=str)
					humidity = self._get_humidity_levels(df)

					line = dict(data_df.loc[closest_line])
					line['Umidade Produto [%]'] = humidity
					line['Ensaio'] = int(essay_number)

					final_df = pd.concat([final_df, pd.DataFrame([line])], ignore_index=True)

		return final_df
	
	def get_dfs_without_humidity(self, data_path:str):
		final_df = pd.DataFrame()

		for filename in os.listdir(data_path):
			f = os.path.join(data_path, filename)
			if os.path.isfile(f):
				essay_number = f.split('\\')[1].split('E')[1][0:2]
				if (filename.startswith('E')):
					df = pd.read_excel(f)
					df = self._convert_time_column_to_relative_seconds(df)
					df['Ensaio'] = int(essay_number)
					final_df = pd.concat([final_df, df], ignore_index=True)
					
		return final_df.dropna()

In [15]:
data_path = '../data/collected_data'

diggestor = DataDiggestor()
display(diggestor.diggest_files_into_single_dataframe(data_path))

Unnamed: 0,Time,PT100 1 [ºC],PT100 2 [ºC],PT100 3 [ºC],PT100 4 [ºC],Temp. TH 1 [ºC],Umidade 1 [%],Vel. do Ar [m/s],Temp. TH 2 [ºC],Umidade 2 [%],Umidade Produto [%],Ensaio
0,0.0,24.213534,24.5,-244.98909,1326.005777,20.134013,73.388713,1.075032,24.182938,90.653755,55.223333,0
1,900.485,27.270283,27.310531,-244.98909,1326.005777,22.176537,65.309984,1.075032,27.512709,92.391174,48.006667,0
2,1799.039,27.268392,27.39782,-244.98909,1326.005777,24.304439,58.981789,1.075032,27.571817,93.011919,41.003333,0
3,2699.241,34.421674,29.924256,1326.058821,-244.98909,25.338835,54.499243,1.075032,27.302546,93.185991,7.083333,0
4,3600.524,45.204301,43.769892,-244.98909,1326.005777,24.544156,56.508994,1.075032,39.971468,26.113811,2.283333,0
5,0.0,24.213534,24.5,-244.98909,1326.005777,20.134013,73.388713,1.075032,24.182938,90.653755,61.353333,1
6,900.485,27.270283,27.310531,-244.98909,1326.005777,22.176537,65.309984,1.075032,27.512709,92.391174,51.775,1
7,1799.039,27.268392,27.39782,-244.98909,1326.005777,24.304439,58.981789,1.075032,27.571817,93.011919,35.256667,1
8,2699.241,34.421674,29.924256,1326.058821,-244.98909,25.338835,54.499243,1.075032,27.302546,93.185991,4.863333,1
9,3600.524,45.204301,43.769892,-244.98909,1326.005777,24.544156,56.508994,1.075032,39.971468,26.113811,2.13,1


In [16]:
data_path = '../data/collected_data'

diggestor = DataDiggestor()
display(diggestor.get_dfs_without_humidity(data_path))

Unnamed: 0,Time,PT100 1 [ºC],PT100 2 [ºC],Temp. TH 1 [ºC],Umidade 1 [%],Temp. TH 2 [ºC],Umidade 2 [%],Ensaio,PT100 3 [ºC],PT100 4 [ºC],Vel. do Ar [m/s]
1321,0.000,26.356301,26.398715,25.890513,49.829562,26.698327,68.761479,1,1326.058821,1326.005777,1.075032
1322,3.348,26.549094,26.663662,25.880661,49.803291,26.928193,67.759852,1,1326.058821,1326.005777,1.075032
1323,23.438,27.643860,28.115128,25.903648,49.724478,28.514267,62.804307,1,1326.058821,-244.989090,1.075032
1324,28.777,27.906699,28.460447,25.959473,49.691640,28.783538,61.579389,1,1326.058821,-244.989090,1.075032
1325,37.780,28.341818,29.015147,25.772296,49.625962,29.446866,59.773218,1,-244.989090,1326.005777,1.075032
...,...,...,...,...,...,...,...,...,...,...,...
10920,3589.997,45.175779,43.714931,24.550723,56.548401,39.968184,26.399501,9,1326.058821,1326.005777,1.075032
10921,3593.550,45.181483,43.728711,24.448926,56.476155,39.932062,26.120379,9,1326.058821,1326.005777,1.075032
10922,3597.008,45.192734,43.747875,24.547440,56.466303,40.027293,26.035000,9,-244.989090,-244.989090,1.075032
10923,3600.524,45.204301,43.769892,24.544156,56.508994,39.971468,26.113811,9,-244.989090,1326.005777,1.075032
