In [1]:
from functools import partial

import numpy as np
import pandas as pd
from matplotlib import colors
from matplotlib import pyplot as plt

In [2]:
# Load the dataset and keep only the rows whose ``stages_number`` equals 1.
# ``reset_index()`` is called without ``drop=True``, so the rows are
# renumbered and the previous index is kept as a regular column.
df = (
    pd.read_csv('data.csv')
    .loc[lambda frame: frame['stages_number'] == 1]
    .reset_index()
)

In [3]:
def split_conditions(conditions):
    """Split a raw conditions value into a list of trimmed condition strings.

    Both ``;`` and ``,`` are treated as separators.

    Parameters
    ----------
    conditions : str, list, float NaN or None
        A delimited string of conditions.  A missing value (``None`` or any
        float NaN, not only the ``np.nan`` singleton) yields an empty list.
        A list is accepted too, so calling the function on an already-split
        value is harmless.

    Returns
    -------
    list of str
        The individual conditions with surrounding whitespace removed.
        Empty fragments (e.g. from a trailing separator) are dropped.
    """
    # Test for NaN by value: an identity check against ``np.nan`` misses NaN
    # objects created elsewhere (``float('nan')``, ``np.float64('nan')``).
    if conditions is None or (
        isinstance(conditions, float) and np.isnan(conditions)
    ):
        return []
    if isinstance(conditions, str):
        conditions = conditions.replace(';', ',').split(',')
    stripped = (condition.strip() for condition in conditions)
    return [condition for condition in stripped if condition]

In [4]:
def parse_conditions(conditions, keywords):
    """Group condition strings by type using substring keywords.

    Parameters
    ----------
    conditions : iterable of str
        The individual condition strings to classify.
    keywords : dict
        Maps a condition type to the list of substrings that identify it.
        Types are tried in the dictionary's order and the first type with a
        matching substring wins.

    Returns
    -------
    dict values view
        One list of matching conditions per type, in the order of
        ``keywords``.

    Raises
    ------
    ValueError
        If a condition matches none of the keywords.  The message names the
        offending condition so it can be located in the data.
    """
    conditions_dict = {condition_type: [] for condition_type in keywords}
    for condition in conditions:
        for condition_type, type_keywords in keywords.items():
            if any(keyword in condition for keyword in type_keywords):
                conditions_dict[condition_type].append(condition)
                break
        else:
            # The inner loop finished without ``break``: nothing matched.
            raise ValueError(
                'Unhandled conditions: {0!r}'.format(condition)
            )
    return conditions_dict.values()

In [5]:
# Replace each raw conditions value with its list of individual conditions;
# the column is overwritten in place.
df['other_conditions_1'] = df['other_conditions_1'].map(split_conditions)

In [6]:
# Substrings used to assign each condition to a category.  Matching is a
# plain ``in`` test and categories are tried in the order listed here, so the
# first category with a matching substring claims the condition.
keywords = {
    'temperature_1': [' °C', ' K', 'rt', 'heated', 'cooled', 'reflux'],
    'pressure_1': ['Pa', ' atm', ' psi', 'bar', ' torr', ' kg/cm2', 'mm'],
    'time_1': [' d', ' h', ' min', ' s', 'overnight', ' ms', ' w', ' mo'],
    'pH_1': ['pH', 'acidified', 'neutralized', 'basified'],
}
# Rebind the name to a version with ``keywords`` pre-filled so it can be
# handed to ``Series.apply`` as a one-argument function.
parse_conditions = partial(parse_conditions, keywords=keywords)
# One result per row: the per-category lists, in the order of ``keywords``.
conditions = df['other_conditions_1'].apply(parse_conditions)
# ``zip(*conditions)`` transposes the per-row groups into one sequence per
# category, which is then stored under that category's column name.
for column_name, column in zip(keywords.keys(), zip(*conditions)):
    df[column_name] = column

In [7]:
def convert_value(value_string, conversion_mapping):
    """Convert a value string using the first unit found in it.

    Parameters
    ----------
    value_string : str
        Text holding a value and, possibly, a unit.
    conversion_mapping : dict
        Maps a unit substring to a callable.  Units are tried in the
        mapping's iteration order.

    Returns
    -------
    object
        The result of calling the matched unit's converter on the string
        with every occurrence of that unit removed, or ``value_string``
        unchanged when no unit is found in it.
    """
    matched_unit = next(
        (unit for unit in conversion_mapping if unit in value_string),
        None,
    )
    if matched_unit is None:
        return value_string
    converter = conversion_mapping[matched_unit]
    return converter(value_string.replace(matched_unit, ''))


def parse_time(time_list, conversion_mapping):
    """Convert a list of time strings with ``convert_value``.

    For every entry the ``'< '`` and ``'> '`` markers are removed; an entry
    containing ``' - '`` is split on single spaces and collapsed to one
    value by ``parse_range``.  The resulting string is then converted with
    ``conversion_mapping``.

    Parameters
    ----------
    time_list : iterable of str
        The time conditions of one record.
    conversion_mapping : dict
        Unit substring -> converter, passed through to ``convert_value``.

    Returns
    -------
    list
        One converted entry per input string, in the same order.
    """
    def collapse(raw_entry):
        # Strip the comparison markers, then reduce a range to one value.
        cleaned = raw_entry.replace('< ', '').replace('> ', '')
        if ' - ' in cleaned:
            cleaned = parse_range(cleaned.split(' '))
        return cleaned

    return [
        convert_value(collapse(entry), conversion_mapping)
        for entry in time_list
    ]


def parse_temperature(temperature_list, conversion_mapping):
    """Placeholder for temperature parsing; not implemented.

    Both arguments are accepted and ignored, and ``None`` is returned.
    """
    return None


def parse_pressure(pressure_list, conversion_mapping):
    """Convert a list of pressure strings with ``convert_value``.

    For every entry the ``'< '`` and ``'> '`` markers are removed; an entry
    containing ``' - '`` or ``' → '`` is split on single spaces and
    collapsed to one value by ``parse_range``.  The resulting string is
    then converted with ``conversion_mapping``.

    Parameters
    ----------
    pressure_list : iterable of str
        The pressure conditions of one record.
    conversion_mapping : dict
        Unit substring -> converter, passed through to ``convert_value``.

    Returns
    -------
    list
        One converted entry per input string, in the same order.
    """
    def collapse(raw_entry):
        # Strip the comparison markers, then reduce a range to one value.
        cleaned = raw_entry.replace('< ', '').replace('> ', '')
        for separator in (' - ', ' → '):
            if separator in cleaned:
                cleaned = parse_range(cleaned.split(' '))
        return cleaned

    return [
        convert_value(collapse(entry), conversion_mapping)
        for entry in pressure_list
    ]


def parse_ph(ph_list, conversion_mapping):
    """Convert a list of pH strings with ``convert_value``.

    For every entry the ``'< '`` and ``'> '`` markers are removed.  An entry
    containing ``' - '`` or ``' → '`` is split on single spaces and
    collapsed by ``parse_range`` with ``inverted=True`` (the label comes
    before the number), which yields value-first text such as ``'7.5 pH'``.

    A single reading written label-first (``'pH 7'``) is rewritten to the
    same value-first form.  Without this step only ranges end up in the
    shape that a conversion key with a leading space (``' pH'``) can match,
    and single readings come back as unconverted strings.  Only a strictly
    numeric remainder is reordered; any other text is passed on untouched.

    Parameters
    ----------
    ph_list : iterable of str
        The pH conditions of one record.
    conversion_mapping : dict
        Unit substring -> converter, passed through to ``convert_value``.

    Returns
    -------
    list
        One converted entry per input string, in the same order.
    """
    label = 'pH'
    converted_ph_list = []
    for ph_string in ph_list:
        for symbol in ('< ', '> '):
            ph_string = ph_string.replace(symbol, '')
        for delimiter in (' - ', ' → '):
            if delimiter in ph_string:
                ph_string = parse_range(ph_string.split(' '), inverted=True)
        if ph_string.startswith(label + ' '):
            reading = ph_string[len(label) + 1:]
            try:
                float(reading)
            except ValueError:
                # Not a plain number: leave the text exactly as it was.
                pass
            else:
                ph_string = '{0} {1}'.format(reading, label)
        converted_ph_list.append(
            convert_value(ph_string, conversion_mapping)
        )
    return converted_ph_list


def parse_range(value_range, inverted=False):
    """Collapse a tokenised value range into a single ``'<value> <unit>'``.

    Two layouts are accepted (the third/second token is the delimiter):

    * four tokens  - ``[low, delim, high, unit]``, or with ``inverted``
      ``[unit, low, delim, high]``;
    * five tokens  - ``[low, unit, delim, high, unit]``, or with
      ``inverted`` ``[unit, low, delim, unit, high]``.

    Parameters
    ----------
    value_range : sequence of str
        The range split into tokens.
    inverted : bool, optional
        ``True`` when the unit precedes the number.

    Returns
    -------
    str
        The mean of both endpoints followed by the unit.  When a five-token
        range carries two different units the endpoints cannot be averaged
        here, so the first endpoint is returned with its own unit.  The
        result is always value-first, whatever ``inverted`` is.

    Raises
    ------
    ValueError
        If the range has neither four nor five tokens, or an endpoint that
        has to be averaged is not a number.
    """
    if len(value_range) == 4:
        left, right, measure = 0, 2, 3
        if inverted:
            left, right, measure = 1, 3, 0
        avg_value = (float(value_range[left]) + float(value_range[right])) / 2
        return '{0} {1}'.format(avg_value, value_range[measure])

    if len(value_range) == 5:
        left, right, measure_left, measure_right = 0, 3, 1, 4
        if inverted:
            left, measure_left = measure_left, left
            right, measure_right = measure_right, right
        if value_range[measure_left] == value_range[measure_right]:
            avg_value = (
                float(value_range[left]) + float(value_range[right])
            ) / 2
            return '{0} {1}'.format(avg_value, value_range[measure_left])
        # Different units: fall back to the first endpoint.  Use the
        # (possibly swapped) indices so an inverted range is still emitted
        # value-first instead of echoing the label-first tokens.
        return '{0} {1}'.format(
            value_range[left], value_range[measure_left],
        )

    raise ValueError('Wrong range format')

            
def _scaled_by(factor):
    """Return a converter that parses a numeric string and multiplies it."""
    return lambda x: float(x) * factor


def _divided_by(*divisors):
    """Return a converter that parses a numeric string and divides it by
    each divisor in turn (kept as successive divisions on purpose)."""
    def convert(x):
        value = float(x)
        for divisor in divisors:
            value = value / divisor
        return value
    return convert


# Unit substring -> converter tables.  ``convert_value`` picks the first key
# (in the order written here) that occurs in the string, so the order of the
# entries is part of the behaviour.

# Times are expressed in hours (' h' is the identity).
time_conversion_mapping = {
    ' d': _scaled_by(24),
    ' h': float,
    ' min': _divided_by(60),
    ' s': _divided_by(60, 60),
    ' ms': _divided_by(60, 60, 1000),
    'overnight': lambda _: 12.0,
    # NOTE(review): these two return the list ``[0]`` instead of a number;
    # looks like a placeholder - confirm how downstream code treats it.
    ' mo': lambda _: [0],
    ' week': lambda _: [0],
}

# Temperatures are expressed in degrees Celsius (' °C' is the identity).
temperature_conversion_mapping = {
    ' °C': float,
    # NOTE(review): offset is 273, not 273.15 - confirm this is intended.
    ' K': lambda x: float(x) - 273,
}

# Pressures are expressed in MPa (' MPa' is the identity).
pressure_conversion_mapping = {
    ' atm': _scaled_by(0.10133),
    ' psig': _scaled_by(0.00689),
    ' psi': _scaled_by(0.00689),
    ' bar': _scaled_by(0.1),
    ' kbar': _scaled_by(100),
    ' kPa': _scaled_by(0.001),
    ' Pa': _scaled_by(0.000001),
    ' MPa': float,
    ' GPa': _scaled_by(1000),
    ' mbar': _scaled_by(0.0001),
    ' torr': _scaled_by(0.000133),
    ' hPa': _scaled_by(0.0001),
    ' kg/cm2': _scaled_by(0.0981),
    ' mm Hg': _scaled_by(0.000133),
    ' mm': _scaled_by(0.000133),
}

# pH readings are only parsed to float.
ph_conversion_mapping = {
    ' pH': float,
}

In [8]:
# Rebind each parser name to a version with its unit table pre-filled, so
# the parsers can be passed to ``Series.apply`` as one-argument functions.
# The original two-argument functions are no longer reachable under these
# names once this cell has run.
parse_time = partial(
    parse_time, conversion_mapping=time_conversion_mapping,
)
parse_pressure = partial(
    parse_pressure, conversion_mapping=pressure_conversion_mapping,
)
parse_ph = partial(
    parse_ph, conversion_mapping=ph_conversion_mapping,
)

In [9]:
# Convert the grouped condition strings of each row with the matching
# parser; the three columns are overwritten in place.
df['time_1'] = df['time_1'].map(parse_time)
df['pressure_1'] = df['pressure_1'].map(parse_pressure)
df['pH_1'] = df['pH_1'].map(parse_ph)