In [4]:
import pandas as pd
import re
import numpy as np
from collections import Counter

def detect_decimal_separator(number_string):
    """
    Guess a separator character for a single numeric string.

    Everything except digits, ',' and '.' is discarded first. The remaining
    text is split once on ',' and once on '.'. A split is "flagged" when at
    least one piece before the last is exactly three characters long and the
    last piece is not three characters long.

    NOTE(review): for an input such as '116,22' the result is '.', i.e. the
    character that does NOT appear in the value. parse_float() removes the
    character it is handed, so the two functions agree with each other, but
    the name "decimal separator" may not match what is returned - confirm
    the intended meaning before reusing this elsewhere.

    :param number_string: The string to inspect (must be a str).
    :return: '.' if only the comma split is flagged, ',' if only the period
             split is flagged, otherwise None.
    """
    stripped = re.sub(r'[^\d,\.]', '', number_string)

    def _is_flagged(pieces):
        # str.split always yields at least one piece, so the unpacking is safe.
        *leading, trailing = pieces
        return any(len(piece) == 3 for piece in leading) and len(trailing) != 3

    comma_flagged = _is_flagged(stripped.split(','))
    period_flagged = _is_flagged(stripped.split('.'))

    # Both flagged or neither flagged: the heuristic cannot decide.
    if comma_flagged == period_flagged:
        return None
    return '.' if comma_flagged else ','


def clean_value(value):
    """
    Strip every character that is not a digit, '.' or ',' from the value.

    Missing values (as reported by ``pd.isna``) are handed back unchanged.
    Note that the minus sign is removed too, so '-5' becomes '5'.

    :param value: The string value to be cleaned.
    :return: The cleaned string, or the original value when it is missing.
    """
    # Only non-missing entries go through the regex; it expects a string.
    return value if pd.isna(value) else re.sub(r'[^0-9.,]', '', value)

def parse_float(value, decimal_separator=None):
    """
    Parse a cleaned numeric string to a float.

    When ``decimal_separator`` is given, every occurrence of that character
    is removed from the string. Any comma that remains is then replaced by a
    period and the result is passed to ``float``.

    NOTE(review): despite its name, ``decimal_separator`` is stripped from
    the value rather than treated as the decimal mark. This matches what
    detect_decimal_separator() returns (e.g. '.' for '116,22'); confirm the
    naming before changing either side.

    :param value: The string value to be parsed.
    :param decimal_separator: Optional character to remove before parsing.
    :return: The parsed float value, or NaN if ``value`` is not a string or
             cannot be parsed.
    """
    # Non-string input (e.g. a real NaN in the column) has no .replace();
    # return NaN for it regardless of whether a separator was supplied.
    if not isinstance(value, str):
        return np.nan
    try:
        if decimal_separator:
            value = value.replace(decimal_separator, '')
        return float(value.replace(',', '.'))
    except (ValueError, TypeError):
        return np.nan
    
def find_most_frequent(lst):
    """
    Return the most frequent element of a list together with its count.

    :param lst: The list to be analyzed.
    :return: A tuple ``(element, count)``; ``(None, 0)`` for an empty list.
    """
    if lst:
        # most_common(1) yields exactly one (element, count) pair here.
        (winner, occurrences), = Counter(lst).most_common(1)
        return winner, occurrences
    return None, 0

def find_decimal_separator(df, columns_to_check=None):
    """
    Find the most common detected separator in the specified columns.

    Each non-missing value is passed to detect_decimal_separator(); the most
    frequent result per column is collected, and the most frequent of those
    per-column results is returned.

    Values for which detection is undetermined contribute a ``None`` vote,
    so ``None`` is returned whenever it is the most frequent outcome. A
    column without any non-missing value also contributes ``None``.

    :param df: Pandas DataFrame to be processed.
    :param columns_to_check: List of column names to be checked. Defaults to
                             no columns.
    :return: The most common separator found, or None.
    """
    # None sentinel instead of a shared mutable default list.
    if columns_to_check is None:
        columns_to_check = []

    separators = []
    for column in columns_to_check:
        # Detect a separator for every non-missing value in the column
        detected_separators = [
            detect_decimal_separator(value)
            for value in df[column]
            if not pd.isna(value)
        ]
        # Keep the column's most frequent detection result
        separator, _ = find_most_frequent(detected_separators)
        separators.append(separator)
    # Vote again across the per-column winners
    most_common_separator, _ = find_most_frequent(separators)
    return most_common_separator


def convert_to_float(df, columns_to_convert=None):
    """
    Convert specified object-dtype columns of the DataFrame to floats.

    Each selected column is cleaned with clean_value(), a separator is
    detected for it with find_decimal_separator(), and every unique cleaned
    value is parsed with parse_float(). Columns whose dtype is not "object"
    are left untouched and get no entry in the mappings.

    The DataFrame is modified in place; the same object is also returned.

    :param df: Pandas DataFrame to be processed.
    :param columns_to_convert: List of column names to be converted to float
                               type. Defaults to no columns.
    :return: A tuple with two elements:
             1. The DataFrame with the specified columns converted.
             2. Dictionary keyed by column name; each value maps the cleaned
                string values of that column to their parsed floats.
    """
    # None sentinel instead of a shared mutable default list.
    if columns_to_convert is None:
        columns_to_convert = []

    mappings = {}
    for column in columns_to_convert:
        if df[column].dtype != "object":
            continue
        # Clean values (written back to the caller's DataFrame)
        df[column] = df[column].apply(clean_value)
        # Parse each distinct cleaned value only once
        unique_values = df[column].unique()
        decimal_separator = find_decimal_separator(df, [column])
        value_to_float_map = {
            value: parse_float(value, decimal_separator)
            for value in unique_values
        }
        mappings[column] = value_to_float_map
        # Apply the mapping to the DataFrame
        df[column] = df[column].map(value_to_float_map)

    return df, mappings

# Example usage
if __name__ == "__main__":
    # Demo data: every cell is a string (including the literal 'NaN'), with
    # unit text mixed in and both '.' and ',' used inside the numbers.
    data = {
        'A': ['4', 'NaN', '4', 'NaN', 'NaN', 'NaN', 'NaN', '6', 'NaN'],
        'B': ['5 phòng', '3 phòng', '4 phòng', '6 phòng', '4 phòng', 'nhiều hơn 10 phòng', '3 phòng', '5 phòng', '4 phòng'],
        'C': ['46 m²', '37 m²', '40 m²', '51 m²', '36 m²', '46 m²', '52 m²', '32 m²', '75 m²'],
        'D': ['NaN', 'NaN', '10 m', '12.75 m', '9 m', '12.1 m', 'NaN', 'NaN', '12 m'],
        'E': ['NaN', 'NaN', '4 m', '4 m', '4 m', '3.8 m', '4.5 m', '6.8 m', '6.5 m'],
        'F': ['86,96 triệu/m²', '116,22 triệu/m²', '65 triệu/m²', '100 triệu/m²', '86,11 triệu/m²', '104,35 triệu/m²', '112,5 triệu/m²', '184,38 triệu/m²', '120 triệu/m²']
    }
    df = pd.DataFrame(data)
    print("Before conversion:", df, sep="\n")

    # Convert every column of the demo frame (keys of `data`, in order).
    df, mappings = convert_to_float(df, list(data))
    print("After conversion:", df, sep="\n")
    print("Mappings:", mappings, sep="\n")

Before conversion:
     A                   B      C        D      E                F
0    4             5 phòng  46 m²      NaN    NaN   86,96 triệu/m²
1  NaN             3 phòng  37 m²      NaN    NaN  116,22 triệu/m²
2    4             4 phòng  40 m²     10 m    4 m      65 triệu/m²
3  NaN             6 phòng  51 m²  12.75 m    4 m     100 triệu/m²
4  NaN             4 phòng  36 m²      9 m    4 m   86,11 triệu/m²
5  NaN  nhiều hơn 10 phòng  46 m²   12.1 m  3.8 m  104,35 triệu/m²
6  NaN             3 phòng  52 m²      NaN  4.5 m   112,5 triệu/m²
7    6             5 phòng  32 m²      NaN  6.8 m  184,38 triệu/m²
8  NaN             4 phòng  75 m²     12 m  6.5 m     120 triệu/m²
After conversion:
     A     B     C      D    E       F
0  4.0   5.0  46.0    NaN  NaN   86.96
1  NaN   3.0  37.0    NaN  NaN  116.22
2  4.0   4.0  40.0  10.00  4.0   65.00
3  NaN   6.0  51.0  12.75  4.0  100.00
4  NaN   4.0  36.0   9.00  4.0   86.11
5  NaN  10.0  46.0  12.10  3.8  104.35
6  NaN   3.0  52.0  