**Table of contents**<a id='toc0_'></a>    
- [Tabla de contenido](#toc1_)    
- [Dataset clean](#toc2_)    
  - [Generic functions](#toc2_1_)    
  - [load file](#toc2_2_)    
  - [remove all totals rows](#toc2_3_)    
  - [clean farm size](#toc2_4_)    
  - [Generic functions](#toc2_5_)    
- [Clean numeric data](#toc3_)    
  - [clean score lots in table](#toc3_1_)    
  - [clean Total Value](#toc3_2_)    
  - [clean weight in table](#toc3_3_)    
  - [clean Size 30Kg boxes](#toc3_4_)    
  - [clean high bid in table](#toc3_5_)    
  - [clean Total Comission](#toc3_6_)    
  - [clean Altitude](#toc3_7_)    
  - [clean growing area](#toc3_8_)    
  - [Clean Auction Lot Size](#toc3_9_)    
  - [Clean Auction Lot Size (Kg)](#toc3_10_)    
  - [complete weight Lb](#toc3_11_)    
  - [detect abnormal data](#toc3_12_)    
- [Clean non-numeric data](#toc4_)    
  - [Generic string functions](#toc4_1_)    
  - [clean country](#toc4_2_)    
  - [clean Rank](#toc4_3_)    
  - [Clean Farmer](#toc4_4_)    
  - [Clean Region](#toc4_5_)    
  - [Clean Variety](#toc4_6_)    
  - [Clean Processing](#toc4_7_)    
  - [Clean History](#toc4_8_)    
  - [Clean Company Name](#toc4_9_)    
  - [Clean COMPANY NAME](#toc4_10_)    
  - [Clean Aroma_Flavor](#toc4_11_)    
  - [Clean Acidity](#toc4_12_)    
  - [Clean Overall](#toc4_13_)    
  - [Clean Other](#toc4_14_)    
  - [Clean City](#toc4_15_)    
  - [Remove empty rows, by col](#toc4_16_)    
- [Convert to future value](#toc5_)    
  - [Generic functions](#toc5_1_)    
  - [future value converision](#toc5_2_)    
- [save all data](#toc6_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Tabla de contenido](#toc0_)

# <a id='toc2_'></a>[Dataset clean](#toc0_)

In [None]:
import re
import pandas as pd


## <a id='toc2_1_'></a>[Generic functions](#toc0_)

In [None]:
def verbosity(msj, is_verbosity):
    """Print ``msj`` only when ``is_verbosity`` is truthy.

    Args:
        msj: Message to print.
        is_verbosity: Flag that enables the output.
    """
    if not is_verbosity:
        return
    print(msj)

def search_str_in_df(df_link, df_col, str_):
    """Return ``(link, value)`` pairs for string cells that contain ``str_``.

    Args:
        df_link: Iterable of links, walked in lockstep with ``df_col``.
        df_col: Iterable of cell values; only exact ``str`` instances are tested.
        str_: Substring to look for.

    Returns:
        List of ``(link, value)`` tuples in input order.
    """
    matches = []
    for cell, link in zip(df_col, df_link):
        if type(cell) is str and str_ in cell:
            matches.append((link, cell))
    return matches

def load_df_from_csv(file_path):
    """Read a CSV file with ``pd.read_csv``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The loaded DataFrame, or ``False`` when the file does not exist
        (an error message is printed in that case).
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    return False
    
def get_ipc_older_than_value(year, df_ipc):
    """Return the ``'ipc'`` values of the rows whose ``'year'`` is >= ``year``.

    Args:
        year: Threshold year; anything that is not exactly a ``float`` is
            converted with ``float()``.
        df_ipc: DataFrame with ``'year'`` and ``'ipc'`` columns.

    Returns:
        The ``'ipc'`` Series restricted to the selected rows.
    """
    threshold = year if type(year) is float else float(year)
    selected_rows = df_ipc['year'] >= threshold
    return df_ipc[selected_rows]['ipc']

def future_value_convertion(present_value, array_interests):
    """Compound ``present_value`` through a sequence of percentage rates.

    Args:
        present_value: Starting amount.
        array_interests: Iterable of rates in percent, applied in order
            (each step multiplies by ``1 + rate / 100``).

    Returns:
        The compounded amount; ``present_value`` itself when there are no rates.
    """
    future_value = present_value
    for interest in array_interests:
        future_value = future_value * (1 + (interest / 100))
    return future_value


## <a id='toc2_2_'></a>[load file](#toc0_)

In [None]:

# Location of the raw auctions workbook, relative to the working directory.
file_path = '../data/Auctionsv2.2.xlsx'

try:
  # NOTE(review): `global` has no effect at module level; `df` is already a
  # module-level name in this cell.
  global df
  df = pd.read_excel(file_path)
except FileNotFoundError:
  print(f"Error: File not found at {file_path}")
except Exception as e:
  print(f"An error occurred: {e}")
# NOTE(review): when either handler runs, `df` is never assigned, so the
# following cells that read `df` will fail with NameError.


# load_df_from_csv returns False (not a DataFrame) when the CSV is missing.
df_ipc = load_df_from_csv('../data/inflacion_EEUU.csv')




In [None]:
# Display the first row of the loaded frame.
df.head(1)

In [None]:
# Display pandas' default summary statistics for the frame.
df.describe()

In [None]:
# Count the rows per 'Year' value and list the counts ordered by year.
df['Year'].value_counts().sort_index()

In [None]:
# Count the rows per raw 'Country' value.
df['Country'].value_counts()

In [None]:

# Show the 20 most frequent raw 'Farm_Size' values.
df['Farm_Size'].value_counts().head(20)

In [None]:
# Remove every literal "ha" substring (case-insensitive) from the 'Farm_Size' column.
# Non-string cells become NaN through the .str accessor.
df['Farm_Size'] = df['Farm_Size'].str.replace('ha', '', case=False, regex=False)

In [None]:
# df[df['Farm_Size'].str.contains("hectares")]

In [None]:
# Remove the phrase 'hectares farm in production' (case-insensitive).
# NOTE(review): `regex` is not passed, so literal-vs-regex matching follows the
# installed pandas default; the phrase has no regex metacharacters, so both
# modes give the same result.
df['Farm_Size'] = df['Farm_Size'].str.replace('hectares farm in production', '', case=False)

In [None]:
# Regex removal (case-insensitive): an optional '-' followed by "hec" and
# everything after it on the same line.
df['Farm_Size'] = df['Farm_Size'].str.replace('-?hec.*', '', case=False, regex=True)

In [None]:
# Inspect the rows labelled 4678 and 4675.
df.loc[[4678, 4675]]

## <a id='toc2_3_'></a>[remove all totals rows](#toc0_)

In [None]:

def remove_rows_if_condition(condition, index_):
    """Drop one row from the global ``df`` when ``condition`` is truthy.

    Args:
        condition: Whether the row must be removed.
        index_: Index label of the row to drop (in place).
    """
    global df
    if not condition:
        return
    print(f"drop {index_} row")
    df.drop(index=index_, inplace=True)

# Drop the rows whose 'Rank' text contains one of the summary markers.
for value_, index_ in zip(df['Rank'], df.index):
    rank_text = str(value_)
    is_summary_row = any(marker in rank_text for marker in ("totals:", "total", "stat"))
    remove_rows_if_condition(is_summary_row, index_)
df.reset_index(drop=True, inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)


# Drop the rows that have no 'Rank' value at all, then renumber again.
for value_, index_ in zip(df['Rank'], df.index):
    remove_rows_if_condition(pd.isna(value_), index_)
df.reset_index(drop=True, inplace=True)




## <a id='toc2_4_'></a>[clean farm size](#toc0_)

In [None]:
# Switch for the debug messages that clean_farm_size sends through verbosity().
is_verbosity = False

# Function to clean and convert farm size
def _scale_decimal_comma_size(size, unit, factor):
    """Strip ``unit``, read ',' as the decimal mark and multiply by ``factor``.

    Returns the scaled float, or the stripped text when it is not numeric.
    """
    size = size.replace(unit, "").replace(",", ".")
    verbosity(f"size {unit} = {size}", is_verbosity)
    try:
        hectares = float(size) * factor
    except ValueError as e:
        # Report the caught exception itself, not the ValueError class.
        verbosity(f"Error: {e} in size: {size}", is_verbosity)
        return size  # Return the cleaned string if conversion fails.
    verbosity(f"Successfull changed size to ha = {hectares}", is_verbosity)
    return hectares


def clean_farm_size(link, size, index_):
    """Clean one raw ``Farm_Size`` cell and convert known units to hectares.

    Handled units and the factors applied by the code: m2 / m² /
    "metros cuadrados" (divided by 10000), acres (x 0.404686), patok (x 0.1)
    and manzanas / mzns / mz (x 0.698896, with "y media" adding half a unit).

    Args:
        link: Farm URL, only used in the diagnostic message.
        size: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        * the input unchanged when it is missing, ``None`` for ``"-"`` or ``"."``;
        * a ``float`` in hectares when a unit was recognised and parsed;
        * otherwise a ``str``: numeric text is returned as text (not converted),
          with a decimal comma rewritten as a dot when the text is not
          directly parseable.
    """
    if size == "-":
        size = None

    if pd.isna(size):
        return size  # Missing values are returned as they are.

    size = str(size)
    if size == '.':
        return None

    size = size.replace('()', '').replace(' .', '')

    if '/' in size:
        # Keep only the part after the first slash.
        size = size.split('/')[1]

    if "m2" in size or "m²" in size or "metros cuadrados" in size:
        raw_amount = size.replace("m2", "").replace("m²", "").replace("metros cuadrados", "")
        size = raw_amount.replace(",", ".")
        verbosity(f"size m2 = {size}", is_verbosity)
        try:
            if float(size) < 100:
                # A result below 100 is re-read with the comma dropped
                # instead of treated as a decimal mark.
                size = raw_amount.replace(",", "")
            hectares = float(size) / 10000
        except ValueError as e:
            verbosity(f"Error: {e} in size: {size}", is_verbosity)
            return size  # Return the cleaned string if conversion fails.
        verbosity(f"Successfull changed size to ha = {hectares}", is_verbosity)
        return hectares

    if "acres" in size:
        return _scale_decimal_comma_size(size, "acres", 0.404686)

    if "patok" in size:
        return _scale_decimal_comma_size(size, "patok", 0.1)

    if "MANZANAS" in size or "mzns" in size or "manzanas" in size or "mz" in size:
        size = (
            size.replace("manzanas", "")
            .replace("mzns", "")
            .replace("MANZANAS", "")
            .replace(",", "")
            .replace("mz", "")
        )
        verbosity(f"size mzn = {size}", is_verbosity)
        try:
            hectares = float(size) * 0.698896
        except ValueError as e:
            if "y media" not in size:
                verbosity(f"Error: {e} in size: {size}", is_verbosity)
                return size  # Return the cleaned string if conversion fails.
            # "<n> y media" means n and a half.
            size = size.replace("y media", "")
            try:
                hectares = (float(size) + 0.5) * 0.698896
            except ValueError as half_error:
                verbosity(f"Error: {half_error} in size: {size}", is_verbosity)
                return size
        verbosity(f"Successfull changed size to ha = {hectares}", is_verbosity)
        return hectares

    try:
        float(size)
    except ValueError as e:
        if "," in size:
            # Not parseable as written: rewrite the comma as a decimal point.
            return size.replace(",", ".")
        print(f"Second Error: {e} in size: {size}, index: {index_}, link {link}")
        return size

    return size

# [(link, value_) for value_, link in zip(df_col, df_link) if not(pd.isna(value_)) and type(float(value_))==float]
# Apply the function to the 'Farm_Size' column
# The cleaned values are wrapped in a single-column DataFrame and stored as 'Farm_size_he'.

df['Farm_size_he'] = pd.DataFrame( [clean_farm_size(link, value_, index_) for value_, link, index_ in zip(df['Farm_Size'], df['Url_farm'], df.index)])
# df['farm_size_he'] = df['Farm_Size'].apply(clean_farm_size, link=df['Url_farm'])




In [None]:
# Summary statistics for every column, not only the numeric ones.
df.describe(include='all')

In [None]:
# Display the first rows of the frame (pandas' default row count).
df.head()

In [None]:
def val_float_in_df(df_link, df_col):
    """Return ``(link, value)`` pairs for the cells that ``float()`` can parse.

    Missing cells and cells that cannot be converted to ``float`` are skipped.
    The previous test ``type(float(value_)) == float`` was always true and
    raised ``ValueError`` on the first non-numeric cell instead of filtering.

    Args:
        df_link: Iterable of links, walked in lockstep with ``df_col``.
        df_col: Iterable of cell values.

    Returns:
        List of ``(link, value)`` tuples, values left unconverted.
    """
    numeric_cells = []
    for value_, link in zip(df_col, df_link):
        if pd.isna(value_):
            continue
        try:
            float(value_)
        except (TypeError, ValueError):
            continue
        numeric_cells.append((link, value_))
    return numeric_cells
# List the (link, value) pairs whose cleaned farm size is still a string containing "m".
search_str_in_df(df['Url_farm'], df['Farm_size_he'], "m")
# val_float_in_df(df['Url_farm'], df['Farm_size_he'])

In [None]:
# Write the current state of the frame to an intermediate workbook.
df.to_excel('../data/AuctionsV4.xlsx')

In [None]:
# Review the case of row 4301
df.loc[[4289, 4298,4301]]

## <a id='toc2_5_'></a>[Generic functions](#toc0_)

In [None]:

def is_float_data_detection(link, value_, index_, verbosity):
    """Tell whether ``value_`` is missing or parseable as a float.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Cell value; it is stringified before the ``float()`` attempt.
        index_: Row label, only used in the diagnostic message.
        verbosity: When truthy, print a message for non-float values.

    Returns:
        ``True`` for missing values and float-parseable values, else ``False``.
    """
    if pd.isna(value_):
        return True

    text = str(value_)
    try:
        float(text)
    except ValueError:
        # float() on a str only fails with ValueError; a bare except would
        # also swallow KeyboardInterrupt and hide unrelated errors.
        if verbosity:
            print(f"detected non float data {text} in row {index_}; more information in {link}")
        return False
    return True

def non_float_data_detection(col_):
    """Print a diagnostic for every non-float cell of ``df[col_]``.

    Args:
        col_: Name of the column of the global ``df`` to scan.
    """
    for value_, link, index_ in zip(df[col_], df['Url_farm'], df.index):
        is_float_data_detection(link, value_, index_, True)

def str_2_float(value_):
    """Try to convert ``value_`` to ``float``.

    Args:
        value_: Value to convert.

    Returns:
        ``(True, value_)`` unchanged for missing values, ``(True, float)`` on
        success, and ``(False, value_)`` after printing the error when
        ``float()`` raises ``ValueError``.
    """
    if pd.isna(value_):
        return True, value_
    try:
        converted = float(value_)
    except ValueError as e:
        print(f"error, parsing value {value_} Error: {e}")
        return False, value_
    return True, converted


def apply_function_to_data(col_, function_):
    """Run ``function_(link, value, index)`` over one column of the global ``df``.

    Args:
        col_: Name of the column whose values are processed.
        function_: Callable taking the row's ``'Url_farm'`` link, the cell
            value and the row label.

    Returns:
        A DataFrame built from the list of results, one row per input row.
    """
    global df
    results = []
    for value_, link, index_ in zip(df[col_], df['Url_farm'], df.index):
        results.append(function_(link, value_, index_))
    return pd.DataFrame(results)

def count_number_of_digits(value_):
    """Count the characters of the integer part of a number.

    The previous version subtracted a fixed 3 characters whenever a '.' was
    present, which is only right for exactly two decimals (``1000.0`` was
    counted as 3 digits). The part before the first '.' is measured instead,
    ignoring a leading sign.

    Args:
        value_: A number or its text form.

    Returns:
        Length of the text before the decimal point, without any sign.
    """
    text = value_ if isinstance(value_, str) else str(value_)
    integer_part = text.strip().partition('.')[0].lstrip('+-')
    return len(integer_part)
    
def replace_value(value_, str_replace, replace_for):
    """Replace every ``str_replace`` in ``value_`` with ``replace_for``.

    Args:
        value_: Text to process.
        str_replace: Substring to look for.
        replace_for: Replacement text.

    Returns:
        The updated text, or ``value_`` itself when the substring is absent.
    """
    return value_.replace(str_replace, replace_for) if str_replace in value_ else value_

def normalize_number(number_string):
    """Normalise thousands separators and decimal commas in a number string.

    A '.' or ',' between digits is treated as a thousands separator only when
    it is followed by exactly three digits. The previous pattern accepted
    "three or more", so a decimal such as ``12.3456`` lost its point and
    became ``123456``.

    Args:
        number_string: Text form of the number.

    Returns:
        The text without thousands separators and with '.' as decimal mark.
    """
    # First drop the commas or dots used as thousands separators.
    number_string = re.sub(r'(?<=\d)[.,](?=\d{3}(?!\d))', '', number_string)
    # Then turn decimal commas into decimal points.
    return number_string.replace(',', '.')

def detect_abnormal_commas(value_, is_float):
    """Normalise separators in ``value_`` and try to parse it as a float.

    Two corrections over the previous version:

    * values that are already numeric are returned as floats without being
      re-stringified (``1222.248`` used to become ``1222248``);
    * a '.' or ',' counts as a thousands separator only when it is followed
      by exactly three digits, so decimals with four or more digits keep
      their point.

    Args:
        value_: Cell value.
        is_float: Ignored; kept for backward compatibility with callers.

    Returns:
        ``(is_float, value)`` as produced by ``str_2_float``.
    """
    if isinstance(value_, (int, float)) and not isinstance(value_, bool):
        return True, float(value_)

    text = str(value_)
    text = re.sub(r'(?<=\d)[.,](?=\d{3}(?!\d))', '', text)
    # Turn decimal commas into decimal points.
    text = text.replace(',', '.')

    return str_2_float(text)

def detect_range(value_):
    """Tell whether ``value_`` looks like a numeric range.

    A range is 1-4 digits, later an en dash, the letter ``a`` or a hyphen,
    and later another 1-4 digits.

    Args:
        value_: Text to inspect.

    Returns:
        ``True`` when the pattern is found, else ``False``.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.search(r'\d{1,4}.*?[–a-].*?\d{1,4}', value_) is not None
    
    
def get_average_from_range(value_):
    """Average every number found in a range such as ``"1,200 - 1,500"``.

    Corrections over the previous version:

    * the '.' thousands pattern is escaped (it used to match any character,
      e.g. the hyphen in ``200-150``);
    * numbers are read whole, decimals included, instead of in chunks of at
      most four digits (``2.5-100`` now averages 2.5 and 100);
    * text without any number returns ``(False, value_)`` instead of raising
      ``ZeroDivisionError``;
    * the built-in ``sum`` is no longer shadowed.

    Args:
        value_: Text holding the range; when it contains ``"feet"`` the
            average is multiplied by 0.3048.

    Returns:
        ``(True, average)`` or ``(False, value_)`` when no number is present.
    """
    is_feet = 'feet' in value_

    # Drop ',' / '.' only when they group exactly three digits (thousands).
    if re.search(r'\d{1,3},\d{3}(?!\d)', value_):
        value_ = value_.replace(',', '')
    if re.search(r'\d{1,3}\.\d{3}(?!\d)', value_):
        value_ = value_.replace('.', '')

    numbers = [float(number) for number in re.findall(r'\d+(?:\.\d+)?', value_)]
    if not numbers:
        return False, value_

    average = sum(numbers) / len(numbers)
    if is_feet:
        average = average * 0.3048
    return True, average

def units_conversion(value_, conversion_to_multiplly, unit):
    """Strip ``unit`` from ``value_`` and multiply the number by a factor.

    Corrections over the previous version: a failed conversion now reports
    ``False`` (it used to report ``True`` with an unparsed string), numeric
    input is accepted instead of failing on ``.replace``, and the unused
    local copy of the input is gone.

    Args:
        value_: Text such as ``"4,500 feet"``, or an already numeric value.
        conversion_to_multiplly: Factor applied to the parsed number.
        unit: Unit text removed before parsing.

    Returns:
        ``(True, converted_float)`` on success, ``(False, unparsed_value)``
        otherwise (a message is printed).
    """
    if isinstance(value_, (int, float)) and not isinstance(value_, bool):
        return True, value_ * conversion_to_multiplly

    text = str(value_).replace(unit, '')
    _, parsed = detect_abnormal_commas(text, False)

    try:
        converted = float(parsed) * conversion_to_multiplly
    except (TypeError, ValueError):
        print(f"No se pudo convertir {parsed} a {unit} exitosamente")
        return False, parsed
    return True, converted

In [None]:
import re

def normalize_number(number_string):
    """Parse a number written with thousands separators or a decimal comma.

    A '.' or ',' between digits is treated as a thousands separator only when
    it is followed by exactly three digits. The previous pattern accepted
    "three or more", so ``12.3456`` was read as ``123456``.

    Args:
        number_string: Text form of the number.

    Returns:
        The parsed ``float``.

    Raises:
        ValueError: When the normalised text is not a valid float.
    """
    # Drop the commas or dots used as thousands separators.
    number_string = re.sub(r'(?<=\d)[.,](?=\d{3}(?!\d))', '', number_string)
    # Turn decimal commas into decimal points.
    number_string = number_string.replace(',', '.')

    return float(number_string)

# Usage example
values = ["1,000.20", "1000,20", "1.000,20", "1000,", "1000."]
normalized_values = [normalize_number(value) for value in values]
print(normalized_values)  # [1000.2, 1000.2, 1000.2, 1000.0, 1000.0]

# <a id='toc3_'></a>[Clean numeric data](#toc0_)

## <a id='toc3_1_'></a>[clean score lots in table](#toc0_)

In [None]:
def clean_score_lots(link, value_, index_):
    """Convert one raw ``Score`` cell to ``float``.

    A comma is read as the decimal mark. Values that still cannot be parsed
    are reported and returned as text.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        The score as ``float``, or the unparsed text.
    """
    candidate = str(value_)
    if ',' in candidate:
        _, candidate = str_2_float(candidate.replace(',', '.'))

    if not is_float_data_detection(link, candidate, index_, False):
        print(f"detected not float data {candidate} in row {index_}; more information in {link}")
        return candidate
    return float(candidate)


df['Score_v2'] = apply_function_to_data('Score', clean_score_lots)
# non_float_data_detection('Score_v2')

## <a id='toc3_2_'></a>[clean Total Value](#toc0_)

In [None]:
def clean_total_value(link, value_, index_):
    """Convert one raw ``Total_Value`` cell to ``float``.

    Currency and unit decorations are stripped first. Text that is not
    directly numeric and contains a hyphen or en dash yields ``None``;
    anything else goes through ``detect_abnormal_commas``.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        The value as ``float``, ``None`` for dash entries, or the unparsed text.
    """
    text = str(value_)
    for decoration in ('$', '/lb', '/Ib', 'US', '.              '):
        text = replace_value(text, decoration, '')

    if is_float_data_detection(link, text, index_, False):
        return float(text)

    if '-' in text or '–' in text:
        return None

    parsed_ok, parsed = detect_abnormal_commas(text, False)
    if not parsed_ok:
        print(f"detected not float data {parsed} in row {index_}; more information in {link}")
        return parsed

    # TODO: divide high_bid / weight lb when the number has more than 3 digits
    return float(parsed)

df['Total_Value_v2'] = apply_function_to_data('Total_Value', clean_total_value)

    

## <a id='toc3_3_'></a>[clean weight in table](#toc0_)

In [None]:
def clean_weight(link, value_, index_):
    """Return the lot weight in pounds for one row.

    Source priority: the ``Weigth_(Lb)`` cell itself; otherwise the
    ``Weigth_(Kg)`` cell when it is actually written in ``lbs``; otherwise
    the ``Weigth_(Kg)`` cell converted with the factor 2.20462.

    The previous version tested the truthiness of ``str(value_)``, which is
    never empty, so both kilogram fallbacks were unreachable and a missing
    pound value was never completed. Its second ``if is_float`` block (the
    "abnormal number" message) could not run either and is dropped.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw ``Weigth_(Lb)`` cell.
        index_: Row label, used to read ``Weigth_(Kg)`` from the global ``df``.

    Returns:
        The weight as ``float``; ``None`` when both cells are missing or the
        unparsed text contains '-'; otherwise the unparsed text.
    """
    raw_kg = df.loc[index_]['Weigth_(Kg)']
    lb_text = '' if pd.isna(value_) else str(value_)
    kg_text = '' if pd.isna(raw_kg) else str(raw_kg)

    if lb_text.strip():
        is_float, parsed = detect_abnormal_commas(lb_text.replace('lbs', ''), False)
    elif 'lbs' in kg_text:
        # The kilogram column sometimes holds a value already in pounds.
        is_float, parsed = detect_abnormal_commas(kg_text.replace('lbs', ''), False)
    elif kg_text.strip():
        is_float, parsed = units_conversion(kg_text, 2.20462, 'kg')
    else:
        return None

    # Re-check: a helper may report success while returning unparsed text.
    if is_float and is_float_data_detection(link, parsed, index_, False):
        return float(parsed)

    text = str(parsed)
    if '-' in text:
        return None

    print(f"detected not float data {text} in row {index_}; more information in {link}")
    return text

df['Weigth_(Lb)_v2'] = apply_function_to_data('Weigth_(Lb)', clean_weight)
# non_float_data_detection('Score_v2')

## <a id='toc3_4_'></a>[clean Size 30Kg boxes](#toc0_)

In [None]:
def clean_size_30kg_boxes(link, value_, index_):
    """Convert one raw ``Size_30Kg_boxes`` cell to ``float``.

    Text that is not directly numeric and contains '-' yields ``None``.
    Other text goes through ``detect_abnormal_commas``; a parsed number for
    which ``count_number_of_digits`` exceeds 2 is reported as abnormal.

    Args:
        link: Farm URL, only used in the diagnostic messages.
        value_: Raw cell value.
        index_: Row label, used in messages and to read the row from ``df``.

    Returns:
        The value as ``float``, ``None`` for dash entries, or the unparsed text.
    """
    global df
    original_text = str(value_)

    if is_float_data_detection(link, original_text, index_, False):
        return float(original_text)

    if '-' in original_text:
        return None

    parsed_ok, parsed = detect_abnormal_commas(original_text, False)
    if not parsed_ok:
        print(f"detected not float data {parsed} in row {index_}; more information in {link}")
        return parsed

    if count_number_of_digits(parsed) > 2:
        # TODO: divide high_bid / weight lb
        row = df.loc[index_]
        print(f"SE RECOMIENDA ELIMINAR ESTAS FILAS AL FINAL detected abnormal number: Size 30Kg boxes: {row['Size_30Kg_boxes']}; weight kg: {row['Weigth_(Kg)']};initial value: {original_text}; processed value {parsed} in row {index_}; more information in {link}")

    return float(parsed)

df['Size_30Kg_boxes_v2'] = apply_function_to_data('Size_30Kg_boxes', clean_size_30kg_boxes)
# non_float_data_detection('Score_v2')



## <a id='toc3_5_'></a>[clean high bid in table](#toc0_)

In [None]:
def clean_high_bid(link, value_, index_):
    """Convert one raw ``High_Bid`` cell to ``float``.

    Currency and unit decorations are stripped first. Text that is not
    directly numeric and contains '-' yields ``None``; anything else goes
    through ``detect_abnormal_commas``.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        The bid as ``float``, ``None`` for dash entries, or the unparsed text.
    """
    text = str(value_)
    for decoration in ('$', '/lb', '/Ib', 'US'):
        text = replace_value(text, decoration, '')

    if is_float_data_detection(link, text, index_, False):
        return float(text)

    if '-' in text:
        return None

    parsed_ok, parsed = detect_abnormal_commas(text, False)
    if not parsed_ok:
        print(f"detected not float data {parsed} in row {index_}; more information in {link}")
        return parsed

    # TODO: divide high_bid / weight lb when the number has more than 3 digits
    return float(parsed)

df['High_Bid_v2'] = apply_function_to_data('High_Bid', clean_high_bid)
# non_float_data_detection('Score_v2')

## <a id='toc3_6_'></a>[clean Total Comission](#toc0_)

In [None]:
def clean_total_comission(link, value_, index_):
    """Convert one raw ``Total_Comission`` cell to ``float``.

    Missing values and a lone en dash yield ``None``. Parentheses and '$' are
    stripped before parsing; a parsed number for which
    ``count_number_of_digits`` exceeds 5 is reported as abnormal.

    Args:
        link: Farm URL, only used in the diagnostic messages.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic messages.

    Returns:
        The commission as ``float``, ``None``, or the unparsed text.
    """
    if pd.isna(value_) or value_ == '–':
        return None

    original_text = str(value_)
    text = original_text
    for decoration in ('(', '$', ')'):
        text = replace_value(text, decoration, '')

    if is_float_data_detection(link, text, index_, False):
        return float(text)

    parsed_ok, parsed = detect_abnormal_commas(text, False)
    if not parsed_ok:
        print(f"detected not float data ({parsed}) in row {index_}; more information in {link}")
        return parsed

    if count_number_of_digits(parsed) > 5:
        print(f"detected abnormal number: initial value: {original_text}; processed value {parsed} in row {index_}; more information in {link}")

    return float(parsed)

df['Total_Comission_v2'] = apply_function_to_data('Total_Comission', clean_total_comission)
# non_float_data_detection('Score_v2')

## <a id='toc3_7_'></a>[clean Altitude](#toc0_)

In [None]:
# Ordered (text, replacement) pairs applied to the raw altitude text.
# The order matters: longer spellings come before the bare 'm.' and 'm'.
_ALTITUDE_REPLACEMENTS = (
    ('m,m.a.s.l.', ''),
    ('caíAverage  altitude:  ', ''),
    ('METERS ABOVE SEA LEVEL', ''),
    ('m.a.s.l.', ''),
    ('a los', '-'),
    ('m.a.s.l', ''),
    ('M.A.S.L', ''),
    ('m. o. s. l.', ''),
    ('masl / ', '-'),
    ('masl', ''),
    ('Masl', ''),
    ('m,a.s.l.', ''),
    ('m. a. s. l.', ''),
    ('MASL', ''),
    ('mts', ''),
    ('meters', ''),
    ('Msnm', ''),
    ('msnm.', ''),
    ('msnm', ''),
    ('MSNM', ''),
    ('m.s.l.m.', ''),
    ('m.a.s.s.', ''),
    ('Max. altitude:', ''),
    ('m.', ''),
    ('m', ''),
    ('above sea level', ''),
    ('Min. altitude:', '-'),
    (' to ', '-'),
    ('caíAverage altitude:', ''),
)

# Unit spellings that trigger the 0.3048 conversion, tried in this order.
_ALTITUDE_FEET_UNITS = ('Feet', 'feet', 'Ft.', 'ft.', 'ft', 'FAMSL', 'fasl')


def clean_altitude(link, value_, index_):
    """Convert one raw ``Altitude`` cell to a float.

    Steps: strip the known unit spellings, average ranges, multiply values
    carrying a feet unit by 0.3048, then parse. Parsed values of 4000 or more
    are multiplied by 0.3048 as well; other values below 500 or from 2500 up
    are only reported.

    Correction over the previous version: a value that is already a float
    (range average or unit conversion) is no longer passed through
    ``detect_abnormal_commas`` again. Re-stringifying it let the
    thousands-separator regex delete the decimal point of results such as
    ``1222.248``. The duplicated, unreachable ``if is_float`` branch is gone.

    Args:
        link: Farm URL, only used in the diagnostic messages.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic messages.

    Returns:
        The altitude as ``float``, ``None`` for missing or dash entries, or
        the unparsed text.
    """
    if pd.isna(value_) or value_ == '–' or value_ == '-' or value_ == '-1':
        return None

    value_ = str(value_)
    aux_value = value_
    for old_text, new_text in _ALTITUDE_REPLACEMENTS:
        value_ = value_.replace(old_text, new_text)

    is_float = is_float_data_detection(link, value_, index_, False)

    if detect_range(value_):
        is_float, value_ = get_average_from_range(value_)

    # Area units in an altitude cell: strip them and report the row.
    for area_unit in ('hectares', 'ha'):
        if not is_float and isinstance(value_, str) and area_unit in value_:
            value_ = value_.replace(area_unit, '')
            print(f"detected abnormal unit: initial value: {aux_value}; processed value {value_} in row {index_}; more information in {link}")

    for feet_unit in _ALTITUDE_FEET_UNITS:
        if not is_float and isinstance(value_, str) and feet_unit in value_:
            is_float, value_ = units_conversion(value_, 0.3048, feet_unit)

    # Only text still needs separator normalisation; floats are final.
    if isinstance(value_, str):
        is_float, value_ = detect_abnormal_commas(value_, is_float)
    is_float = is_float_data_detection(link, value_, index_, True)

    if not is_float:
        print(f"detected not float data {aux_value} processed value: {value_} in row {index_}; more information in {link}")
        return value_

    value_ = float(value_)
    if value_ < 500 or value_ >= 2500:
        if value_ >= 4000:
            # NOTE(review): values this large are presumably in feet - confirm.
            value_ = value_ * 0.3048
        else:
            print(f"detected abnormal number: initial value: {aux_value}; processed value {value_} in row {index_}; more information in {link}")

    return float(value_)

df['Altitude_v2(masl)'] = apply_function_to_data('Altitude', clean_altitude)
# non_float_data_detection('Score_v2')

## <a id='toc3_8_'></a>[clean growing area](#toc0_)

In [None]:
def clean_growing_area(link, value_, index_):
    """Convert one raw ``Coffee_Growing_Area`` cell to ``float``.

    Hectare spellings are stripped, ranges are averaged, and values carrying
    a manzana unit are multiplied by 0.698896.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        The area as ``float``, or the unparsed text.
    """
    current = str(value_)
    for hectare_spelling in ('ha', 'Hectares', 'hectares', 'hectare', 'hectars', 'hectar'):
        current = replace_value(current, hectare_spelling, '')

    if detect_range(current):
        _, current = get_average_from_range(current)

    is_float, current = str_2_float(current)
    for manzana_unit in ('mzns', 'mzn', 'MANZANAS'):
        if not is_float and manzana_unit in current:
            is_float, current = units_conversion(current, 0.698896, manzana_unit)

    is_float, current = str_2_float(current)
    if not is_float:
        print(f"detected not float data {current} in row {index_}; more information in {link}")
        return current
    return float(current)


df['Coffee_Growing_Area_v2'] = apply_function_to_data('Coffee_Growing_Area', clean_growing_area)
# non_float_data_detection('Score_v2')

## <a id='toc3_9_'></a>[Clean Auction Lot Size](#toc0_)

In [None]:
def clean_auction_lot_size(link, value_, index_):
    """Convert one raw ``Auction_Lot_Size`` cell to ``float``.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        The lot size as ``float``, or the unparsed text.
    """
    text = str(value_)
    for note in ('(split between both presidentials)', '(split between both presidential lots)'):
        text = replace_value(text, note, '')

    _, candidate = detect_abnormal_commas(text, False)
    parsed_ok, candidate = str_2_float(candidate)
    if not parsed_ok:
        print(f"detected not float data {candidate} in row {index_}; more information in {link}")
        return candidate
    return float(candidate)
df['Auction_Lot_Size_v2'] = apply_function_to_data('Auction_Lot_Size', clean_auction_lot_size)


## <a id='toc3_10_'></a>[Clean Auction Lot Size (Kg)](#toc0_)

In [None]:
def clean_auction_lot_size_kg(link, value_, index_):
    """Convert one raw ``Auction_Lot_Size(Kg)`` cell to ``float``.

    Args:
        link: Farm URL, only used in the diagnostic message.
        value_: Raw cell value.
        index_: Row label, only used in the diagnostic message.

    Returns:
        The lot size as ``float``, or ``None`` when it cannot be parsed.
    """
    text = str(value_)
    for note in ('(split between both presidentials)', '(split between both presidential lots)'):
        text = replace_value(text, note, '')

    _, candidate = detect_abnormal_commas(text, False)
    parsed_ok, candidate = str_2_float(candidate)
    if not parsed_ok:
        print(f"detected not float data {candidate} in row {index_}; more information in {link}")
        return None
    return float(candidate)
df['Auction_Lot_Size(Kg)_v2'] = apply_function_to_data('Auction_Lot_Size(Kg)', clean_auction_lot_size_kg)


## <a id='toc3_11_'></a>[complete weight Lb](#toc0_)

In [None]:
def complete_weight_lb(link, value_size_30kg_boxes, index_):
    """Fill a missing ``Weigth_(Lb)_v2`` from the other size columns.

    Sources are tried in this order: ``Auction_Lot_Size_v2`` (used as is),
    ``Auction_Lot_Size(Kg)_v2`` (x 2.20462), then the number of 30 kg boxes
    (x 132.277 / 2). A weight that is already present is returned untouched.

    Args:
        link: Farm URL, only used in the progress messages.
        value_size_30kg_boxes: Cleaned ``Size_30Kg_boxes_v2`` cell.
        index_: Row label, used to read the other columns from ``df``.

    Returns:
        The existing or completed weight; still missing when no source is usable.
    """
    global df
    row = df.loc[index_]
    weight_lb = row['Weigth_(Lb)_v2']
    lot_size = row['Auction_Lot_Size_v2']
    lot_size_kg = row['Auction_Lot_Size(Kg)_v2']

    def is_usable(candidate):
        # Numeric-looking and not missing.
        return is_float_data_detection(link, candidate, index_, False) and not pd.isna(candidate)

    if not pd.isna(weight_lb):
        return weight_lb

    if is_usable(lot_size):
        weight_lb = lot_size
        print(f"used lot size: {lot_size} to complete weight_(lb): {weight_lb} in row {index_}; more information in {link}")
    elif is_usable(lot_size_kg):
        weight_lb = lot_size_kg * 2.20462
        print(f"used lot size in kg: {lot_size_kg} to complete weight_(lb): {weight_lb} in row {index_}; more information in {link}")
    elif is_usable(value_size_30kg_boxes):
        weight_lb = (value_size_30kg_boxes*132.277)/2
        print(f"used size_30Kb boxes: {value_size_30kg_boxes} to complete weight_(lb): {weight_lb} in row {index_}; more information in {link}")

    return weight_lb

df['Weigth_(Lb)_v2'] = apply_function_to_data('Size_30Kg_boxes_v2', complete_weight_lb)
pass




## <a id='toc3_12_'></a>[detect abnormal data](#toc0_)

In [None]:
def look_for_abnormal_data(link, value_, index_):
    """Report rows whose weight disagrees with total value / high bid.

    The implied weight ``Total_Value_v2 / High_Bid_v2`` (rounded to two
    decimals) is compared with the row's weight; a gap of more than 1 in
    either direction is reported. The previous version only reported gaps
    where the implied weight was larger, and a zero high bid could raise
    ``ZeroDivisionError``; a zero bid is now reported as an inconsistency.

    Args:
        link: Farm URL, only used in the message.
        value_: Cleaned ``Weigth_(Lb)_v2`` cell.
        index_: Row label, used to read the other columns from ``df``.

    Returns:
        ``None``; the function only prints.
    """
    if pd.isna(value_):
        return

    row = df.loc[index_]
    is_total_value_float, total_value = str_2_float(row['Total_Value_v2'])
    is_high_bid_float, high_bid = str_2_float(row['High_Bid_v2'])
    is_weight_lb_float, weight_lb = str_2_float(value_)

    if not (is_total_value_float and is_high_bid_float and is_weight_lb_float):
        return
    if pd.isna(total_value) or pd.isna(high_bid) or pd.isna(weight_lb):
        return

    if high_bid == 0:
        # The implied weight cannot be computed from a zero bid.
        is_inconsistent = True
    else:
        implied_weight_lb = round(total_value / high_bid, 2)
        is_inconsistent = abs(implied_weight_lb - weight_lb) > 1

    if is_inconsistent:
        print(f"detected inconsistences in row {index_}; more information in {link}")



apply_function_to_data('Weigth_(Lb)_v2', look_for_abnormal_data)
pass

# <a id='toc4_'></a>[Clean non-numeric data](#toc0_)

## <a id='toc4_1_'></a>[Generic string functions](#toc0_)

In [None]:
def clean_data_from_pattern(value_, pattern):
    """Remove the text of the first ``pattern`` match from ``value_``.

    The value is stringified first. Every occurrence of the matched text is
    removed, not only the matched position.

    Args:
        value_: Value to clean.
        pattern: Regular expression to search for.

    Returns:
        The cleaned text, or the stringified value when nothing matches.
    """
    text = str(value_)
    match = re.search(pattern, text)
    if match is None:
        return text
    return text.replace(match.group(), '')
    
def search_from_pattern(value_, pattern):
    """Tell whether ``pattern`` matches anywhere in the stringified ``value_``.

    Args:
        value_: Value to inspect.
        pattern: Regular expression to search for.

    Returns:
        ``True`` when a match exists, else ``False``.
    """
    return re.search(pattern, str(value_)) is not None
    
def remove_data_from_pattern(value_, pattern, count=0):
    """Delete the ``pattern`` matches from the stringified ``value_``.

    Corrections over the previous version: the substitution now runs on the
    stringified value (the ``str()`` result used to be discarded, so
    non-strings raised ``TypeError``), it runs once instead of twice, and
    ``count`` is passed by keyword.

    Args:
        value_: Value to clean.
        pattern: Regular expression whose matches are removed.
        count: Maximum number of removals; ``0`` removes every match.

    Returns:
        The cleaned text, or ``value_`` unchanged when the result is empty.
    """
    text = str(value_)
    result = re.sub(pattern, '', text, count=count or 0)
    # An empty result falls back to the original value, as before.
    return result if result else value_


In [None]:
# Look up the 'Aroma_Flavor' cell of one specific farm-directory URL.
df[df['Url_farm']=='https://allianceforcoffeeexcellence.org/farm-directory/87-23/']['Aroma_Flavor']
# remove_data_from_pattern('\n[w]  Agricafe, S.A. De C.V.\n', r'\[\w\]|^[\s\n]+|[\s\n]+$')
# remove_data_from_pattern(', floral, honey, sugar berries, vanilla, plum,...', r',\s*', count=1)
# Try the pattern that removes a leading and a trailing double quote on a sample string.
remove_data_from_pattern('"floral, "honey, sugar berries, vanilla, plum,..."', r'^"|"$')

In [None]:
import re

# Example string
string_con_espacios = "\n[w]  Rafael Gerardo Silva Esteves\n"

# Use a regular expression to remove whitespace and line breaks at the start and at the end
string_limpio = re.sub(r'^[\s\n]+|[\s\n]+$', '', string_con_espacios)

print(f"'{string_limpio}'")

## <a id='toc4_2_'></a>[clean country](#toc0_)

In [None]:
# Patterns removed from the raw country text, in this order.
# Raw strings: '\d' in a normal literal is an invalid escape sequence.
_COUNTRY_NOISE_PATTERNS = (
    r'-\d\d\d\d',
    r'\d\d\d\d-',
    r'-naturals',
    r'-january',
    r'-pulped',
    r'-north',
    r'-south',
    r'-coe',
    r'best-of-',
)


def clean_country(link, value_, index_):
    """Strip years and auction qualifiers from one raw ``Country`` cell.

    A result that still contains '-' and is neither ``costa-rica`` nor
    ``el-salvador`` is printed together with the row's ``Processing`` value.

    Args:
        link: Farm URL (unused).
        value_: Raw cell value.
        index_: Row label, used to read ``Processing`` from the global ``df``.

    Returns:
        The cleaned country text, or the input unchanged when it is missing.
    """
    if pd.isna(value_):
        return value_

    for noise_pattern in _COUNTRY_NOISE_PATTERNS:
        value_ = clean_data_from_pattern(value_, noise_pattern)

    if '-' in value_ and value_ not in ('costa-rica', 'el-salvador'):
        print(f"value: {value_}; processing: {df.loc[index_]['Processing']}")

    return value_


df['Country_v2'] = apply_function_to_data('Country', clean_country)


## <a id='toc4_3_'></a>[clean Rank](#toc0_)

In [None]:
def clean_rank(link, value_, index_):
    """Remove every 'a', 'b' and 'c' character from a rank value.

    Args:
        link: Unused.
        value_: Raw rank string; missing values are returned untouched.
        index_: Unused.

    Returns:
        The rank with those three letters removed.
    """
    if not pd.isna(value_):
        for letter in ('a', 'b', 'c'):
            value_ = value_.replace(letter, '')
    return value_


# Store the cleaned 'Rank' values in the new 'Rank_v2' column.
df['Rank_v2'] = apply_function_to_data('Rank', clean_rank)


## <a id='toc4_4_'></a>[Clean Farmer](#toc0_)

In [None]:
def clean_farmer(link, value_, index_):
    """Clean a farmer name: drop '[x]' markers and surrounding whitespace.

    Args:
        link: Unused.
        value_: Raw farmer value; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if not pd.isna(value_):
        trimmed = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
        # NOTE(review): '\"' is the same one-character string as '"', so this
        # looks like a no-op if replace_value is a plain substitution -- confirm.
        value_ = replace_value(trimmed, '"', '\"')
    return value_

# Overwrites 'Farmer' in place; the other cleaners in this section write to a
# separate '_v2' column instead.
df['Farmer'] = apply_function_to_data('Farmer', clean_farmer)

## <a id='toc4_5_'></a>[Clean Region](#toc0_)

In [None]:
def clean_region(link, value_, index_):
    """Clean a region name: drop '[x]' markers and surrounding whitespace.

    Args:
        link: Unused.
        value_: Raw region value; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    marker_or_edges = r'\[\w\]|^[\s\n]+|[\s\n]+$'
    region = remove_data_from_pattern(value_, marker_or_edges)
    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    return replace_value(region, '"', '\"')

# Store the cleaned 'Region' values in the new 'Region_v2' column.
df['Region_v2'] = apply_function_to_data('Region', clean_region)

## <a id='toc4_6_'></a>[Clean Variety](#toc0_)

In [None]:
def clean_variety(link, value_, index_):
    """Clean a variety value: drop '[x]' markers and surrounding whitespace.

    Args:
        link: Unused.
        value_: Raw variety value; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    # NOTE(review): '\"' equals '"' in Python, so the outer call may be a
    # no-op depending on what replace_value does -- confirm.
    return replace_value(
        remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$'),
        '"',
        '\"',
    )

# Store the cleaned 'Variety' values in the new 'Variety_v2' column.
df['Variety_v2'] = apply_function_to_data('Variety', clean_variety)

## <a id='toc4_7_'></a>[Clean Processing](#toc0_)

In [None]:
def clean_processing(link, value_, index_):
    """Clean a processing value: drop '[x]' markers and surrounding whitespace.

    Args:
        link: Unused.
        value_: Raw processing value; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    is_missing = pd.isna(value_)
    if is_missing:
        return value_

    without_markers = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    cleaned = replace_value(without_markers, '"', '\"')
    return cleaned

# Store the cleaned 'Processing' values in the new 'Processing_v2' column.
df['Processing_v2'] = apply_function_to_data('Processing', clean_processing)

## <a id='toc4_8_'></a>[Clean History](#toc0_)

In [None]:
def clean_history(link, value_, index_):
    """Clean a history text: drop '[x]' markers, edge whitespace and edge quotes.

    Args:
        link: Unused.
        value_: Raw history value; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    # Order matters: whitespace is trimmed first so the quote anchors can match.
    for unwanted in (r'\[\w\]|^[\s\n]+|[\s\n]+$', r'^"|"$'):
        value_ = remove_data_from_pattern(value_, unwanted)

    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    return replace_value(value_, '"', '\"')

# Store the cleaned 'History' values in the new 'History_v2' column.
df['History_v2'] = apply_function_to_data('History', clean_history)

## <a id='toc4_9_'></a>[Clean Company Name](#toc0_)

In [None]:
def clear_company_name(link, value_, index_):
    """Clean a company name: drop '[x]' markers, edge whitespace and edge quotes.

    Args:
        link: Unused.
        value_: Raw company name; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if not pd.isna(value_):
        trimmed = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
        unquoted = remove_data_from_pattern(trimmed, r'^"|"$')
        # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
        value_ = replace_value(unquoted, '"', '\"')
    return value_

# Store the cleaned 'Company_Name' values in the new 'Company_Name_v2' column.
df['Company_Name_v2'] = apply_function_to_data('Company_Name', clear_company_name)

## <a id='toc4_10_'></a>[Clean COMPANY NAME](#toc0_)

In [None]:
def clear_company_name(link, value_, index_):
    """Clean a company name: drop '[x]' markers, edge whitespace and edge quotes.

    This redefines the function of the same name from the 'Clean Company
    Name' cell with the same steps; it is applied to the 'COMPANY NAME' column.

    Args:
        link: Unused.
        value_: Raw company name; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    removal_steps = (
        r'\[\w\]|^[\s\n]+|[\s\n]+$',  # '[x]' markers and edge whitespace
        r'^"|"$',                      # one quote at each end
    )
    for step in removal_steps:
        value_ = remove_data_from_pattern(value_, step)

    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    return replace_value(value_, '"', '\"')

# Store the cleaned 'COMPANY NAME' values in the new 'COMPANY NAME_v2' column.
df['COMPANY NAME_v2'] = apply_function_to_data('COMPANY NAME', clear_company_name)

## <a id='toc4_11_'></a>[Clean Aroma_Flavor](#toc0_)

In [None]:
def clean_aroma(link, value_, index_):
    """Clean an aroma/flavor description.

    Steps, in order: drop '[x]' markers and edge whitespace, remove the first
    comma (with any whitespace after it), drop one quote at each end, and
    delete the literal '#NAME?'.

    Args:
        link: Unused.
        value_: Raw description; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    trimmed = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
    # NOTE(review): the pattern is not anchored, so the first comma is removed
    # wherever it occurs, not only a leading one -- confirm this is intended.
    no_first_comma = remove_data_from_pattern(trimmed, r',\s*', count=1)
    unquoted = remove_data_from_pattern(no_first_comma, r'^"|"$')
    no_name_marker = unquoted.replace('#NAME?', '')
    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    return replace_value(no_name_marker, '"', '\"')

# Store the cleaned 'Aroma_Flavor' values in the new 'Aroma_Flavor_v2' column.
df['Aroma_Flavor_v2'] = apply_function_to_data('Aroma_Flavor', clean_aroma)

## <a id='toc4_12_'></a>[Clean Acidity](#toc0_)

In [None]:
def clean_acidity(link, value_, index_):
    """Clean an acidity description.

    Steps, in order: drop '[x]' markers and edge whitespace, remove the first
    comma (with any whitespace after it), and drop one quote at each end.

    Args:
        link: Unused.
        value_: Raw description; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if not pd.isna(value_):
        value_ = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
        # NOTE(review): unanchored -- removes the first comma anywhere in the text.
        value_ = remove_data_from_pattern(value_, r',\s*', count=1)
        value_ = remove_data_from_pattern(value_, r'^"|"$')
        # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
        value_ = replace_value(value_, '"', '\"')
    return value_

# Store the cleaned 'Acidity' values in the new 'Acidity_v2' column.
df['Acidity_v2'] = apply_function_to_data('Acidity', clean_acidity)

## <a id='toc4_13_'></a>[Clean Overall](#toc0_)

In [None]:
def clean_overall(link, value_, index_):
    """Clean an 'Overall' description.

    Steps, in order: drop '[x]' markers and edge whitespace, remove the first
    comma (with any whitespace after it), and drop one quote at each end.

    Args:
        link: Unused.
        value_: Raw description; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    # (pattern, count) pairs applied in sequence; count=0 means "all matches".
    # NOTE(review): the comma pattern is unanchored -- it removes the first
    # comma anywhere in the text, not only a leading one.
    removal_steps = (
        (r'\[\w\]|^[\s\n]+|[\s\n]+$', 0),
        (r',\s*', 1),
        (r'^"|"$', 0),
    )
    for unwanted, limit in removal_steps:
        value_ = remove_data_from_pattern(value_, unwanted, count=limit)

    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    return replace_value(value_, '"', '\"')

# Store the cleaned 'Overall' values in the new 'Overall_v2' column.
df['Overall_v2'] = apply_function_to_data('Overall', clean_overall)

## <a id='toc4_14_'></a>[Clean Other](#toc0_)

In [None]:
def clean_other(link, value_, index_):
    """Clean an 'Other' description.

    Steps, in order: drop '[x]' markers and edge whitespace, remove the first
    comma (with any whitespace after it), drop one quote at each end, and
    delete the literal '#NAME?'.

    Args:
        link: Unused.
        value_: Raw description; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    text = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
    # NOTE(review): unanchored -- removes the first comma anywhere in the text.
    text = remove_data_from_pattern(text, r',\s*', count=1)
    text = remove_data_from_pattern(text, r'^"|"$').replace('#NAME?', '')
    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    return replace_value(text, '"', '\"')

# Store the cleaned 'Other' values in the new 'Other_v2' column.
df['Other_v2'] = apply_function_to_data('Other', clean_other)

## <a id='toc4_15_'></a>[Clean City](#toc0_)

In [None]:
def clean_city(link, value_, index_):
    """Clean a city value.

    Steps, in order: drop '[x]' markers and edge whitespace, remove the first
    comma (with any whitespace after it), and drop one quote at each end.

    Args:
        link: Unused.
        value_: Raw city value; missing values are returned untouched.
        index_: Unused.

    Returns:
        The cleaned value.
    """
    if pd.isna(value_):
        return value_

    trimmed = remove_data_from_pattern(value_, r'\[\w\]|^[\s\n]+|[\s\n]+$')
    # NOTE(review): unanchored -- removes the first comma anywhere in the text.
    no_first_comma = remove_data_from_pattern(trimmed, r',\s*', count=1)
    unquoted = remove_data_from_pattern(no_first_comma, r'^"|"$')
    # NOTE(review): '\"' equals '"' in Python; this call may be a no-op -- confirm.
    city = replace_value(unquoted, '"', '\"')
    return city

# Store the cleaned 'City' values in the new 'City_v2' column.
df['City_v2'] = apply_function_to_data('City', clean_city)

# <a id='toc4_16_'></a>[Remove empty rows, by col](#toc0_)

In [None]:
# TODO: comment out this block when collecting statistics
def clean_nan_cols(nan_cols):
    """Drop rows of the global ``df`` that have no data in ``nan_cols``.

    Empty strings anywhere in ``df`` are first converted to NaN so they count
    as missing; a row is then removed only when every column listed in
    ``nan_cols`` is missing. Both steps modify ``df`` in place.

    Args:
        nan_cols: Column labels that must not all be missing.
    """
    missing = float("NaN")
    df.replace("", missing, inplace=True)
    df.dropna(how='all', subset=nan_cols, inplace=True)


# Rows without a cleaned high bid are discarded.
nan_cols = ['High_Bid_v2']
clean_nan_cols(nan_cols)


# <a id='toc5_'></a>[Convert to future value](#toc0_)

In [None]:
# Worked example: compound a starting amount through a sequence of
# per-period rates, one multiplication per rate.
vp = 4200000
in_ = [1/100, 1.3/100, 1.6/100, 1.9/100, 2.2/100, 2.5/100]
vf = vp
# Plain loop instead of a comprehension used only for its side effect,
# which also built a throwaway list of intermediate values.
for periodic_rate in in_:
    vf = vf * (1 + periodic_rate)
vf

## <a id='toc5_1_'></a>[Generic functions](#toc0_)

In [None]:

# Preview the first rows of df_ipc (created outside this section).
df_ipc.head()

## <a id='toc5_2_'></a>[future value conversion](#toc0_)

In [None]:
# Manual check of the two helpers (both defined outside this section) with a
# fixed year and a fixed amount; the result of the second call is not stored.
var = get_ipc_older_than_value(2020, df_ipc)
future_value_convertion(5000, var)


def high_bid_to_future_value_convertion(link, value_, index_):
    """Convert one high-bid amount to its future value.

    The row's 'Year' is read from the global ``df`` and passed, with the
    global ``df_ipc``, to ``get_ipc_older_than_value``; that result is then
    handed to ``future_value_convertion`` together with the bid. Both helpers
    are defined outside this section.

    Args:
        link: Unused.
        value_: High-bid amount; missing values are returned untouched
            without any lookup.
        index_: Row label in ``df`` used to read the auction year.

    Returns:
        The converted amount, or ``value_`` itself when it is missing.
    """
    # Guard first so missing bids skip the row lookup entirely. The globals
    # are only read, so no ``global`` declaration is needed.
    if pd.isna(value_):
        return value_

    initial_year = df.loc[index_]['Year']
    array_ipc_value = get_ipc_older_than_value(initial_year, df_ipc)
    value_ = future_value_convertion(value_, array_ipc_value)
    print(f"initial_year {initial_year}; value: {value_}")

    return value_

# Store the converted 'High_Bid_v2' amounts in the new 'High_Bid_Future_Value' column.
df['High_Bid_Future_Value'] = apply_function_to_data('High_Bid_v2', high_bid_to_future_value_convertion)

# <a id='toc6_'></a>[save all data](#toc0_)

## Drop repeated columns 

In [None]:
def drop_columns(column_name):
    """Remove one column from the global ``df`` in place.

    A column that does not exist is not an error: the ``KeyError`` raised by
    pandas is printed and execution continues.

    Args:
        column_name: Label of the column to remove.
    """
    targets = [column_name]
    try:
        df.drop(columns=targets, inplace=True)
    except KeyError as missing_column:
        print(missing_column)



# Full snapshot, written before any column is dropped.
df.to_excel("../data/AuctionsV6.7.xlsx", engine='xlsxwriter')

# Columns removed from df before the final export.
columns_to_drop = ['Weigth_(Kg)', 'Weigth_(Lb)', 'High_Bid', 'Farm_Size', 'Score', 'Total_Value', 'Size_30Kg_boxes', 'Total_Comission', 'Altitude',
                   'Coffee_Growing_Area', 'Auction_Lot_Size', 'Auction_Lot_Size(Kg)', 'Country', 'Region', 'Variety', 'Processing',
                   'History', 'Aroma_Flavor', 'Acidity', 'Overall', 'Other', 'City', 'Enviromental_Care', 'Coffee_Processing_Information', 'Annual_Precipitation', 'Shade_Grown_Type', 'Water_Source',
                   'Company_Name', 'COMPANY NAME','Annual_Production', 'Rank']

# Plain loop: the previous list comprehension ran drop_columns only for its
# side effect and left a list of None as the cell's displayed result.
for column_name in columns_to_drop:
    drop_columns(column_name)

## Order columns

In [None]:




# Final column order for the exported table. Selecting with this list also
# discards every column that is not named here.
# NOTE(review): 'Unnamed: 39' looks like an auto-generated column label --
# confirm it is meant to be kept.
new_order = ['Year', 'Country_v2', 'Rank_v2', 'High_Bid_v2', 'Url_farm', 'Root_url',
             'Score_v2', 'Origin_Bidder', 'Weigth_(Lb)_v2', 'Farm',  'Farmer', 'Certifications',
              'Aroma_Flavor_v2', 'Acidity_v2', 'Overall_v2',
              'Altitude_v2(masl)',
              'High_Bidders', 'Unnamed: 39',
              'Farm_size_he', 'Total_Value_v2', 
              'Size_30Kg_boxes_v2', 'Total_Comission_v2',
              'Coffee_Growing_Area_v2', 'Auction_Lot_Size_v2',
              'Auction_Lot_Size(Kg)_v2', 'Region_v2',
              'Variety_v2', 'Processing_v2', 'History_v2', 'Company_Name_v2',
              'COMPANY NAME_v2', 
              'Other_v2', 'City_v2', 'High_Bid_Future_Value']

# Raises KeyError if any listed column is absent from df.
df = df[new_order]







## Save table with dropped and ordered columns

In [None]:
# Write the reduced, reordered table to its own workbook.
df.to_excel("../data/AuctionsV6.8_dropped_and_order_columns.xlsx", engine='xlsxwriter')

In [None]:
import re

def normalize_number(number_string):
    """Rewrite a numeric string so it uses '.' as its only separator.

    A ',' or '.' that sits between a digit and a group of exactly three
    digits is treated as a thousands separator and removed. Any remaining
    ',' is treated as a decimal comma and converted to '.'.

    Args:
        number_string: Number written with ',' and/or '.' separators.

    Returns:
        The normalised string, e.g. '1.000,20' -> '1000.20'.
    """
    # Require exactly three digits after the separator. Without the (?!\d)
    # guard a decimal such as '3.14159' lost its point and became '314159'.
    number_string = re.sub(r'(?<=\d)[.,](?=\d{3}(?!\d))', '', number_string)
    # Whatever comma is left is a decimal comma.
    number_string = number_string.replace(',', '.')
    return number_string

# Usage example
values = ["1,000.20", "1000,20", "1.000,20", "1.000.000.20", "10,20", "10.20", '1000.20']
normalized_values = [normalize_number(value) for value in values]
print(normalized_values)  # ['1000.20', '1000.20', '1000.20', '1000000.20', '10.20', '10.20', '1000.20']

# Other functions

In [None]:
# Show the column labels that remain after dropping and reordering.
# df.columns.tolist()
df.columns