# Generic functions

In [1]:
import re
import pandas as pd

In [2]:
def verbosity(msj, is_verbosity):
    """Print ``msj`` only when ``is_verbosity`` is truthy.

    Args:
        msj: Object handed to ``print``.
        is_verbosity: Flag; any truthy value enables the output.

    Returns:
        None.
    """
    if not is_verbosity:
        return
    print(msj)

def search_str_in_df(df_link, df_col, str_):
    """Return the (link, text) pairs whose text contains ``str_``.

    ``df_link`` and ``df_col`` are walked in lockstep with ``zip``, so the
    result stops at the shorter of the two iterables.

    Args:
        df_link: Iterable of values returned as the first tuple element.
        df_col: Iterable of values to search; entries that are not strings
            (numbers, NaN, ...) are skipped.
        str_: Substring to look for.

    Returns:
        A list of ``(link, text)`` tuples, in input order.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    return [
        (link, text)
        for text, link in zip(df_col, df_link)
        if isinstance(text, str) and str_ in text
    ]

def load_df_from_csv(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path passed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``False`` when the file does not exist
        (an error message is printed in that case). Other exceptions raised
        by ``pd.read_csv`` are not caught.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    return False
    
def get_ipc_older_than_value(year, df_ipc):
    """Select the ``ipc`` values for rows whose ``year`` is >= ``year``.

    Args:
        year: Lower bound (inclusive); converted with ``float``.
        df_ipc: DataFrame with ``year`` and ``ipc`` columns.

    Returns:
        The ``ipc`` column restricted to the matching rows.
    """
    threshold = float(year)
    from_year_on = df_ipc['year'] >= threshold
    return df_ipc.loc[from_year_on, 'ipc']

def future_value_convertion(present_value, array_interests):
    """Compound ``present_value`` through a sequence of percentage rates.

    Each rate is applied in order as ``value * (1 + rate / 100)``.

    Args:
        present_value: Starting amount.
        array_interests: Iterable of interest rates expressed in percent.

    Returns:
        The compounded amount; ``present_value`` itself when
        ``array_interests`` is empty.
    """
    future_value = present_value
    # A plain loop: the original built a throwaway list only for the side
    # effect of the walrus assignment.
    for interest in array_interests:
        future_value = future_value * (1 + (interest / 100))
    return future_value

## Load data

In [3]:

file_path = '../data/AuctionsV6.8_dropped_and_order_columns.xlsx'

# Load the auctions workbook into the module-level ``df``.
# (No ``global`` statement is needed: at module scope a plain assignment
# already binds the module-level name.)
# NOTE: on failure only a message is printed and ``df`` stays unassigned, so
# the statements further down that use ``df`` will raise NameError.
try:
    df = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")


# Inflation table; load_df_from_csv returns False when the file is missing.
df_ipc = load_df_from_csv('../data/inflacion_EEUU.csv')

## Define Donnet variables using our own dataset


In [4]:
def clean_nan_cols(nan_cols):
    """Drop rows of the module-level ``df`` that lack a value in ``nan_cols``.

    Empty strings anywhere in ``df`` are first turned into NaN so they count
    as missing. Both steps modify ``df`` in place.

    Args:
        nan_cols: Column labels to check; a row is dropped when any of them
            is missing.

    Returns:
        None.
    """
    global df
    missing = float("NaN")
    df.replace(to_replace="", value=missing, inplace=True)
    df.dropna(subset=nan_cols, how='any', inplace=True)


In [None]:
def change_value(value, rank_filter):
    """Translate a rank through ``rank_filter`` with a fallback.

    Args:
        value: Raw rank value.
        rank_filter: Mapping from raw ranks to labels.

    Returns:
        ``rank_filter[value]`` when the key exists. Otherwise ``'other'`` if
        ``value`` is exactly of type ``int``, else ``value`` unchanged.
    """
    try:
        mapped = rank_filter[value]
    except KeyError:
        # Exact type check on purpose: subclasses of int (e.g. bool) and
        # non-int values pass through untouched.
        mapped = 'other' if type(value) == int else value
    return mapped
        
def change_variety_value(value, variety_filter):
    """Translate a variety label through ``variety_filter``.

    Args:
        value: Raw variety label.
        variety_filter: Mapping from raw labels to canonical labels.

    Returns:
        ``variety_filter[value]`` when the key exists, otherwise ``value``
        unchanged.
    """
    try:
        canonical = variety_filter[value]
    except KeyError:
        canonical = value
    return canonical

# Columns kept when the frame is narrowed further down.
columns_used_by_donnet = ['Year', 'Country_v2', 'Rank_v2', 'Url_farm', 'Root_url', 'Score_v2', 'High_Bid_v2', 'Variety_v2', 'Size_30Kg_boxes_v2']

# Country_v2 values retained by the country filter further down.
countries_used = ['bolivia', 'brazil', 'colombia', 'el-salvador', 'honduras', 'nicaragua']
varieties = ['bourbon', 'catuai', 'caturra', 'typica', 'pacamara']
# 2003..2006 inclusive; list(range(...)) replaces the redundant comprehension.
harvest_years = list(range(2003, 2006 + 1))

# Keep only rows whose Year lies in 2003..2006 (inclusive), in place.
df.query("2003 <= Year <= 2006", inplace=True)



# Labels for the first three ranks, used as the lookup table of change_value.
rank_filter = {
    1:"first",
    2:"second",
    3:"third",
}

# Lookup table from raw Variety_v2 labels to a canonical label ('Bourbon',
# 'Caturra', 'Catuai', 'Pacamara', 'Typica') or 'Other'. It is applied through
# change_variety_value, which leaves labels that are not listed here unchanged.
# The keys are compared verbatim, so the tab characters and repeated spaces
# inside them are significant and must not be normalised.
# NOTE(review): a few keys appear to be listed twice ('Acaiá', 'Red Bourbón',
# 'Red Catuai' and the 'Bourbón/Caturra/Maragogype' blend). In a dict literal
# the last occurrence wins; the repeated entries map to the same value here.
# Confirm the keys are really byte-identical before removing any of them.
variety_filter = {
    'Acaiá': 'Other',
    'Obatan': 'Other',
    'Paca': 'Other',
    'Acaiá': 'Other',
    'Catucai': 'Other',
    'Obatã': 'Other',
    'Pacamur': 'Other',
    'San Ramon': 'Other',
    'Mundo Novo	 Acaiá': 'Other',
    'Iapar': 'Other',
    'Icatu': 'Other',
    'Pacas': 'Other',
    'Yellow Catucai': 'Other',
    'IHCAFE 90': 'Other',
    'Mundo Novo': 'Other',
    'Colombia': 'Other',
    'Caturra (15%	)	 Colombia  (70%	)	 Typica (15%	)': 'Other',
    'Caturra (5%	)	 Colombia  (95%	)': 'Other',
    'Bourbón': 'Bourbon',
    'Red Bourbón': 'Bourbon',
    'Paca	 Bourbón': 'Bourbon',
    'Bourbón	 Pacas': 'Bourbon',
    'Bourbón	 Acaiá': 'Bourbon',
    'Bourbón	 Paca': 'Bourbon',
    'Yellow Bourbón': 'Bourbon',
    'Red Bourbón': 'Bourbon',
    'Yellow Bourbón	 Red Bourbón': 'Bourbon',
    'Bourbón	Caturra	Maragogype': 'Bourbon',
    'Bourbón	 Typica': 'Bourbon',
    'Bourbón	 Typica	 Pacas': 'Bourbon',
    'Bourbón 	 Catuai': 'Bourbon',
    'Caturra Estrella': 'Caturra',
    'Caturra (60%	)	 Typica (40%	)': 'Caturra',
    'Caturra (50%	)	 Colombia  (50%	)': 'Caturra',
    'Caturra (80%	)	 Colombia  (20%	)': 'Caturra',
    'Caturra (90%	)	 Colombia  (10%	)': 'Caturra',
    'Caturra (60%	)	 Colombia  (20%	)	 Typica (20%	)': 'Caturra',
    'Caturra (90%	)	 Colombia  (5%	)	 Typica (5%	)': 'Caturra',
    'Caturra (80%	)	 Typica (20%	)': 'Caturra',
    'Caturra (70%	)	 Typica (30)': 'Caturra',
    'Caturra	 Catimor': 'Caturra',
    'Caturra 	 Catuai': 'Caturra',
    'Caturra 	 Maracatu': 'Caturra',
    'Caturra	 Paca': 'Caturra',
    'Catuai	 Icatu': 'Catuai',
    'Red Catuai': 'Catuai',
    'Red	 Yellow	 Red Catuai': 'Catuai',
    'Catuai 	 Icatu': 'Catuai',
    'Red 	 Red Catuai': 'Catuai',
    'Red 	 Yellow Catuai': 'Catuai',
    'Catimor	 Catuai': 'Catuai',
    'Yellow Catuai': 'Catuai',
    'Paca	 Catuai': 'Catuai',
    'Red Catuai': 'Catuai',
    'Pacamara Peaberry': 'Pacamara',
    'Pacamara	 Caturra	 Bourbón	 Catuai': 'Pacamara',
    'Caturra (30%	)	 Typica (70%	)': 'Typica',
    'Caturra (45%	)	 Typica (55%	)': 'Typica',
    'Caturra (40%	)	 Typica (60%	)': 'Typica',
    'Typica	 Caturra': 'Typica',
    'Typica	 Caturra	 Catuai Yellow': 'Typica',
    'Typica 	 Caturra': 'Typica',
    'Bourbón	 Catuai': 'Bourbon',
    'Bourbón	Kenya': 'Bourbon',
    'Bourbón	 Caturra': 'Bourbon',
    'Bourbón	 Paca	 Catuai': 'Bourbon',
    'Bourbón	Caturra	Maragogype': 'Bourbon',
    'Caturra	 Catuai': 'Caturra',
    'Caturra	 Paca	 Bourbón': 'Caturra',
    'Caturra	 Catuai	 Bourbón': 'Caturra',
    'Caturra	 Bourbón	 Catuai': 'Caturra',
    'Caturra	 Bourbón': 'Caturra',
    'Caturra	 Pacamarra': 'Caturra',
    'Pacas	 Catuai': 'Other',
    'Catuai	 Lempira	 Pacamara 	 IHCAFE 90': 'Catuai',
    'Catuai	 Caturra': 'Catuai',
    
}


# Keep only the rows whose country is listed in countries_used (in place).
df.query("Country_v2 in @countries_used", inplace=True)

# Relabel ranks and varieties through their lookup tables.
df['Rank_v2'] = df['Rank_v2'].apply(change_value, args=(rank_filter,))
df['Variety_v2'] = df['Variety_v2'].apply(change_variety_value, args=(variety_filter,))

# Narrow the frame to the selected columns.
df = df[columns_used_by_donnet]

# Drop rows without a variety (empty strings count as missing).
nan_cols = ['Variety_v2']
clean_nan_cols(nan_cols)

df.to_excel("../data/Donnet_Auctions.xlsx", engine='xlsxwriter')

# Rows whose variety is none of the canonical labels, i.e. labels that
# variety_filter did not translate.
varieties_to_exclude = ["Bourbon", "Catuai", "Caturra", "Typica", "Pacamara", "Other", "other"]
filtered_df = df.loc[~df["Variety_v2"].isin(varieties_to_exclude)]
