In [None]:
# Necessary imports and libraries
import csv
import math
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import check_random_state

In [None]:
# Auxiliary state and helper functions used while preprocessing the input files.

# Most recent known price of every product, keyed by the product id as a string.
actual_price = {}
# First known price of every product, keyed by the product id as a string.
first_price = {}


def __price_completer(row):
    """Return the price of ``row``, carrying the last known price forward.

    The function is meant to be applied row by row (``df.apply(..., axis=1)``)
    and keeps state in the module-level ``actual_price`` and ``first_price``
    dictionaries, so the result depends on the order in which rows are seen
    and on every earlier call made since those dictionaries were created.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'precio'`` and ``'id'`` (a DataFrame row or a
        plain dict). ``'precio'`` may be a number, a missing value, or a
        string that uses a decimal comma.

    Returns
    -------
    float
        The row's own price when present. When it is missing, the latest
        price recorded for the same id, or ``-1.0`` if none has been seen yet.
    """
    raw_price = row['precio']
    # Prices read as text use a decimal comma (e.g. "3,50").
    precio = float(raw_price.replace(',', '.')) if isinstance(raw_price, str) else raw_price
    identificador = str(row['id'])

    # pd.isna also accepts None / pd.NA, which math.isnan rejects with TypeError.
    if pd.isna(precio):
        return actual_price.get(identificador, -1.0)

    actual_price[identificador] = precio
    # Only the first known price per id is remembered; later prices never replace it.
    first_price.setdefault(identificador, precio)
    return precio

def __price_completer_proximity(row):
    """Replace the ``-1.0`` placeholder price with the id's first known price.

    Rows whose ``'precio'`` is anything other than ``-1.0`` are returned
    unchanged. For placeholder rows the value stored in the module-level
    ``first_price`` dictionary under ``str(row['id'])`` is returned, which is
    ``None`` when no price was ever recorded for that id.
    """
    current = row['precio']
    if current != -1.0:
        return current
    return first_price.get(str(row['id']))

def __favorable_cases(test, pred):
    """Return the fraction of samples whose prediction is not below the real value.

    ``test`` and ``pred`` are positionally indexable sequences; only the first
    ``len(test)`` positions are compared. A sample counts as unfavorable when
    ``pred[i] < test[i]``.
    """
    n_samples = len(test)
    shortfalls = sum(1 for pos in range(n_samples) if pred[pos] < test[pos])
    return (n_samples - shortfalls) / n_samples

def __datathon_metric(y_test, y_train, pred):
    """Combine relative RMSE and favorable-case ratio into the datathon score.

    The score is ``0.7 * rrmse + 0.3 * (1 - cf)`` where ``rrmse`` is the RMSE
    of ``pred`` against ``y_test`` divided by the mean of ``y_train``, and
    ``cf`` is the value returned by ``__favorable_cases``.

    ``y_test`` must expose ``.values`` and ``y_train`` must expose ``.mean()``.
    """
    observed = y_test.values
    root_mse = math.sqrt(mean_squared_error(observed, pred))
    relative_rmse = root_mse / y_train.mean()
    favorable_ratio = __favorable_cases(observed, pred)
    return (0.7 * relative_rmse) + (0.3 * (1 - favorable_ratio))

In [None]:
def input_parser(path, option):
    """Preprocess a raw ``|``-separated input file and save the result as CSV.

    The cleaned data is written to ``path`` with every ``'.txt'`` removed and
    ``'_<option>.csv'`` appended, again using ``|`` as separator. Nothing is
    returned.

    Parameters
    ----------
    path : str
        Location of the raw input file.
    option : str
        Preprocessing mode:

        * ``'base'``: only exact duplicate rows are removed.
        * ``'drop'``: full preprocessing; the 'categoria_dos' column is removed.
        * any other value: full preprocessing; missing 'categoria_dos'
          values are replaced by 0.

    Notes
    -----
    Full preprocessing completes 'precio' through ``__price_completer`` and
    ``__price_completer_proximity``, which keep state in module-level
    dictionaries. The outcome therefore depends on row order and on any file
    parsed earlier in the same session.
    """
    df = pd.read_csv(filepath_or_buffer=path, sep='|')
    # The raw files contain repeated samples.
    df = df.drop_duplicates()

    if option != 'base':
        # Complete 'precio' in two row-wise passes: carry the last known price
        # of each id forward, then fill the remaining leading gaps with the
        # first known price of that id.
        df['precio'] = df.apply(__price_completer, axis=1)
        df['precio'] = df.apply(__price_completer_proximity, axis=1)

        # Drop the constant midnight time and split the date into parts.
        # NOTE(review): no format/dayfirst is given, so pandas reads ambiguous
        # dates such as "1/6/2015" month-first -- confirm against the raw files.
        fecha = pd.to_datetime(
            df['fecha'].str.replace(' 0:00:00', '', regex=False))
        df['dia'] = fecha.dt.day
        df['mes'] = fecha.dt.month
        df['anyo'] = fecha.dt.year
        df = df.drop(columns='fecha')

        # One-hot encode the categorical columns; the new columns are appended
        # in this order and named '<column>_<value>'.
        df = pd.get_dummies(
            df, columns=['estado', 'categoria_uno', 'dia_atipico'])

        # Assign the result back: an in-place fillna on df['antiguedad'] acts
        # on a temporary object once pandas Copy-on-Write is active.
        df['antiguedad'] = df['antiguedad'].fillna(0)

        if option == 'drop':
            df = df.drop(columns='categoria_dos')
        else:
            # Missing 'categoria_dos' values are replaced by 0.
            df['categoria_dos'] = df['categoria_dos'].fillna(0)

    df.to_csv(index=False, path_or_buf=path.replace(
        '.txt', '') + "_" + option + ".csv", sep='|')

In [None]:
# "Modelar" and "Estimar" dataframes
# Preprocess both raw files. The order is kept (Modelar first) because the
# price-completion helpers keep state between calls.
input_parser(r"data/Modelar_UH2021.txt", 'drop')
input_parser(r"data/Estimar_UH2021.txt", 'drop')

# "Estimar" has no samples with "estado" = "Rotura", so its one-hot encoding
# lacks that column; remove it from "Modelar" to keep both frames aligned.
modelar = pd.read_csv(
    r'data/Modelar_UH2021_drop.csv', sep='|', low_memory=False
).drop(columns='estado_Rotura')
estimar = pd.read_csv(r"data/Estimar_UH2021_drop.csv", sep='|', low_memory=False)

# Product ids that need a prediction.
ids_estimar = set(estimar['id'].unique())

In [None]:
# Train one random forest per product id that also appears in "estimar" and
# average the hold-out metrics over all trained models.
RANDOM_SEED = 42  # fixed seed so the split and the forests are reproducible

model_dict = {}
total_mse = 0
total_mae = 0
total_cf = 0
total_metrica = 0

# groupby yields every product's rows exactly once; sort=False keeps the ids
# in order of first appearance, as a row-by-row scan would.
for product_id, df_aux in modelar.groupby('id', sort=False):
    if product_id not in ids_estimar:
        continue

    X_train, X_test, y_train, y_test = train_test_split(
        df_aux.drop(columns='unidades_vendidas'), df_aux['unidades_vendidas'],
        test_size=0.25, random_state=RANDOM_SEED)

    reg = RandomForestRegressor(
        verbose=0, n_jobs=-1, n_estimators=150, min_samples_split=3,
        min_samples_leaf=2, random_state=RANDOM_SEED)
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)

    total_mse += mean_squared_error(y_test, pred)
    total_mae += mean_absolute_error(y_test, pred)
    total_cf += __favorable_cases(y_test.values, pred)
    # Use the shared helper so the metric formula lives in a single place.
    total_metrica += __datathon_metric(y_test, y_train, pred)

    # NOTE(review): the stored model was fitted on the 75% training split only.
    model_dict[product_id] = reg

n_models = len(model_dict)
if n_models == 0:
    # Avoid a ZeroDivisionError when the two files share no product id.
    print("No model was trained: 'modelar' and 'estimar' share no product id.")
else:
    print(f"MSE: {total_mse / n_models}")
    print(f"MAE: {total_mae / n_models}")
    print(f"Favorable cases: {total_cf / n_models}")
    print(f"Datathon metric: {total_metrica / n_models}")

In [None]:
# Finally, generate the response file.

# Predict before opening the output file, so a failure here cannot leave a
# truncated response file behind. Each model predicts all rows of its product
# in one call instead of one single-row frame at a time.
positions_by_id = estimar.groupby('id', sort=False).indices
missing_ids = [pid for pid in positions_by_id if pid not in model_dict]
if missing_ids:
    raise KeyError(f"No trained model for product ids: {missing_ids}")

# Predictions are stored by row position, so the index labels of 'estimar'
# are never used as positions.
predicted_units = pd.Series(0.0, index=range(len(estimar)))
for product_id, positions in positions_by_id.items():
    predicted_units.iloc[positions] = model_dict[product_id].predict(
        estimar.iloc[positions])

with open(r'res/Nevermore.txt', 'w', newline='', encoding='utf-8') as csv_file:
    csvwriter = csv.writer(csv_file, delimiter='|')
    csvwriter.writerow(['FECHA', 'ID', 'UNIDADES'])

    # Rows are written in the original order of 'estimar'.
    for dia, mes, anyo, product_id, units in zip(
            estimar['dia'], estimar['mes'], estimar['anyo'],
            estimar['id'], predicted_units):
        data_txt = f"{int(dia)}/{int(mes)}/{int(anyo)}"
        csvwriter.writerow([data_txt, int(product_id), round(units)])